From 00563022426232238fd11874930f2dcbaf968922 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Mon, 20 May 2024 17:09:36 -0400
Subject: [PATCH 01/23] Update operators.rst (#4339)

Closes https://github.com/rapidsai/cugraph/issues/4337

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Tingyu Wang (https://github.com/tingyu66)

Approvers:
  - Tingyu Wang (https://github.com/tingyu66)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4339
---
 .../api_docs/cugraph-ops/python/operators.rst | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst
index 3e6664b2db5..8b5efd7aa36 100644
--- a/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst
+++ b/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst
@@ -47,10 +47,26 @@ Graph Attention (GATConv/GATv2Conv)
 .. autosummary::
    :toctree: ../../api/ops
 
-   operators.mha_gat_n2n_fwd
-   operators.mha_gat_n2n_bwd
-   operators.mha_gat_n2n_efeat_fwd
-   operators.mha_gat_n2n_efeat_bwd
+   operators.mha_gat_n2n_fwd_bf16_fp32
+   operators.mha_gat_n2n_fwd_fp16_fp32
+   operators.mha_gat_n2n_fwd_fp32_fp32
+   operators.mha_gat_n2n_bwd_bf16_bf16_bf16_fp32
+   operators.mha_gat_n2n_bwd_bf16_bf16_fp32_fp32
+   operators.mha_gat_n2n_bwd_bf16_fp32_fp32_fp32
+   operators.mha_gat_n2n_bwd_fp16_fp16_fp16_fp32
+   operators.mha_gat_n2n_bwd_fp16_fp16_fp32_fp32
+   operators.mha_gat_n2n_bwd_fp16_fp32_fp32_fp32
+   operators.mha_gat_n2n_bwd_fp32_fp32_fp32_fp32
+   operators.mha_gat_n2n_efeat_fwd_bf16_fp32
+   operators.mha_gat_n2n_efeat_fwd_fp16_fp32
+   operators.mha_gat_n2n_efeat_fwd_fp32_fp32
+   operators.mha_gat_n2n_efeat_bwd_bf16_bf16_bf16_fp32
+   operators.mha_gat_n2n_efeat_bwd_bf16_bf16_fp32_fp32
+   operators.mha_gat_n2n_efeat_bwd_bf16_fp32_fp32_fp32
+   operators.mha_gat_n2n_efeat_bwd_fp16_fp16_fp16_fp32
+   operators.mha_gat_n2n_efeat_bwd_fp16_fp16_fp32_fp32
+   operators.mha_gat_n2n_efeat_bwd_fp16_fp32_fp32_fp32
+   operators.mha_gat_n2n_efeat_bwd_fp32_fp32_fp32_fp32
 
    operators.mha_gat_v2_n2n_fwd
    operators.mha_gat_v2_n2n_bwd

From ed5f27fbacc5efc5b84dba1edf67eeb77ce4a1ad Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Mon, 20 May 2024 14:11:36 -0700
Subject: [PATCH 02/23] Update pip devcontainers to UCX v1.15.0 (#4360)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/4360
---
 .devcontainer/cuda11.8-pip/devcontainer.json | 2 +-
 .devcontainer/cuda12.2-pip/devcontainer.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index d225f15f755..851a992f5b9 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -11,7 +11,7 @@
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.14.1"
+      "version": "1.15.0"
     },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "11.8",
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index e472f4621f9..c8654ded2ee 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -11,7 +11,7 @@
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.14.1"
+      "version": "1.15.0"
     },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "12.2",

From b9f6e8ca5f067b7c390dd13ed6251abe7ecc71d5 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Mon, 20 May 2024 14:13:23 -0700
Subject: [PATCH 03/23] add --rm and --name to devcontainer run args (#4361)

* Removes the devcontainer when the VSCode window closes
* Adds a descriptive name to the running container:
  ```shell
  $ docker ps -a
  CONTAINER ID   IMAGE            ...  NAMES
  0dbb364fe544   vsc-cugraph-...  ...  ${USER}-rapids-cugraph-24.06-cuda12.2-conda

  $ docker rm -f ${USER}-rapids-cugraph-24.06-cuda12.2-conda
  ```

Authors:
  - Paul Taylor (https://github.com/trxcllnt)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/4361
---
 .devcontainer/cuda11.8-conda/devcontainer.json | 5 +++++
 .devcontainer/cuda11.8-pip/devcontainer.json   | 5 +++++
 .devcontainer/cuda12.2-conda/devcontainer.json | 5 +++++
 .devcontainer/cuda12.2-pip/devcontainer.json   | 5 +++++
 ci/release/update-version.sh                   | 1 +
 5 files changed, 21 insertions(+)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index bab521f485d..7c9cd0258a4 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 851a992f5b9..a4dc168505b 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index bcaabab572b..eae4967f3b2 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index c8654ded2ee..393a5c63d23 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 9a7324fb330..f5c14e8d315 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -105,6 +105,7 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r
     sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
 
 sed_runner "s/:[0-9][0-9]\.[0-9][0-9]/:${NEXT_SHORT_TAG}/" ./notebooks/README.md

From 624e961a91387580c788027dd4665b3efdf91f9b Mon Sep 17 00:00:00 2001
From: Erik Welch <erik.n.welch@gmail.com>
Date: Mon, 20 May 2024 19:24:47 -0500
Subject: [PATCH 04/23] nx-cugraph: add `ego_graph` (#4395)

Authors:
  - Erik Welch (https://github.com/eriknw)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4395
---
 python/nx-cugraph/README.md                   |   2 +
 python/nx-cugraph/_nx_cugraph/__init__.py     |   5 +
 python/nx-cugraph/lint.yaml                   |   8 +-
 python/nx-cugraph/nx_cugraph/convert.py       |   9 +-
 .../nx_cugraph/generators/__init__.py         |   3 +-
 .../nx-cugraph/nx_cugraph/generators/ego.py   | 161 ++++++++++++++++++
 .../nx_cugraph/tests/test_ego_graph.py        |  81 +++++++++
 python/nx-cugraph/pyproject.toml              |   5 +-
 8 files changed, 265 insertions(+), 9 deletions(-)
 create mode 100644 python/nx-cugraph/nx_cugraph/generators/ego.py
 create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py

diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md
index 75b5c1c5aa9..27825585c28 100644
--- a/python/nx-cugraph/README.md
+++ b/python/nx-cugraph/README.md
@@ -216,6 +216,8 @@ Below is the list of algorithms that are currently supported in nx-cugraph.
  └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.wheel_graph.html#networkx.generators.classic.wheel_graph">wheel_graph</a>
 <a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.community">community</a>
  └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.community.caveman_graph.html#networkx.generators.community.caveman_graph">caveman_graph</a>
+<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.ego">ego</a>
+ └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.ego.ego_graph.html#networkx.generators.ego.ego_graph">ego_graph</a>
 <a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.small">small</a>
  ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.bull_graph.html#networkx.generators.small.bull_graph">bull_graph</a>
  ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.chvatal_graph.html#networkx.generators.small.chvatal_graph">chvatal_graph</a>
diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
index edc96983b8f..f57b90eb402 100644
--- a/python/nx-cugraph/_nx_cugraph/__init__.py
+++ b/python/nx-cugraph/_nx_cugraph/__init__.py
@@ -77,6 +77,7 @@
         "diamond_graph",
         "dodecahedral_graph",
         "edge_betweenness_centrality",
+        "ego_graph",
         "eigenvector_centrality",
         "empty_graph",
         "florentine_families_graph",
@@ -163,6 +164,7 @@
         "clustering": "Directed graphs and `weight` parameter are not yet supported.",
         "core_number": "Directed graphs are not yet supported.",
         "edge_betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.",
+        "ego_graph": "Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights.",
         "eigenvector_centrality": "`nstart` parameter is not used, but it is checked for validity.",
         "from_pandas_edgelist": "cudf.DataFrame inputs also supported; value columns with str is unsuppported.",
         "generic_bfs_edges": "`neighbors` and `sort_neighbors` parameters are not yet supported.",
@@ -191,6 +193,9 @@
         "bellman_ford_path_length": {
             "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
         },
+        "ego_graph": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
         "eigenvector_centrality": {
             "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
         },
diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml
index d220cb18df3..c4422ffb97d 100644
--- a/python/nx-cugraph/lint.yaml
+++ b/python/nx-cugraph/lint.yaml
@@ -26,7 +26,7 @@ repos:
       - id: mixed-line-ending
       - id: trailing-whitespace
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.16
+    rev: v0.17
     hooks:
       - id: validate-pyproject
         name: Validate pyproject.toml
@@ -50,7 +50,7 @@ repos:
       - id: black
       # - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.2
+    rev: v0.4.4
     hooks:
       - id: ruff
         args: [--fix-only, --show-fixes]  # --unsafe-fixes]
@@ -62,7 +62,7 @@ repos:
         additional_dependencies: &flake8_dependencies
           # These versions need updated manually
           - flake8==7.0.0
-          - flake8-bugbear==24.4.21
+          - flake8-bugbear==24.4.26
           - flake8-simplify==0.21.0
   - repo: https://github.com/asottile/yesqa
     rev: v1.5.0
@@ -77,7 +77,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(nx_cugraph|docs)/
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.2
+    rev: v0.4.4
     hooks:
       - id: ruff
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py
index f265540a161..b34245d5031 100644
--- a/python/nx-cugraph/nx_cugraph/convert.py
+++ b/python/nx-cugraph/nx_cugraph/convert.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -562,7 +562,12 @@ def to_networkx(G: nxcg.Graph, *, sort_edges: bool = False) -> nx.Graph:
         dst_iter = map(id_to_key.__getitem__, dst_indices)
     if G.is_multigraph() and (G.edge_keys is not None or G.edge_indices is not None):
         if G.edge_keys is not None:
-            edge_keys = G.edge_keys
+            if not G.is_directed():
+                edge_keys = [k for k, m in zip(G.edge_keys, mask.tolist()) if m]
+            else:
+                edge_keys = G.edge_keys
+        elif not G.is_directed():
+            edge_keys = G.edge_indices[mask].tolist()
         else:
             edge_keys = G.edge_indices.tolist()
         if edge_values:
diff --git a/python/nx-cugraph/nx_cugraph/generators/__init__.py b/python/nx-cugraph/nx_cugraph/generators/__init__.py
index c1834a4dec7..60a9d92373a 100644
--- a/python/nx-cugraph/nx_cugraph/generators/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/generators/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,5 +12,6 @@
 # limitations under the License.
 from .classic import *
 from .community import *
+from .ego import *
 from .small import *
 from .social import *
diff --git a/python/nx-cugraph/nx_cugraph/generators/ego.py b/python/nx-cugraph/nx_cugraph/generators/ego.py
new file mode 100644
index 00000000000..66c9c8b95ee
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/generators/ego.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import cupy as cp
+import networkx as nx
+import numpy as np
+import pylibcugraph as plc
+
+import nx_cugraph as nxcg
+
+from ..utils import _dtype_param, _get_float_dtype, index_dtype, networkx_algorithm
+
+__all__ = ["ego_graph"]
+
+
+@networkx_algorithm(
+    extra_params=_dtype_param, version_added="24.06", _plc={"bfs", "ego_graph", "sssp"}
+)
+def ego_graph(
+    G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None
+):
+    """Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights."""  # noqa: E501
+    if isinstance(G, nx.Graph):
+        G = nxcg.from_networkx(G, preserve_all_attrs=True)
+    if n not in G:
+        if distance is None:
+            raise nx.NodeNotFound(f"Source {n} is not in G")
+        raise nx.NodeNotFound(f"Node {n} not found in graph")
+    src_index = n if G.key_to_id is None else G.key_to_id[n]
+    symmetrize = "union" if undirected and G.is_directed() else None
+    if distance is None or distance not in G.edge_values:
+        # Simple BFS to determine nodes
+        if radius is not None and radius <= 0:
+            if center:
+                node_ids = cp.array([src_index], dtype=index_dtype)
+            else:
+                node_ids = cp.empty(0, dtype=index_dtype)
+            node_mask = None
+        else:
+            if radius is None or np.isinf(radius):
+                radius = -1
+            else:
+                radius = math.ceil(radius)
+            distances, unused_predecessors, node_ids = plc.bfs(
+                handle=plc.ResourceHandle(),
+                graph=G._get_plc_graph(symmetrize=symmetrize),
+                sources=cp.array([src_index], index_dtype),
+                direction_optimizing=False,  # True for undirected only; what's best?
+                depth_limit=radius,
+                compute_predecessors=False,
+                do_expensive_check=False,
+            )
+            node_mask = distances != np.iinfo(distances.dtype).max
+    else:
+        # SSSP to determine nodes
+        if callable(distance):
+            raise NotImplementedError("callable `distance` argument is not supported")
+        if symmetrize and G.is_multigraph():
+            # G._get_plc_graph does not implement `symmetrize=True` w/ edge array
+            raise NotImplementedError(
+                "Weighted ego_graph with undirected=True not implemented"
+            )
+        # Check for negative values since we don't support negative cycles
+        edge_vals = G.edge_values[distance]
+        if distance in G.edge_masks:
+            edge_vals = edge_vals[G.edge_masks[distance]]
+        if (edge_vals < 0).any():
+            raise NotImplementedError(
+                "Negative edge weights not yet supported by ego_graph"
+            )
+        # PERF: we could use BFS if all edges are equal
+        if radius is None:
+            radius = np.inf
+        dtype = _get_float_dtype(dtype, graph=G, weight=distance)
+        node_ids, distances, unused_predecessors = plc.sssp(
+            resource_handle=plc.ResourceHandle(),
+            graph=(G.to_undirected() if symmetrize else G)._get_plc_graph(
+                distance, 1, dtype
+            ),
+            source=src_index,
+            cutoff=np.nextafter(radius, np.inf, dtype=np.float64),
+            compute_predecessors=True,  # TODO: False is not yet supported
+            do_expensive_check=False,
+        )
+        node_mask = distances != np.finfo(distances.dtype).max
+
+    if node_mask is not None:
+        if not center:
+            node_mask &= node_ids != src_index
+        node_ids = node_ids[node_mask]
+    if node_ids.size == G._N:
+        return G.copy()
+    # TODO: create renumbering helper function(s)
+    node_ids.sort()  # TODO: is this ever necessary? Keep for safety
+    node_values = {key: val[node_ids] for key, val in G.node_values.items()}
+    node_masks = {key: val[node_ids] for key, val in G.node_masks.items()}
+
+    G._sort_edge_indices()  # TODO: is this ever necessary? Keep for safety
+    edge_mask = cp.isin(G.src_indices, node_ids) & cp.isin(G.dst_indices, node_ids)
+    src_indices = cp.searchsorted(node_ids, G.src_indices[edge_mask]).astype(
+        index_dtype
+    )
+    dst_indices = cp.searchsorted(node_ids, G.dst_indices[edge_mask]).astype(
+        index_dtype
+    )
+    edge_values = {key: val[edge_mask] for key, val in G.edge_values.items()}
+    edge_masks = {key: val[edge_mask] for key, val in G.edge_masks.items()}
+
+    # Renumber nodes
+    if (id_to_key := G.id_to_key) is not None:
+        key_to_id = {
+            id_to_key[old_index]: new_index
+            for new_index, old_index in enumerate(node_ids.tolist())
+        }
+    else:
+        key_to_id = {
+            old_index: new_index
+            for new_index, old_index in enumerate(node_ids.tolist())
+        }
+    kwargs = {
+        "N": node_ids.size,
+        "src_indices": src_indices,
+        "dst_indices": dst_indices,
+        "edge_values": edge_values,
+        "edge_masks": edge_masks,
+        "node_values": node_values,
+        "node_masks": node_masks,
+        "key_to_id": key_to_id,
+    }
+    if G.is_multigraph():
+        if G.edge_keys is not None:
+            kwargs["edge_keys"] = [
+                x for x, m in zip(G.edge_keys, edge_mask.tolist()) if m
+            ]
+        if G.edge_indices is not None:
+            kwargs["edge_indices"] = G.edge_indices[edge_mask]
+    rv = G.__class__.from_coo(**kwargs)
+    rv.graph.update(G.graph)
+    return rv
+
+
+@ego_graph._can_run
+def _(G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None):
+    if distance is not None and undirected and G.is_directed() and G.is_multigraph():
+        return "Weighted ego_graph with undirected=True not implemented"
+    if distance is not None and nx.is_negatively_weighted(G, weight=distance):
+        return "Weighted ego_graph with negative cycles not yet supported"
+    if callable(distance):
+        return "callable `distance` argument is not supported"
+    return True
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py
new file mode 100644
index 00000000000..5474f9d79e3
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+import pytest
+from packaging.version import parse
+
+import nx_cugraph as nxcg
+
+from .testing_utils import assert_graphs_equal
+
+nxver = parse(nx.__version__)
+
+
+if nxver.major == 3 and nxver.minor < 2:
+    pytest.skip("Need NetworkX >=3.2 to test ego_graph", allow_module_level=True)
+
+
+@pytest.mark.parametrize(
+    "create_using", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
+)
+@pytest.mark.parametrize("radius", [-1, 0, 1, 1.5, 2, float("inf"), None])
+@pytest.mark.parametrize("center", [True, False])
+@pytest.mark.parametrize("undirected", [False, True])
+@pytest.mark.parametrize("multiple_edges", [False, True])
+@pytest.mark.parametrize("n", [0, 3])
+def test_ego_graph_cycle_graph(
+    create_using, radius, center, undirected, multiple_edges, n
+):
+    Gnx = nx.cycle_graph(7, create_using=create_using)
+    if multiple_edges:
+        # Test multigraph with multiple edges
+        if not Gnx.is_multigraph():
+            return
+        Gnx.add_edges_from(nx.cycle_graph(7, create_using=nx.DiGraph).edges)
+        Gnx.add_edge(0, 1, 10)
+    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)
+    assert_graphs_equal(Gnx, Gcg)  # Sanity check
+
+    kwargs = {"radius": radius, "center": center, "undirected": undirected}
+    Hnx = nx.ego_graph(Gnx, n, **kwargs)
+    Hcg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
+    assert_graphs_equal(Hnx, Hcg)
+    with pytest.raises(nx.NodeNotFound, match="not in G"):
+        nx.ego_graph(Gnx, -1, **kwargs)
+    with pytest.raises(nx.NodeNotFound, match="not in G"):
+        nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph")
+    # Using sssp with default weight of 1 should give same answer as bfs
+    nx.set_edge_attributes(Gnx, 1, name="weight")
+    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)
+    assert_graphs_equal(Gnx, Gcg)  # Sanity check
+
+    kwargs["distance"] = "weight"
+    H2nx = nx.ego_graph(Gnx, n, **kwargs)
+    is_nx32 = nxver.major == 3 and nxver.minor == 2
+    if undirected and Gnx.is_directed() and Gnx.is_multigraph():
+        if is_nx32:
+            # `should_run` was added in nx 3.3
+            match = "Weighted ego_graph with undirected=True not implemented"
+        else:
+            match = "not implemented by cugraph"
+        with pytest.raises(RuntimeError, match=match):
+            nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
+        with pytest.raises(NotImplementedError, match="ego_graph"):
+            nx.ego_graph(Gcg, n, **kwargs)
+    else:
+        H2cg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
+        assert_graphs_equal(H2nx, H2cg)
+        with pytest.raises(nx.NodeNotFound, match="not found in graph"):
+            nx.ego_graph(Gnx, -1, **kwargs)
+        with pytest.raises(nx.NodeNotFound, match="not found in graph"):
+            nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph")
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index a7daf01775b..477fe8bb493 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 [build-system]
 
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
@@ -233,6 +233,7 @@ ignore = [
 "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"]
 "_nx_cugraph/__init__.py" = ["E501"]
 "nx_cugraph/algorithms/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for algorithms
+"nx_cugraph/generators/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for generators
 "nx_cugraph/interface.py" = ["D401"]  # Flexible docstrings
 "scripts/update_readme.py" = ["INP001"]  # Not part of a package
 

From 6bd08d245d82bb93d93f81b8875c04e31bb7450c Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Mon, 20 May 2024 21:02:17 -0400
Subject: [PATCH 05/23] test sphinx mapping to networkx (#4323)

Closes https://github.com/rapidsai/cugraph/issues/4285

I'll report back if I get it to render locally. I ran out of memory building the library on my EC2 machine (g5.2xlarge: 32 GB RAM), but I did just build cudf. I'll try again soon.

~~Also brainstorming here: I should probably upstream this to the RAPIDS CI process. It would be nice to have `/ok to test docs`, which just builds the docs for this PR.~~ Created https://github.com/nv-gha-runners/nvidia-runners/issues/25

I tried building the docs locally and got:

```
WARNING: [autosummary] failed to import cugraph.jaccard_w.
Possible hints:
* AttributeError: module 'cugraph' has no attribute 'jaccard_w'
* ImportError:
* ModuleNotFoundError: No module named 'cugraph.jaccard_w'
WARNING: [autosummary] failed to import cugraph.overlap_w.
Possible hints:
* ModuleNotFoundError: No module named 'cugraph.overlap_w'
* ImportError:
* AttributeError: module 'cugraph' has no attribute 'overlap_w'
WARNING: [autosummary] failed to import cugraph.sorensen_w.
Possible hints:
* ModuleNotFoundError: No module named 'cugraph.sorensen_w'
* ImportError:
* AttributeError: module 'cugraph' has no attribute 'sorensen_w'
```

I think this comes from https://github.com/rapidsai/cugraph/blob/abe69c0419b67b567d3c8fce91ee1a062d53e385/docs/cugraph/source/api_docs/cugraph/link_prediction.rst#L14

but I may have messed up my build.

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4323
---
 docs/cugraph/source/conf.py                        | 14 +++++++++++---
 python/cugraph/cugraph/structure/convert_matrix.py |  4 +++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py
index 952b962aca2..66bc3137fba 100644
--- a/docs/cugraph/source/conf.py
+++ b/docs/cugraph/source/conf.py
@@ -190,9 +190,17 @@
      'Miscellaneous'),
 ]
 
-# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'https://docs.python.org/': None}
-
+# Connect docs in other projects
+intersphinx_mapping = {
+    "networkx": (
+        "https://networkx.org/documentation/stable/",
+        "https://networkx.org/documentation/stable/objects.inv",
+    ),
+    "python": (
+        "https://docs.python.org/3",
+        "https://docs.python.org/3/objects.inv",
+    ),
+}
 
 # Config numpydoc
 numpydoc_show_inherited_class_members = False
diff --git a/python/cugraph/cugraph/structure/convert_matrix.py b/python/cugraph/cugraph/structure/convert_matrix.py
index ca8e93c482b..b9b9554b870 100644
--- a/python/cugraph/cugraph/structure/convert_matrix.py
+++ b/python/cugraph/cugraph/structure/convert_matrix.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -277,6 +277,8 @@ def from_pandas_edgelist(
     renumber=True,
 ):
     """
+    See :func:`networkx.convert_matrix.from_pandas_edgelist`.
+
     Initialize a graph from the edge list. It is an error to call this
     method on an initialized Graph object. Source argument is source
     column name and destination argument is destination column name.

From 2779f3230feee01e1474c8374c688c8f8b021d65 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 21 May 2024 03:44:59 +0200
Subject: [PATCH 06/23] MNMG Approximation Algorithm for the Weighted Matching
 Problem (#4315)

MNMG [Approximation Algorithm for the Weighted Matching Problem](https://web.archive.org/web/20081031230449id_/http://www.ii.uib.no/~fredrikm/fredrik/papers/CP75.pdf)

Authors:
  - Naim (https://github.com/naimnv)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/4315
---
 cpp/CMakeLists.txt                            |   2 +
 cpp/include/cugraph/algorithms.hpp            |  26 ++
 .../approx_weighted_matching_impl.cuh         | 392 ++++++++++++++++++
 .../community/approx_weighted_matching_mg.cu  |  50 +++
 .../community/approx_weighted_matching_sg.cu  |  50 +++
 cpp/tests/CMakeLists.txt                      |   8 +
 .../community/mg_weighted_matching_test.cpp   | 232 +++++++++++
 .../community/weighted_matching_test.cpp      | 182 ++++++++
 8 files changed, 942 insertions(+)
 create mode 100644 cpp/src/community/approx_weighted_matching_impl.cuh
 create mode 100644 cpp/src/community/approx_weighted_matching_mg.cu
 create mode 100644 cpp/src/community/approx_weighted_matching_sg.cu
 create mode 100644 cpp/tests/community/mg_weighted_matching_test.cpp
 create mode 100644 cpp/tests/community/weighted_matching_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d3dfdbd068c..57e0aa2d078 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -288,6 +288,8 @@ set(CUGRAPH_SOURCES
     src/structure/symmetrize_edgelist_mg.cu
     src/community/triangle_count_sg.cu
     src/community/triangle_count_mg.cu
+    src/community/approx_weighted_matching_sg.cu
+    src/community/approx_weighted_matching_mg.cu
     src/traversal/k_hop_nbrs_sg.cu
     src/traversal/k_hop_nbrs_mg.cu
     src/mtmg/vertex_result.cu
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 0caa151daac..7c4a978c4b4 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -2368,6 +2368,32 @@ rmm::device_uvector<vertex_t> vertex_coloring(
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state);
 
+/**
+ * @brief Approximate Weighted Matching
+ *
+ * A matching in an undirected graph G = (V, E) is a pairing of adjacent vertices
+ * such that each vertex is matched with at most one other vertex, the objective
+ * being to match as many vertices as possible or to maximise the sum of the
+ * weights of the matched edges. Here we provide an implementation of an
+ * approximation algorithm for the maximum weighted matching problem. See
+ * https://web.archive.org/web/20081031230449id_/http://www.ii.uib.no/~fredrikm/fredrik/papers/CP75.pdf
+ * for further information.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether to target single-GPU (false) or multi-GPU (true).
+ * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in] graph_view Graph view object.
+ * @param[in] edge_weight_view View object holding edge weights for @p graph_view.
+ * @return A tuple of device vector of matched vertex ids and sum of the weights of the matched
+ * edges.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  edge_property_view_t<edge_t, weight_t const*> edge_weight_view);
 }  // namespace cugraph
 
 /**
diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh
new file mode 100644
index 00000000000..e693beee489
--- /dev/null
+++ b/cpp/src/community/approx_weighted_matching_impl.cuh
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/fill_edge_property.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/transform_reduce_e_by_src_dst_key.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "utilities/collect_comm.cuh"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+
+#include <raft/core/handle.hpp>
+
+#include <thrust/fill.h>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
+{
+  CUGRAPH_EXPECTS(graph_view.is_symmetric(),
+                  "Invalid input arguments: input graph for approximate_weighted_matching must "
+                  "need to be symmetric");
+
+  using graph_view_t = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+
+  graph_view_t current_graph_view(graph_view);
+  if (current_graph_view.has_edge_mask()) { current_graph_view.clear_edge_mask(); }
+
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_even(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_odd(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+
+  if (graph_view.has_edge_mask()) {
+    current_graph_view.attach_edge_mask(*(graph_view.edge_mask_view()));
+  }
+  // Mask out self-loops
+  cugraph::transform_e(
+    handle,
+    current_graph_view,
+    cugraph::edge_src_dummy_property_t{}.view(),
+    cugraph::edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+      return !(src == dst);
+    },
+    edge_masks_even.mutable_view());
+
+  if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+  current_graph_view.attach_edge_mask(edge_masks_even.view());
+
+  auto constexpr invalid_partner = invalid_vertex_id<vertex_t>::value;
+  rmm::device_uvector<weight_t> offers_from_partners(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+  rmm::device_uvector<vertex_t> partners(current_graph_view.local_vertex_partition_range_size(),
+                                         handle.get_stream());
+
+  thrust::fill(handle.get_thrust_policy(), partners.begin(), partners.end(), invalid_partner);
+  thrust::fill(handle.get_thrust_policy(),
+               offers_from_partners.begin(),
+               offers_from_partners.end(),
+               weight_t{0.0});
+
+  rmm::device_uvector<vertex_t> local_vertices(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+  detail::sequence_fill(handle.get_stream(),
+                        local_vertices.begin(),
+                        local_vertices.size(),
+                        current_graph_view.local_vertex_partition_range_first());
+
+  edge_src_property_t<graph_view_t, vertex_t> src_key_cache(handle);
+  cugraph::edge_src_property_t<graph_view_t, bool> src_match_flags(handle);
+  cugraph::edge_dst_property_t<graph_view_t, bool> dst_match_flags(handle);
+
+  if constexpr (graph_view_t::is_multi_gpu) {
+    src_key_cache = edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
+
+    update_edge_src_property(handle, current_graph_view, local_vertices.begin(), src_key_cache);
+
+    src_match_flags = cugraph::edge_src_property_t<graph_view_t, bool>(handle, current_graph_view);
+    dst_match_flags = cugraph::edge_dst_property_t<graph_view_t, bool>(handle, current_graph_view);
+  }
+
+  vertex_t loop_counter = 0;
+  while (true) {
+    //
+    // For each candidate vertex, find the best possible target
+    //
+
+    rmm::device_uvector<vertex_t> candidates(0, handle.get_stream());
+    rmm::device_uvector<weight_t> offers_from_candidates(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> targets(0, handle.get_stream());
+
+    // FIXME: This can be implemented more efficiently if per_v_transform_reduce_incoming|outgoing_e
+    // is updated to support reduction on thrust::tuple.
+    std::forward_as_tuple(candidates, std::tie(offers_from_candidates, targets)) =
+      cugraph::transform_reduce_e_by_src_key(
+        handle,
+        current_graph_view,
+        cugraph::edge_src_dummy_property_t{}.view(),
+        cugraph::edge_dst_dummy_property_t{}.view(),
+        edge_weight_view,
+        graph_view_t::is_multi_gpu
+          ? src_key_cache.view()
+          : detail::edge_major_property_view_t<vertex_t, vertex_t const*>(local_vertices.begin()),
+        [] __device__(auto, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wt) {
+          return thrust::make_tuple(wt, dst);
+        },
+        thrust::make_tuple(weight_t{0.0}, invalid_partner),
+        reduce_op::maximum<thrust::tuple<weight_t, vertex_t>>{},
+        true);
+
+    //
+    // For each target, find the best offer
+    //
+
+    if constexpr (graph_view_t::is_multi_gpu) {
+      auto vertex_partition_range_lasts = current_graph_view.vertex_partition_range_lasts();
+
+      rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(
+        vertex_partition_range_lasts.size(), handle.get_stream());
+
+      raft::update_device(d_vertex_partition_range_lasts.data(),
+                          vertex_partition_range_lasts.data(),
+                          vertex_partition_range_lasts.size(),
+                          handle.get_stream());
+
+      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+
+      auto key_func = cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+        raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
+                                          d_vertex_partition_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size};
+
+      std::forward_as_tuple(std::tie(candidates, offers_from_candidates, targets), std::ignore) =
+        cugraph::groupby_gpu_id_and_shuffle_values(
+          handle.get_comms(),
+          thrust::make_zip_iterator(thrust::make_tuple(
+            candidates.begin(), offers_from_candidates.begin(), targets.begin())),
+          thrust::make_zip_iterator(
+            thrust::make_tuple(candidates.end(), offers_from_candidates.end(), targets.end())),
+          [key_func] __device__(auto val) { return key_func(thrust::get<2>(val)); },
+          handle.get_stream());
+    }
+
+    auto itr_to_tuples = thrust::make_zip_iterator(
+      thrust::make_tuple(offers_from_candidates.begin(), candidates.begin()));
+
+    thrust::sort_by_key(handle.get_thrust_policy(), targets.begin(), targets.end(), itr_to_tuples);
+
+    auto nr_unique_targets = thrust::count_if(handle.get_thrust_policy(),
+                                              thrust::make_counting_iterator(size_t{0}),
+                                              thrust::make_counting_iterator(targets.size()),
+                                              is_first_in_run_t<vertex_t const*>{targets.data()});
+
+    rmm::device_uvector<vertex_t> unique_targets(nr_unique_targets, handle.get_stream());
+    rmm::device_uvector<weight_t> best_offers_to_targets(nr_unique_targets, handle.get_stream());
+    rmm::device_uvector<vertex_t> best_candidates(nr_unique_targets, handle.get_stream());
+
+    auto itr_to_reduced_tuples = thrust::make_zip_iterator(
+      thrust::make_tuple(best_offers_to_targets.begin(), best_candidates.begin()));
+
+    auto new_end = thrust::reduce_by_key(
+      handle.get_thrust_policy(),
+      targets.begin(),
+      targets.end(),
+      itr_to_tuples,
+      unique_targets.begin(),
+      itr_to_reduced_tuples,
+      thrust::equal_to<vertex_t>{},
+      [] __device__(auto pair1, auto pair2) { return (pair1 > pair2) ? pair1 : pair2; });
+
+    vertex_t nr_reduces_tuples =
+      static_cast<vertex_t>(thrust::distance(unique_targets.begin(), new_end.first));
+
+    targets                = std::move(unique_targets);
+    offers_from_candidates = std::move(best_offers_to_targets);
+    candidates             = std::move(best_candidates);
+
+    //
+    // Two vertices that offer to each other form a match
+    //
+
+    kv_store_t<vertex_t, vertex_t, false> target_candidate_map(targets.begin(),
+                                                               targets.end(),
+                                                               candidates.begin(),
+                                                               invalid_vertex_id<vertex_t>::value,
+                                                               invalid_vertex_id<vertex_t>::value,
+                                                               handle.get_stream());
+
+    rmm::device_uvector<vertex_t> candidates_of_candidates(0, handle.get_stream());
+
+    if (graph_view_t::is_multi_gpu) {
+      auto& comm       = handle.get_comms();
+      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+
+      auto partitions_range_lasts = graph_view.vertex_partition_range_lasts();
+      rmm::device_uvector<vertex_t> d_partitions_range_lasts(partitions_range_lasts.size(),
+                                                             handle.get_stream());
+
+      raft::update_device(d_partitions_range_lasts.data(),
+                          partitions_range_lasts.data(),
+                          partitions_range_lasts.size(),
+                          handle.get_stream());
+
+      cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t> vertex_to_gpu_id_op{
+        raft::device_span<vertex_t const>(d_partitions_range_lasts.data(),
+                                          d_partitions_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size};
+
+      candidates_of_candidates = cugraph::collect_values_for_keys(handle,
+                                                                  target_candidate_map.view(),
+                                                                  candidates.begin(),
+                                                                  candidates.end(),
+                                                                  vertex_to_gpu_id_op);
+    } else {
+      candidates_of_candidates.resize(candidates.size(), handle.get_stream());
+
+      target_candidate_map.view().find(candidates.begin(),
+                                       candidates.end(),
+                                       candidates_of_candidates.begin(),
+                                       handle.get_stream());
+    }
+
+    //
+    // Mask out neighborhood of matched vertices
+    //
+
+    rmm::device_uvector<bool> is_vertex_matched = rmm::device_uvector<bool>(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::fill(
+      handle.get_thrust_policy(), is_vertex_matched.begin(), is_vertex_matched.end(), bool{false});
+
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(thrust::make_tuple(candidates_of_candidates.begin(),
+                                                   targets.begin(),
+                                                   candidates.begin(),
+                                                   offers_from_candidates.begin())),
+      thrust::make_zip_iterator(thrust::make_tuple(candidates_of_candidates.end(),
+                                                   targets.end(),
+                                                   candidates.end(),
+                                                   offers_from_candidates.end())),
+      [partners             = partners.begin(),
+       offers_from_partners = offers_from_partners.begin(),
+       is_vertex_matched =
+         raft::device_span<bool>(is_vertex_matched.data(), is_vertex_matched.size()),
+       v_first =
+         current_graph_view.local_vertex_partition_range_first()] __device__(auto msrc_tgt) {
+        auto candidate_of_candidate = thrust::get<0>(msrc_tgt);
+        auto tgt                    = thrust::get<1>(msrc_tgt);
+        auto candiate               = thrust::get<2>(msrc_tgt);
+        auto offer_value            = thrust::get<3>(msrc_tgt);
+
+        if (candidate_of_candidate != invalid_partner && candidate_of_candidate == tgt) {
+          auto tgt_offset                  = tgt - v_first;
+          is_vertex_matched[tgt_offset]    = true;
+          partners[tgt_offset]             = candiate;
+          offers_from_partners[tgt_offset] = offer_value;
+        }
+      });
+
+    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
+
+    if constexpr (graph_view_t::is_multi_gpu) {
+      cugraph::update_edge_src_property(
+        handle, current_graph_view, is_vertex_matched.begin(), src_match_flags);
+      cugraph::update_edge_dst_property(
+        handle, current_graph_view, is_vertex_matched.begin(), dst_match_flags);
+    }
+
+    if (loop_counter % 2 == 0) {
+      if constexpr (graph_view_t::is_multi_gpu) {
+        cugraph::transform_e(
+          handle,
+          current_graph_view,
+          src_match_flags.view(),
+          dst_match_flags.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          [] __device__(
+            auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) {
+            return !((is_src_matched == true) || (is_dst_matched == true));
+          },
+          edge_masks_odd.mutable_view());
+      } else {
+        cugraph::transform_e(
+          handle,
+          current_graph_view,
+          detail::edge_major_property_view_t<vertex_t, bool const*>(is_vertex_matched.begin()),
+          detail::edge_minor_property_view_t<vertex_t, bool const*>(is_vertex_matched.begin(),
+                                                                    vertex_t{0}),
+          cugraph::edge_dummy_property_t{}.view(),
+          [] __device__(
+            auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) {
+            return !((is_src_matched == true) || (is_dst_matched == true));
+          },
+          edge_masks_odd.mutable_view());
+      }
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+      current_graph_view.attach_edge_mask(edge_masks_odd.view());
+    } else {
+      if constexpr (graph_view_t::is_multi_gpu) {
+        cugraph::transform_e(
+          handle,
+          current_graph_view,
+          src_match_flags.view(),
+          dst_match_flags.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          [] __device__(
+            auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) {
+            return !((is_src_matched == true) || (is_dst_matched == true));
+          },
+          edge_masks_even.mutable_view());
+      } else {
+        cugraph::transform_e(
+          handle,
+          current_graph_view,
+          detail::edge_major_property_view_t<vertex_t, bool const*>(is_vertex_matched.begin()),
+          detail::edge_minor_property_view_t<vertex_t, bool const*>(is_vertex_matched.begin(),
+                                                                    vertex_t{0}),
+          cugraph::edge_dummy_property_t{}.view(),
+          [] __device__(
+            auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) {
+            return !((is_src_matched == true) || (is_dst_matched == true));
+          },
+          edge_masks_even.mutable_view());
+      }
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+      current_graph_view.attach_edge_mask(edge_masks_even.view());
+    }
+
+    loop_counter++;
+  }
+
+  weight_t sum_matched_edge_weights = thrust::reduce(
+    handle.get_thrust_policy(), offers_from_partners.begin(), offers_from_partners.end());
+
+  if constexpr (graph_view_t::is_multi_gpu) {
+    sum_matched_edge_weights = host_scalar_allreduce(
+      handle.get_comms(), sum_matched_edge_weights, raft::comms::op_t::SUM, handle.get_stream());
+  }
+
+  return std::make_tuple(std::move(partners), sum_matched_edge_weights / 2.0);
+}
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
+{
+  return detail::approximate_weighted_matching(handle, graph_view, edge_weight_view);
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/community/approx_weighted_matching_mg.cu b/cpp/src/community/approx_weighted_matching_mg.cu
new file mode 100644
index 00000000000..41d6c3d97e0
--- /dev/null
+++ b/cpp/src/community/approx_weighted_matching_mg.cu
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "approx_weighted_matching_impl.cuh"
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  edge_property_view_t<int32_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  edge_property_view_t<int32_t, double const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  edge_property_view_t<int64_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int64_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  edge_property_view_t<int64_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  edge_property_view_t<int64_t, double const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int64_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  edge_property_view_t<int64_t, double const*> edge_weight_view);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/approx_weighted_matching_sg.cu b/cpp/src/community/approx_weighted_matching_sg.cu
new file mode 100644
index 00000000000..418a43d51ae
--- /dev/null
+++ b/cpp/src/community/approx_weighted_matching_sg.cu
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "approx_weighted_matching_impl.cuh"
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  edge_property_view_t<int32_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  edge_property_view_t<int32_t, double const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  edge_property_view_t<int64_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int64_t>, float> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  edge_property_view_t<int64_t, float const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int32_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  edge_property_view_t<int64_t, double const*> edge_weight_view);
+
+template std::tuple<rmm::device_uvector<int64_t>, double> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  edge_property_view_t<int64_t, double const*> edge_weight_view);
+
+}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 19097add541..ced3b7bedb1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -309,6 +309,10 @@ ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp)
 # - LEIDEN tests ----------------------------------------------------------------------------------
 ConfigureTest(LEIDEN_TEST community/leiden_test.cpp)
 
+###################################################################################################
+# - WEIGHTED MATCHING tests ----------------------------------------------------------------------------------
+ConfigureTest(WEIGHTED_MATCHING_TEST community/weighted_matching_test.cpp)
+
 ###################################################################################################
 # - Legacy ECG tests -------------------------------------------------------------------------------------
 ConfigureTest(LEGACY_ECG_TEST community/legacy_ecg_test.cpp)
@@ -570,6 +574,10 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG LEIDEN tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_LEIDEN_TEST community/mg_leiden_test.cpp)
 
+    ###############################################################################################
+    # - MG WEIGHTED MATCHING tests --------------------------------------------------------------------------
+    ConfigureTestMG(MG_WEIGHTED_MATCHING_TEST community/mg_weighted_matching_test.cpp)
+
     ###############################################################################################
     # - MG ECG tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_ECG_TEST community/mg_ecg_test.cpp)
diff --git a/cpp/tests/community/mg_weighted_matching_test.cpp b/cpp/tests/community/mg_weighted_matching_test.cpp
new file mode 100644
index 00000000000..21963922ab1
--- /dev/null
+++ b/cpp/tests/community/mg_weighted_matching_test.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct WeightedMatching_UseCase {
+  bool edge_masking{false};      // if true, the test attaches a generated boolean edge mask
+  bool check_correctness{true};  // if true, the test validates the matching after running it
+};
+
+template <typename input_usecase_t>
+class Tests_MGWeightedMatching
+  : public ::testing::TestWithParam<std::tuple<WeightedMatching_UseCase, input_usecase_t>> {
+ public:
+  Tests_MGWeightedMatching() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  // Runs MG matching; on rank 0, also runs SG matching and compares weights and partners.
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<WeightedMatching_UseCase, input_usecase_t> const& param)
+  {
+    auto [weighted_matching_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    constexpr bool multi_gpu = true;
+
+    bool test_weighted    = true;
+    bool renumber         = true;
+    bool drop_self_loops  = false;
+    bool drop_multi_edges = false;
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        *handle_, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
+
+    std::tie(mg_graph, mg_edge_weights, mg_renumber_map) = cugraph::symmetrize_graph(
+      *handle_,
+      std::move(mg_graph),
+      std::move(mg_edge_weights),
+      mg_renumber_map ? std::optional<rmm::device_uvector<vertex_t>>(std::move(*mg_renumber_map))
+                      : std::nullopt,
+      false);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (weighted_matching_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    rmm::device_uvector<vertex_t> mg_partners(0, handle_->get_stream());
+    weight_t mg_matching_weights;
+
+    std::forward_as_tuple(mg_partners, mg_matching_weights) =
+      cugraph::approximate_weighted_matching<vertex_t, edge_t, weight_t, multi_gpu>(
+        *handle_, mg_graph_view, (*mg_edge_weights).view());
+
+    if (weighted_matching_usecase.check_correctness) {
+      rmm::device_uvector<vertex_t> mg_aggregate_partners(0, handle_->get_stream());
+      std::tie(std::ignore, mg_aggregate_partners) =
+        cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+          *handle_,
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          mg_graph_view.local_vertex_partition_range(),
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          raft::device_span<vertex_t const>(mg_partners.data(), mg_partners.size()));
+
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::optional<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+        sg_edge_weights{std::nullopt};
+      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<raft::device_span<vertex_t const>>(std::nullopt),
+        false);
+
+      if (handle_->get_comms().get_rank() == 0) {
+        auto sg_graph_view = sg_graph.view();
+
+        rmm::device_uvector<vertex_t> sg_partners(0, handle_->get_stream());
+        weight_t sg_matching_weights;
+
+        std::forward_as_tuple(sg_partners, sg_matching_weights) =
+          cugraph::approximate_weighted_matching<vertex_t, edge_t, weight_t, false>(
+            *handle_, sg_graph_view, (*sg_edge_weights).view());
+        auto h_sg_partners           = cugraph::test::to_host(*handle_, sg_partners);
+        auto h_mg_aggregate_partners = cugraph::test::to_host(*handle_, mg_aggregate_partners);
+
+        ASSERT_FLOAT_EQ(mg_matching_weights, sg_matching_weights)
+          << "SG and MG matching weights are different";
+        // Sizes are compared too: the 4-iterator std::equal fails on a length mismatch.
+        ASSERT_TRUE(std::equal(h_sg_partners.begin(),
+                               h_sg_partners.end(),
+                               h_mg_aggregate_partners.begin(),
+                               h_mg_aggregate_partners.end()))
+          << "SG and MG partner assignments are different";
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGWeightedMatching<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGWeightedMatching_File = Tests_MGWeightedMatching<cugraph::test::File_Usecase>;
+using Tests_MGWeightedMatching_Rmat = Tests_MGWeightedMatching<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGWeightedMatching_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGWeightedMatching_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGWeightedMatching_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGWeightedMatching_File,
+  ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false},
+                                       WeightedMatching_UseCase{true}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGWeightedMatching_Rmat,
+                         ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false},
+                                                              WeightedMatching_UseCase{true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              3, 2, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGWeightedMatching_Rmat,
+  ::testing::Combine(
+    ::testing::Values(WeightedMatching_UseCase{false, false},
+                      WeightedMatching_UseCase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/weighted_matching_test.cpp b/cpp/tests/community/weighted_matching_test.cpp
new file mode 100644
index 00000000000..436273c3be3
--- /dev/null
+++ b/cpp/tests/community/weighted_matching_test.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct WeightedMatching_UseCase {
+  bool edge_masking{false};      // if true, the test attaches a generated boolean edge mask
+  bool check_correctness{true};  // if true, the test validates the matching after running it
+};
+
+template <typename input_usecase_t>
+class Tests_SGWeightedMatching
+  : public ::testing::TestWithParam<std::tuple<WeightedMatching_UseCase, input_usecase_t>> {
+ public:
+  Tests_SGWeightedMatching() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  // Builds a symmetrized weighted graph, runs the matching, and checks that partners are mutual.
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<WeightedMatching_UseCase, input_usecase_t> const& param)
+  {
+    auto [weighted_matching_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
+    }
+
+    constexpr bool multi_gpu = false;
+
+    bool test_weighted    = true;
+    bool renumber         = true;
+    bool drop_self_loops  = false;
+    bool drop_multi_edges = false;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
+
+    std::tie(sg_graph, sg_edge_weights, sg_renumber_map) = cugraph::symmetrize_graph(
+      handle, std::move(sg_graph), std::move(sg_edge_weights), std::move(sg_renumber_map), false);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto sg_graph_view = sg_graph.view();
+    auto sg_edge_weight_view =
+      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(sg_graph_view), bool>> edge_mask{std::nullopt};
+    if (weighted_matching_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(sg_graph_view), bool>::edge_property(
+        handle, sg_graph_view, 2);
+      sg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    rmm::device_uvector<vertex_t> d_partners(0, handle.get_stream());
+    weight_t total_matching_weights;
+
+    std::forward_as_tuple(d_partners, total_matching_weights) =
+      cugraph::approximate_weighted_matching<vertex_t, edge_t, weight_t, multi_gpu>(
+        handle, sg_graph_view, (*sg_edge_weights).view());
+
+    if (weighted_matching_usecase.check_correctness) {
+      auto h_partners                = cugraph::test::to_host(handle, d_partners);
+      auto constexpr invalid_partner = cugraph::invalid_vertex_id<vertex_t>::value;
+      std::for_each(h_partners.begin(), h_partners.end(), [&invalid_partner, &h_partners](auto& v) {
+        if (v == invalid_partner) return;  // unmatched vertex: nothing to verify
+        ASSERT_TRUE(h_partners[v] != invalid_partner && h_partners[h_partners[v]] == v);
+      });
+    }
+  }
+};
+
+using Tests_SGWeightedMatching_File = Tests_SGWeightedMatching<cugraph::test::File_Usecase>;
+using Tests_SGWeightedMatching_Rmat = Tests_SGWeightedMatching<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_SGWeightedMatching_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGWeightedMatching_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGWeightedMatching_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGWeightedMatching_File,
+  ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false},
+                                       WeightedMatching_UseCase{true}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_SGWeightedMatching_Rmat,
+                         ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false},
+                                                              WeightedMatching_UseCase{true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              3, 3, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGWeightedMatching_Rmat,
+  ::testing::Combine(
+    ::testing::Values(WeightedMatching_UseCase{false, false},
+                      WeightedMatching_UseCase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()

From 5b72af3a8a8434201a017d659108701ff3a077f6 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 21 May 2024 16:25:02 +0200
Subject: [PATCH 07/23] Fix a bug in kv_store_t implementation (#4434)

Fix a bug in kv_store_t implementation

Authors:
  - Naim (https://github.com/naimnv)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4434
---
 cpp/src/prims/kv_store.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh
index 2cc7856d87a..76b64b5692b 100644
--- a/cpp/src/prims/kv_store.cuh
+++ b/cpp/src/prims/kv_store.cuh
@@ -526,6 +526,7 @@ class kv_cuco_store_t {
                     std::conditional_t<!std::is_arithmetic_v<value_t>, value_t, void>>(0, stream))
   {
     allocate(capacity, invalid_key, invalid_value, stream);
+    if constexpr (!std::is_arithmetic_v<value_t>) { invalid_value_ = invalid_value; }
     capacity_ = capacity;
     size_     = 0;
   }

From 8d8b4fde6922efbae88b46e840b9af7f089b30bb Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Tue, 21 May 2024 12:18:07 -0400
Subject: [PATCH 08/23] Update DGL_support.md (#4327)

Believe the path suggested previously is outdated.

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Don Acosta (https://github.com/acostadon)
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Don Acosta (https://github.com/acostadon)

URL: https://github.com/rapidsai/cugraph/pull/4327
---
 docs/cugraph/source/graph_support/DGL_support.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cugraph/source/graph_support/DGL_support.md b/docs/cugraph/source/graph_support/DGL_support.md
index dc4f66180ac..9df462155fd 100644
--- a/docs/cugraph/source/graph_support/DGL_support.md
+++ b/docs/cugraph/source/graph_support/DGL_support.md
@@ -17,7 +17,7 @@ mamba install cugraph-dgl -c rapidsai-nightly -c rapidsai -c pytorch -c conda-fo
 
 ### Create the conda development environment
 ```
-mamba env create -n cugraph_dgl_dev --file conda/cugraph_dgl_dev_11.6.yml
+conda env create -n cugraph_dgl_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml
 ```
 
 ### Install  in editable mode

From ddfaacf1bd9d305b31e2c55b7ae954ba13f21899 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Tue, 21 May 2024 21:24:18 -0400
Subject: [PATCH 09/23] DOC: doc-update-link-for-cugraphops (#4279)

Fixes a broken link

https://github.com/rapidsai/cugraph-ops/blob/branch-23.04/README.md -> https://github.com/rapidsai/cugraph/blob/branch-24.06/readme_pages/cugraph_ops.md

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Don Acosta (https://github.com/acostadon)

URL: https://github.com/rapidsai/cugraph/pull/4279
---
 docs/cugraph/source/graph_support/cugraphops_support.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cugraph/source/graph_support/cugraphops_support.rst b/docs/cugraph/source/graph_support/cugraphops_support.rst
index fd79564f849..96b13f62a9c 100644
--- a/docs/cugraph/source/graph_support/cugraphops_support.rst
+++ b/docs/cugraph/source/graph_support/cugraphops_support.rst
@@ -7,4 +7,4 @@ cugraph-ops aims to be a low-level, framework agnostic library providing commonl
 .. toctree::
    :maxdepth: 3
 
-   https://github.com/rapidsai/cugraph-ops/blob/branch-23.04/README.md
+   https://github.com/rapidsai/cugraph/blob/branch-24.06/readme_pages/cugraph_ops.md

From 0d64b729568ea5240ea32d990542a9edf471f349 Mon Sep 17 00:00:00 2001
From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com>
Date: Thu, 23 May 2024 17:12:32 -0400
Subject: [PATCH 10/23] Fix stream synchronization in MTMG graph construction
 (#4275)

Restructure to pass stream instead of handle and synchronize appropriately.

Closes #4236

Authors:
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/4275
---
 .../mtmg/detail/per_device_edgelist.hpp       | 90 +++++++++----------
 cpp/include/cugraph/mtmg/edge_property.hpp    |  3 +-
 .../cugraph/mtmg/edge_property_view.hpp       |  3 +-
 cpp/include/cugraph/mtmg/edgelist.hpp         | 10 ++-
 cpp/include/cugraph/mtmg/handle.hpp           |  7 +-
 .../cugraph/mtmg/per_thread_edgelist.hpp      | 45 +++++-----
 cpp/tests/mtmg/multi_node_threaded_test.cu    |  6 +-
 cpp/tests/mtmg/threaded_test.cu               |  6 +-
 cpp/tests/mtmg/threaded_test_jaccard.cu       |  6 +-
 cpp/tests/mtmg/threaded_test_louvain.cu       |  6 +-
 10 files changed, 96 insertions(+), 86 deletions(-)

diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
index 7fd5bb726e6..63d7fd9685e 100644
--- a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
+++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,17 +62,17 @@ class per_device_edgelist_t {
   /**
    * @brief Construct a new per device edgelist t object
    *
-   * @param handle             MTMG resource handle - used to identify GPU resources
    * @param device_buffer_size Number of edges to store in each device buffer
    * @param use_weight         Whether or not the edgelist will have weights
    * @param use_edge_id        Whether or not the edgelist will have edge ids
    * @param use_edge_type      Whether or not the edgelist will have edge types
+   * @param stream_view        CUDA stream view
    */
-  per_device_edgelist_t(cugraph::mtmg::handle_t const& handle,
-                        size_t device_buffer_size,
+  per_device_edgelist_t(size_t device_buffer_size,
                         bool use_weight,
                         bool use_edge_id,
-                        bool use_edge_type)
+                        bool use_edge_type,
+                        rmm::cuda_stream_view stream_view)
     : device_buffer_size_{device_buffer_size},
       current_pos_{0},
       src_{},
@@ -89,7 +89,7 @@ class per_device_edgelist_t {
       edge_type_ = std::make_optional(std::vector<rmm::device_uvector<edge_type_t>>());
     }
 
-    create_new_buffers(handle);
+    create_new_buffers(stream_view);
   }
 
   /**
@@ -111,19 +111,19 @@ class per_device_edgelist_t {
   /**
    * @brief Append a list of edges to the edge list
    *
-   * @param handle     The resource handle
-   * @param src        Source vertex id
-   * @param dst        Destination vertex id
-   * @param wgt        Edge weight
-   * @param edge_id    Edge id
-   * @param edge_type  Edge type
+   * @param src         Source vertex id
+   * @param dst         Destination vertex id
+   * @param wgt         Edge weight
+   * @param edge_id     Edge id
+   * @param edge_type   Edge type
+   * @param stream_view CUDA stream view
    */
-  void append(handle_t const& handle,
-              raft::host_span<vertex_t const> src,
+  void append(raft::host_span<vertex_t const> src,
               raft::host_span<vertex_t const> dst,
               std::optional<raft::host_span<weight_t const>> wgt,
               std::optional<raft::host_span<edge_t const>> edge_id,
-              std::optional<raft::host_span<edge_type_t const>> edge_type)
+              std::optional<raft::host_span<edge_type_t const>> edge_type,
+              rmm::cuda_stream_view stream_view)
   {
     std::vector<std::tuple<size_t, size_t, size_t, size_t>> copy_positions;
 
@@ -142,13 +142,13 @@ class per_device_edgelist_t {
         pos += copy_count;
         current_pos_ += copy_count;
 
-        if (current_pos_ == src_.back().size()) { create_new_buffers(handle); }
+        if (current_pos_ == src_.back().size()) { create_new_buffers(stream_view); }
       }
     }
 
     std::for_each(copy_positions.begin(),
                   copy_positions.end(),
-                  [&handle,
+                  [&stream_view,
                    &this_src = src_,
                    &src,
                    &this_dst = dst_,
@@ -164,47 +164,45 @@ class per_device_edgelist_t {
                     raft::update_device(this_src[buffer_idx].begin() + buffer_pos,
                                         src.begin() + input_pos,
                                         copy_count,
-                                        handle.get_stream());
+                                        stream_view);
 
                     raft::update_device(this_dst[buffer_idx].begin() + buffer_pos,
                                         dst.begin() + input_pos,
                                         copy_count,
-                                        handle.get_stream());
+                                        stream_view);
 
                     if (this_wgt)
                       raft::update_device((*this_wgt)[buffer_idx].begin() + buffer_pos,
                                           wgt->begin() + input_pos,
                                           copy_count,
-                                          handle.get_stream());
+                                          stream_view);
 
                     if (this_edge_id)
                       raft::update_device((*this_edge_id)[buffer_idx].begin() + buffer_pos,
                                           edge_id->begin() + input_pos,
                                           copy_count,
-                                          handle.get_stream());
+                                          stream_view);
 
                     if (this_edge_type)
                       raft::update_device((*this_edge_type)[buffer_idx].begin() + buffer_pos,
                                           edge_type->begin() + input_pos,
                                           copy_count,
-                                          handle.get_stream());
+                                          stream_view);
                   });
-
-    handle.sync_stream();
   }
 
   /**
    * @brief  Mark the edgelist as ready for reading (all writes are complete)
    *
-   * @param handle     The resource handle
+   * @param stream_view  CUDA stream view
    */
-  void finalize_buffer(handle_t const& handle)
+  void finalize_buffer(rmm::cuda_stream_view stream_view)
   {
-    src_.back().resize(current_pos_, handle.get_stream());
-    dst_.back().resize(current_pos_, handle.get_stream());
-    if (wgt_) wgt_->back().resize(current_pos_, handle.get_stream());
-    if (edge_id_) edge_id_->back().resize(current_pos_, handle.get_stream());
-    if (edge_type_) edge_type_->back().resize(current_pos_, handle.get_stream());
+    src_.back().resize(current_pos_, stream_view);
+    dst_.back().resize(current_pos_, stream_view);
+    if (wgt_) wgt_->back().resize(current_pos_, stream_view);
+    if (edge_id_) edge_id_->back().resize(current_pos_, stream_view);
+    if (edge_type_) edge_type_->back().resize(current_pos_, stream_view);
   }
 
   bool use_weight() const { return wgt_.has_value(); }
@@ -230,16 +228,18 @@ class per_device_edgelist_t {
   void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed)
   {
     if (src_.size() > 1) {
+      auto stream = handle.raft_handle().get_stream();
+
       size_t total_size = std::transform_reduce(
         src_.begin(), src_.end(), size_t{0}, std::plus<size_t>(), [](auto& d_vector) {
           return d_vector.size();
         });
 
-      resize_and_copy_buffers(handle.get_stream(), src_, total_size);
-      resize_and_copy_buffers(handle.get_stream(), dst_, total_size);
-      if (wgt_) resize_and_copy_buffers(handle.get_stream(), *wgt_, total_size);
-      if (edge_id_) resize_and_copy_buffers(handle.get_stream(), *edge_id_, total_size);
-      if (edge_type_) resize_and_copy_buffers(handle.get_stream(), *edge_type_, total_size);
+      resize_and_copy_buffers(src_, total_size, stream);
+      resize_and_copy_buffers(dst_, total_size, stream);
+      if (wgt_) resize_and_copy_buffers(*wgt_, total_size, stream);
+      if (edge_id_) resize_and_copy_buffers(*edge_id_, total_size, stream);
+      if (edge_type_) resize_and_copy_buffers(*edge_type_, total_size, stream);
     }
 
     auto tmp_wgt     = wgt_ ? std::make_optional(std::move((*wgt_)[0])) : std::nullopt;
@@ -267,9 +267,9 @@ class per_device_edgelist_t {
 
  private:
   template <typename T>
-  void resize_and_copy_buffers(rmm::cuda_stream_view stream,
-                               std::vector<rmm::device_uvector<T>>& buffer,
-                               size_t total_size)
+  void resize_and_copy_buffers(std::vector<rmm::device_uvector<T>>& buffer,
+                               size_t total_size,
+                               rmm::cuda_stream_view stream)
   {
     size_t pos = buffer[0].size();
     buffer[0].resize(total_size, stream);
@@ -286,16 +286,16 @@ class per_device_edgelist_t {
     buffer = std::move(new_buffer);
   }
 
-  void create_new_buffers(cugraph::mtmg::handle_t const& handle)
+  void create_new_buffers(rmm::cuda_stream_view stream_view)
   {
-    src_.emplace_back(device_buffer_size_, handle.get_stream());
-    dst_.emplace_back(device_buffer_size_, handle.get_stream());
+    src_.emplace_back(device_buffer_size_, stream_view);
+    dst_.emplace_back(device_buffer_size_, stream_view);
 
-    if (wgt_) { wgt_->emplace_back(device_buffer_size_, handle.get_stream()); }
+    if (wgt_) { wgt_->emplace_back(device_buffer_size_, stream_view); }
 
-    if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, handle.get_stream()); }
+    if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, stream_view); }
 
-    if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, handle.get_stream()); }
+    if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, stream_view); }
 
     current_pos_ = 0;
   }
diff --git a/cpp/include/cugraph/mtmg/edge_property.hpp b/cpp/include/cugraph/mtmg/edge_property.hpp
index afa72492b9a..0b27ca85e46 100644
--- a/cpp/include/cugraph/mtmg/edge_property.hpp
+++ b/cpp/include/cugraph/mtmg/edge_property.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 
 #include <cugraph/mtmg/detail/device_shared_wrapper.hpp>
 #include <cugraph/mtmg/edge_property_view.hpp>
-#include <cugraph/mtmg/handle.hpp>
 
 namespace cugraph {
 namespace mtmg {
diff --git a/cpp/include/cugraph/mtmg/edge_property_view.hpp b/cpp/include/cugraph/mtmg/edge_property_view.hpp
index c84a6458e1d..6416ea382ef 100644
--- a/cpp/include/cugraph/mtmg/edge_property_view.hpp
+++ b/cpp/include/cugraph/mtmg/edge_property_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cugraph/mtmg/detail/device_shared_wrapper.hpp>
-#include <cugraph/mtmg/handle.hpp>
 
 namespace cugraph {
 namespace mtmg {
diff --git a/cpp/include/cugraph/mtmg/edgelist.hpp b/cpp/include/cugraph/mtmg/edgelist.hpp
index 90c53dfbb64..d5d2bd2bca7 100644
--- a/cpp/include/cugraph/mtmg/edgelist.hpp
+++ b/cpp/include/cugraph/mtmg/edgelist.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,7 +39,7 @@ class edgelist_t : public detail::device_shared_wrapper_t<
            bool use_edge_type)
   {
     detail::per_device_edgelist_t<vertex_t, weight_t, edge_t, edge_type_t> tmp(
-      handle, device_buffer_size, use_weight, use_edge_id, use_edge_type);
+      device_buffer_size, use_weight, use_edge_id, use_edge_type, handle.get_stream());
 
     detail::device_shared_wrapper_t<
       detail::per_device_edgelist_t<vertex_t, weight_t, edge_t, edge_type_t>>::set(handle,
@@ -49,7 +49,11 @@ class edgelist_t : public detail::device_shared_wrapper_t<
   /**
    * @brief Stop inserting edges into this edgelist so we can use the edges
    */
-  void finalize_buffer(handle_t const& handle) { this->get(handle).finalize_buffer(handle); }
+  void finalize_buffer(handle_t const& handle)
+  {
+    handle.sync_stream_pool();
+    this->get(handle).finalize_buffer(handle.get_stream());
+  }
 
   /**
    * @brief Consolidate for the edgelist edges into a single edgelist and then
diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp
index 0b02091a3cc..26c283f6acf 100644
--- a/cpp/include/cugraph/mtmg/handle.hpp
+++ b/cpp/include/cugraph/mtmg/handle.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -79,6 +79,11 @@ class handle_t {
    */
   void sync_stream() const { sync_stream(get_stream()); }
 
+  /**
+   * @brief Sync all streams in the stream pool
+   */
+  void sync_stream_pool() const { raft::resource::sync_stream_pool(raft_handle_); }
+
   /**
    * @brief get thrust policy for the stream
    *
diff --git a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp
index b672db48719..73d69fdd5a7 100644
--- a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp
+++ b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 
 #include <cugraph/mtmg/detail/device_shared_wrapper.hpp>
 #include <cugraph/mtmg/detail/per_device_edgelist.hpp>
-#include <cugraph/mtmg/handle.hpp>
 
 namespace cugraph {
 namespace mtmg {
@@ -70,21 +69,21 @@ class per_thread_edgelist_t {
   /**
    * @brief Append an edge to the edge list
    *
-   * @param handle     The resource handle
-   * @param src        Source vertex id
-   * @param dst        Destination vertex id
-   * @param wgt        Edge weight
-   * @param edge_id    Edge id
-   * @param edge_type  Edge type
+   * @param src         Source vertex id
+   * @param dst         Destination vertex id
+   * @param wgt         Edge weight
+   * @param edge_id     Edge id
+   * @param edge_type   Edge type
+   * @param stream_view The cuda stream
    */
-  void append(handle_t const& handle,
-              vertex_t src,
+  void append(vertex_t src,
               vertex_t dst,
               std::optional<weight_t> wgt,
               std::optional<edge_t> edge_id,
-              std::optional<edge_type_t> edge_type)
+              std::optional<edge_type_t> edge_type,
+              rmm::cuda_stream_view stream_view)
   {
-    if (current_pos_ == src_.size()) { flush(handle); }
+    if (current_pos_ == src_.size()) { flush(stream_view); }
 
     src_[current_pos_] = src;
     dst_[current_pos_] = dst;
@@ -98,19 +97,19 @@ class per_thread_edgelist_t {
   /**
    * @brief Append a list of edges to the edge list
    *
-   * @param handle     The resource handle
    * @param src        Source vertex id
    * @param dst        Destination vertex id
    * @param wgt        Edge weight
    * @param edge_id    Edge id
    * @param edge_type  Edge type
+   * @param stream_view The cuda stream
    */
-  void append(handle_t const& handle,
-              raft::host_span<vertex_t const> src,
+  void append(raft::host_span<vertex_t const> src,
               raft::host_span<vertex_t const> dst,
               std::optional<raft::host_span<weight_t const>> wgt,
               std::optional<raft::host_span<edge_t const>> edge_id,
-              std::optional<raft::host_span<edge_type_t const>> edge_type)
+              std::optional<raft::host_span<edge_type_t const>> edge_type,
+              rmm::cuda_stream_view stream_view)
   {
     size_t count = src.size();
     size_t pos   = 0;
@@ -131,7 +130,7 @@ class per_thread_edgelist_t {
                   edge_type.begin() + pos + copy_count,
                   edge_type_->begin() + current_pos_);
 
-      if (current_pos_ == src_.size()) { flush(handle); }
+      if (current_pos_ == src_.size()) { flush(stream_view); }
 
       count -= copy_count;
       pos += copy_count;
@@ -141,12 +140,13 @@ class per_thread_edgelist_t {
   /**
    * @brief Flush thread data from host to GPU memory
    *
-   * @param handle     The resource handle
+   * @param stream_view The cuda stream
+   * @param sync       If true, synchronize the asynchronous copy of data;
+   *                   defaults to false.
    */
-  void flush(handle_t const& handle)
+  void flush(rmm::cuda_stream_view stream_view, bool sync = false)
   {
     edgelist_.append(
-      handle,
       raft::host_span<vertex_t const>{src_.data(), current_pos_},
       raft::host_span<vertex_t const>{dst_.data(), current_pos_},
       wgt_ ? std::make_optional(raft::host_span<weight_t const>{wgt_->data(), current_pos_})
@@ -155,9 +155,12 @@ class per_thread_edgelist_t {
                : std::nullopt,
       edge_type_
         ? std::make_optional(raft::host_span<edge_type_t const>{edge_type_->data(), current_pos_})
-        : std::nullopt);
+        : std::nullopt,
+      stream_view);
 
     current_pos_ = 0;
+
+    if (sync) stream_view.synchronize();
   }
 
  private:
diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu
index 1ad83761d51..24852562b86 100644
--- a/cpp/tests/mtmg/multi_node_threaded_test.cu
+++ b/cpp/tests/mtmg/multi_node_threaded_test.cu
@@ -175,15 +175,15 @@ class Tests_Multithreaded
 
         for (size_t j = starting_edge_offset; j < h_src_v.size(); j += stride) {
           per_thread_edgelist.append(
-            thread_handle,
             h_src_v[j],
             h_dst_v[j],
             h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt,
             std::nullopt,
-            std::nullopt);
+            std::nullopt,
+            thread_handle.get_stream());
         }
 
-        per_thread_edgelist.flush(thread_handle);
+        per_thread_edgelist.flush(thread_handle.get_stream());
       });
     }
 
diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu
index f55a102ea67..df5a9e079df 100644
--- a/cpp/tests/mtmg/threaded_test.cu
+++ b/cpp/tests/mtmg/threaded_test.cu
@@ -191,15 +191,15 @@ class Tests_Multithreaded
 
         for (size_t j = i; j < h_src_v.size(); j += num_threads) {
           per_thread_edgelist.append(
-            thread_handle,
             h_src_v[j],
             h_dst_v[j],
             h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt,
             std::nullopt,
-            std::nullopt);
+            std::nullopt,
+            thread_handle.get_stream());
         }
 
-        per_thread_edgelist.flush(thread_handle);
+        per_thread_edgelist.flush(thread_handle.get_stream());
       });
     }
 
diff --git a/cpp/tests/mtmg/threaded_test_jaccard.cu b/cpp/tests/mtmg/threaded_test_jaccard.cu
index a64cc8ee1fa..0f531796cff 100644
--- a/cpp/tests/mtmg/threaded_test_jaccard.cu
+++ b/cpp/tests/mtmg/threaded_test_jaccard.cu
@@ -184,15 +184,15 @@ class Tests_Multithreaded
 
         for (size_t j = i; j < h_src_v.size(); j += num_threads) {
           per_thread_edgelist.append(
-            thread_handle,
             h_src_v[j],
             h_dst_v[j],
             h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt,
             std::nullopt,
-            std::nullopt);
+            std::nullopt,
+            thread_handle.get_stream());
         }
 
-        per_thread_edgelist.flush(thread_handle);
+        per_thread_edgelist.flush(thread_handle.get_stream());
       });
     }
 
diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu
index c8faf33dae2..ab51d701b57 100644
--- a/cpp/tests/mtmg/threaded_test_louvain.cu
+++ b/cpp/tests/mtmg/threaded_test_louvain.cu
@@ -191,15 +191,15 @@ class Tests_Multithreaded
 
         for (size_t j = i; j < h_src_v.size(); j += num_threads) {
           per_thread_edgelist.append(
-            thread_handle,
             h_src_v[j],
             h_dst_v[j],
             h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt,
             std::nullopt,
-            std::nullopt);
+            std::nullopt,
+            thread_handle.get_stream());
         }
 
-        per_thread_edgelist.flush(thread_handle);
+        per_thread_edgelist.flush(thread_handle.get_stream());
       });
     }
 

From e6c842fff4c88358f39d75cdcc01a689a6eea912 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Thu, 23 May 2024 21:46:20 -0700
Subject: [PATCH 11/23] Biased sampling primitive (#4430)

This PR restructures the current sampling primitive implementation and adds biased sampling support.

Closes #4288

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4430
---
 .../cugraph/edge_partition_device_view.cuh    |   70 +-
 .../cugraph/utilities/thrust_tuple_utils.hpp  |   62 +-
 .../cugraph/vertex_partition_device_view.cuh  |    3 +-
 .../detail/extract_transform_v_frontier_e.cuh |   57 +-
 cpp/src/prims/detail/partition_v_frontier.cuh |   91 +
 cpp/src/prims/detail/prim_functors.cuh        |    7 +-
 .../sample_and_compute_local_nbr_indices.cuh  | 2441 +++++++++++++++++
 .../prims/detail/transform_v_frontier_e.cuh   |  627 +++++
 ...r_v_random_select_transform_outgoing_e.cuh | 1235 +--------
 .../transform_reduce_e_by_src_dst_key.cuh     |   26 +-
 ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh |   17 +-
 cpp/src/prims/update_v_frontier.cuh           |   16 +-
 ...er_v_random_select_transform_outgoing_e.cu |  130 +-
 cpp/tests/utilities/debug_utilities_mg.cpp    |   12 +-
 14 files changed, 3547 insertions(+), 1247 deletions(-)
 create mode 100644 cpp/src/prims/detail/partition_v_frontier.cuh
 create mode 100644 cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh
 create mode 100644 cpp/src/prims/detail/transform_v_frontier_e.cuh

diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index fc19a8f68dd..583b0a37214 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -214,9 +214,9 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MajorIterator>
-  size_t compute_number_of_edges(MajorIterator major_first,
-                                 MajorIterator major_last,
-                                 rmm::cuda_stream_view stream) const
+  __host__ size_t compute_number_of_edges(MajorIterator major_first,
+                                          MajorIterator major_last,
+                                          rmm::cuda_stream_view stream) const
   {
     return dcs_nzd_vertices_ ? thrust::transform_reduce(
                                  rmm::exec_policy(stream),
@@ -250,7 +250,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                  thrust::plus<size_t>());
   }
 
-  rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     if (dcs_nzd_vertices_) {
@@ -277,9 +277,9 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MajorIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
-                                                    MajorIterator major_last,
-                                                    rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
+                                                             MajorIterator major_last,
+                                                             rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     if (dcs_nzd_vertices_) {
@@ -306,10 +306,10 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator, typename MajorIterator>
-  size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
-                                           MajorIterator major_first,
-                                           MajorIterator major_last,
-                                           rmm::cuda_stream_view stream) const
+  __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
+                                                    MajorIterator major_first,
+                                                    MajorIterator major_last,
+                                                    rmm::cuda_stream_view stream) const
   {
     return dcs_nzd_vertices_ ? thrust::transform_reduce(
                                  rmm::exec_policy(stream),
@@ -348,8 +348,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
-                                                              rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees_with_mask(
+    MaskIterator mask_first, rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     if (dcs_nzd_vertices_) {
@@ -384,10 +384,11 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator, typename MajorIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
-                                                              MajorIterator major_first,
-                                                              MajorIterator major_last,
-                                                              rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees_with_mask(
+    MaskIterator mask_first,
+    MajorIterator major_first,
+    MajorIterator major_last,
+    rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     if (dcs_nzd_vertices_) {
@@ -553,9 +554,9 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MajorIterator>
-  size_t compute_number_of_edges(MajorIterator major_first,
-                                 MajorIterator major_last,
-                                 rmm::cuda_stream_view stream) const
+  __host__ size_t compute_number_of_edges(MajorIterator major_first,
+                                          MajorIterator major_last,
+                                          rmm::cuda_stream_view stream) const
   {
     return thrust::transform_reduce(
       rmm::exec_policy(stream),
@@ -573,7 +574,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
       thrust::plus<size_t>());
   }
 
-  rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     thrust::transform(rmm::exec_policy(stream),
@@ -589,9 +590,9 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MajorIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
-                                                    MajorIterator major_last,
-                                                    rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
+                                                             MajorIterator major_last,
+                                                             rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     thrust::transform(rmm::exec_policy(stream),
@@ -607,10 +608,10 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator, typename MajorIterator>
-  size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
-                                           MajorIterator major_first,
-                                           MajorIterator major_last,
-                                           rmm::cuda_stream_view stream) const
+  __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
+                                                    MajorIterator major_first,
+                                                    MajorIterator major_last,
+                                                    rmm::cuda_stream_view stream) const
   {
     return thrust::transform_reduce(
       rmm::exec_policy(stream),
@@ -632,8 +633,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
-                                                              rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees_with_mask(
+    MaskIterator mask_first, rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     thrust::transform(
@@ -651,10 +652,11 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   }
 
   template <typename MaskIterator, typename MajorIterator>
-  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
-                                                              MajorIterator major_first,
-                                                              MajorIterator major_last,
-                                                              rmm::cuda_stream_view stream) const
+  __host__ rmm::device_uvector<edge_t> compute_local_degrees_with_mask(
+    MaskIterator mask_first,
+    MajorIterator major_first,
+    MajorIterator major_last,
+    rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     thrust::transform(
diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
index d98754f51d1..304a5b94bd6 100644
--- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
+++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/tuple.h>
 
 #include <array>
@@ -30,7 +31,7 @@ template <typename TupleType, size_t I, size_t N>
 struct is_thrust_tuple_of_arithemetic_impl {
   constexpr bool evaluate() const
   {
-    if (!std::is_arithmetic<typename thrust::tuple_element<I, TupleType>::type>::value) {
+    if (!std::is_arithmetic_v<typename thrust::tuple_element<I, TupleType>::type>) {
       return false;
     } else {
       return is_thrust_tuple_of_arithemetic_impl<TupleType, I + 1, N>().evaluate();
@@ -123,19 +124,19 @@ struct is_arithmetic_vector : std::false_type {};
 
 template <template <typename> typename Vector, typename T>
 struct is_arithmetic_vector<Vector<T>, Vector>
-  : std::integral_constant<bool, std::is_arithmetic<T>::value> {};
+  : std::integral_constant<bool, std::is_arithmetic_v<T>> {};
 
 template <typename T>
 struct is_std_tuple_of_arithmetic_vectors : std::false_type {};
 
 template <typename... Ts>
 struct is_std_tuple_of_arithmetic_vectors<std::tuple<rmm::device_uvector<Ts>...>> {
-  static constexpr bool value = (... && std::is_arithmetic<Ts>::value);
+  static constexpr bool value = (... && std::is_arithmetic_v<Ts>);
 };
 
 template <typename T>
 struct is_arithmetic_or_thrust_tuple_of_arithmetic
-  : std::integral_constant<bool, std::is_arithmetic<T>::value> {};
+  : std::integral_constant<bool, std::is_arithmetic_v<T>> {};
 
 template <typename... Ts>
 struct is_arithmetic_or_thrust_tuple_of_arithmetic<thrust::tuple<Ts...>>
@@ -196,8 +197,8 @@ auto to_thrust_tuple(thrust::tuple<Ts...> tuple_value)
 }
 
 template <typename Iterator,
-          typename std::enable_if_t<std::is_arithmetic<
-            typename std::iterator_traits<Iterator>::value_type>::value>* = nullptr>
+          typename std::enable_if_t<
+            std::is_arithmetic_v<typename std::iterator_traits<Iterator>::value_type>>* = nullptr>
 auto to_thrust_iterator_tuple(Iterator iter)
 {
   return thrust::make_tuple(iter);
@@ -211,6 +212,53 @@ auto to_thrust_iterator_tuple(Iterator iter)
   return iter.get_iterator_tuple();
 }
 
+template <typename T, size_t I, typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+  auto
+  thrust_tuple_get_or_identity(T val)
+{
+  return val;
+}
+
+template <typename T,
+          size_t I,
+          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+  auto
+  thrust_tuple_get_or_identity(T val)
+{
+  return thrust::get<I>(val);
+}
+
+template <typename Iterator,
+          size_t I,
+          typename std::enable_if_t<std::is_arithmetic_v<
+            typename thrust::iterator_traits<Iterator>::value_type>>* = nullptr>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+  auto
+  thrust_tuple_get_or_identity(Iterator val)
+{
+  return val;
+}
+
+template <typename Iterator,
+          size_t I,
+          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<
+            typename thrust::iterator_traits<Iterator>::value_type>::value>* = nullptr>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+  auto
+  thrust_tuple_get_or_identity(Iterator val)
+{
+  return thrust::get<I>(val.get_iterator_tuple());
+}
 // a temporary function to emulate thrust::tuple_cat (not supported) using std::tuple_cat (should
 // retire once thrust::tuple is replaced with cuda::std::tuple)
 template <typename... TupleTypes>
diff --git a/cpp/include/cugraph/vertex_partition_device_view.cuh b/cpp/include/cugraph/vertex_partition_device_view.cuh
index ac1e340210b..20f889fa191 100644
--- a/cpp/include/cugraph/vertex_partition_device_view.cuh
+++ b/cpp/include/cugraph/vertex_partition_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cugraph/utilities/error.hpp>
+#include <cugraph/vertex_partition_view.hpp>
 
 #include <type_traits>
 
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index ff9a4f506d0..04448c9e51d 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -188,13 +188,8 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree(
 
     edge_t local_degree{0};
     if (lane_id < static_cast<int32_t>(max_key_idx - min_key_idx)) {
-      auto key = *(key_first + idx);
-      vertex_t major{};
-      if constexpr (std::is_same_v<key_t, vertex_t>) {
-        major = key;
-      } else {
-        major = thrust::get<0>(key);
-      }
+      auto key   = *(key_first + idx);
+      auto major = thrust_tuple_get_or_identity<key_t, 0>(key);
       if constexpr (hypersparse) {
         auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
         if (major_hypersparse_idx) {
@@ -333,13 +328,8 @@ __global__ static void extract_transform_v_frontier_e_mid_degree(
   cuda::atomic_ref<size_t, cuda::thread_scope_device> buffer_idx(*buffer_idx_ptr);
 
   while (idx < static_cast<size_t>(thrust::distance(key_first, key_last))) {
-    auto key = *(key_first + idx);
-    vertex_t major{};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      major = key;
-    } else {
-      major = thrust::get<0>(key);
-    }
+    auto key          = *(key_first + idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
     auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     vertex_t const* indices{nullptr};
     edge_t local_edge_offset{};
@@ -432,13 +422,8 @@ __global__ static void extract_transform_v_frontier_e_high_degree(
   cuda::atomic_ref<size_t, cuda::thread_scope_device> buffer_idx(*buffer_idx_ptr);
 
   while (idx < static_cast<size_t>(thrust::distance(key_first, key_last))) {
-    auto key = *(key_first + idx);
-    vertex_t major{};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      major = key;
-    } else {
-      major = thrust::get<0>(key);
-    }
+    auto key          = *(key_first + idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
     auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     vertex_t const* indices{nullptr};
     edge_t local_edge_offset{};
@@ -561,15 +546,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
                                                          thrust::optional<output_value_t>>>>);
 
   if (do_expensive_check) {
-    vertex_t const* frontier_vertex_first{nullptr};
-    vertex_t const* frontier_vertex_last{nullptr};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      frontier_vertex_first = frontier.begin();
-      frontier_vertex_last  = frontier.end();
-    } else {
-      frontier_vertex_first = thrust::get<0>(frontier.begin().get_iterator_tuple());
-      frontier_vertex_last  = thrust::get<0>(frontier.end().get_iterator_tuple());
-    }
+    auto frontier_vertex_first =
+      thrust_tuple_get_or_identity<decltype(frontier.begin()), 0>(frontier.begin());
+    auto frontier_vertex_last =
+      thrust_tuple_get_or_identity<decltype(frontier.end()), 0>(frontier.end());
     auto num_invalid_keys =
       frontier.size() -
       thrust::count_if(handle.get_thrust_policy(),
@@ -659,17 +639,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
         get_dataframe_buffer_end(edge_partition_frontier_key_buffer);
     }
 
-    vertex_t const* edge_partition_frontier_major_first{nullptr};
-    vertex_t const* edge_partition_frontier_major_last{nullptr};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      edge_partition_frontier_major_first = edge_partition_frontier_key_first;
-      edge_partition_frontier_major_last  = edge_partition_frontier_key_last;
-    } else {
-      edge_partition_frontier_major_first =
-        thrust::get<0>(edge_partition_frontier_key_first.get_iterator_tuple());
-      edge_partition_frontier_major_last =
-        thrust::get<0>(edge_partition_frontier_key_last.get_iterator_tuple());
-    }
+    auto edge_partition_frontier_major_first =
+      thrust_tuple_get_or_identity<decltype(edge_partition_frontier_key_first), 0>(
+        edge_partition_frontier_key_first);
+    auto edge_partition_frontier_major_last =
+      thrust_tuple_get_or_identity<decltype(edge_partition_frontier_key_last), 0>(
+        edge_partition_frontier_key_last);
 
     auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
     auto max_pushes      = edge_partition.compute_number_of_edges(
diff --git a/cpp/src/prims/detail/partition_v_frontier.cuh b/cpp/src/prims/detail/partition_v_frontier.cuh
new file mode 100644
index 00000000000..018960d9a54
--- /dev/null
+++ b/cpp/src/prims/detail/partition_v_frontier.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_edge_property_device_view.cuh>
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/partition_manager.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/mask_utils.cuh>
+#include <cugraph/utilities/misc_utils.cuh>
+#include <cugraph/utilities/shuffle_comm.cuh>
+#include <cugraph/vertex_partition_device_view.cuh>
+
+#include <raft/random/rng.cuh>
+
+#include <cub/cub.cuh>
+#include <cuda/atomic>
+#include <cuda/functional>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/tabulate.h>
+#include <thrust/tuple.h>
+#include <thrust/unique.h>
+
+#include <optional>
+#include <tuple>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename ValueIterator>
+std::tuple<rmm::device_uvector<size_t> /* indices */,
+           std::vector<size_t> /* offsets (size = thresholds.size() + 2) */>
+partition_v_frontier(raft::handle_t const& handle,
+                     ValueIterator frontier_value_first,
+                     ValueIterator frontier_value_last,
+                     std::vector<typename thrust::iterator_traits<ValueIterator>::value_type> const&
+                       thresholds /* size = # partitions - 1, thresholds[i] marks the end
+                                     (exclusive) of the i'th partition value range */
+)
+{
+  rmm::device_uvector<size_t> indices(thrust::distance(frontier_value_first, frontier_value_last),
+                                      handle.get_stream());
+  thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0});
+  std::vector<size_t> v_frontier_partition_offsets(thresholds.size() + 2);
+  v_frontier_partition_offsets[0] = size_t{0};
+  v_frontier_partition_offsets.back() =
+    static_cast<size_t>(thrust::distance(frontier_value_first, frontier_value_last));
+
+  auto index_first = indices.begin();
+  auto index_last  = indices.end();
+  for (size_t i = 0; i < thresholds.size(); ++i) {
+    auto false_first =
+      thrust::partition(handle.get_thrust_policy(),
+                        index_first,
+                        index_last,
+                        [frontier_value_first, threshold = thresholds[i]] __device__(size_t idx) {
+                          return *(frontier_value_first + idx) < threshold;
+                        });
+    v_frontier_partition_offsets[1 + i] =
+      v_frontier_partition_offsets[i] + thrust::distance(index_first, false_first);
+    index_first = false_first;
+  }
+
+  return std::make_tuple(std::move(indices), std::move(v_frontier_partition_offsets));
+}
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh
index d142aed1051..f426cd993ea 100644
--- a/cpp/src/prims/detail/prim_functors.cuh
+++ b/cpp/src/prims/detail/prim_functors.cuh
@@ -89,12 +89,7 @@ struct call_e_op_with_key_t {
   __device__ auto operator()(
     key_t key, typename GraphViewType::edge_type i /* index in edge_partition's edge list */) const
   {
-    typename GraphViewType::vertex_type major{};
-    if constexpr (std::is_same_v<key_t, typename GraphViewType::vertex_type>) {
-      major = key;
-    } else {
-      major = thrust::get<0>(key);
-    }
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
     auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     auto minor        = *(edge_partition.indices() + i);
     auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh
new file mode 100644
index 00000000000..ba6f7dea040
--- /dev/null
+++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh
@@ -0,0 +1,2441 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/detail/partition_v_frontier.cuh"
+#include "prims/detail/transform_v_frontier_e.cuh"
+#include "prims/property_op_utils.cuh"
+
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_edge_property_device_view.cuh>
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/partition_manager.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/mask_utils.cuh>
+#include <cugraph/utilities/misc_utils.cuh>
+#include <cugraph/utilities/shuffle_comm.cuh>
+#include <cugraph/vertex_partition_device_view.cuh>
+
+#include <raft/random/rng.cuh>
+#ifndef NO_CUGRAPH_OPS
+#include <cugraph-ops/graph/sampling.hpp>
+#endif
+
+#include <cub/cub.cuh>
+#include <cuda/atomic>
+#include <cuda/functional>
+#include <thrust/adjacent_difference.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/tabulate.h>
+#include <thrust/tuple.h>
+#include <thrust/unique.h>
+
+#include <optional>
+#include <tuple>
+
+namespace cugraph {
+
+namespace detail {
+
+int32_t constexpr sample_and_compute_local_nbr_indices_block_size = 256;  // CUDA block size
+// local degree thresholds used to pick how valid local neighbor counts are computed per vertex
+size_t constexpr compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold =
+  packed_bools_per_word() *
+  size_t{4} /* tuning parameter */;  // minimum local degree to compute inclusive sums of valid
+                                     // local neighbors per word to accelerate finding the n'th
+                                     // valid local neighbor
+size_t constexpr compute_valid_local_nbr_count_inclusive_sum_mid_local_degree_threshold =
+  packed_bools_per_word() * static_cast<size_t>(raft::warp_size()) *
+  size_t{4} /* tuning parameter */;  // minimum local degree to use a CUDA warp per vertex
+size_t constexpr compute_valid_local_nbr_count_inclusive_sum_high_local_degree_threshold =
+  packed_bools_per_word() * static_cast<size_t>(sample_and_compute_local_nbr_indices_block_size) *
+  size_t{4} /* tuning parameter */;  // minimum local degree to use a CUDA block per vertex
+
+// Per frontier entry i, scans the minor_comm_size local values gathered for that entry (one per
+// gpu in the same minor_comm): writes their exclusive prefix sums to
+// partitioned_local_value_displacements and their total to global_values. The output is later used
+// to convert global values (neighbor index, random number) to (local value, minor_comm_rank) pairs.
+template <typename value_t>
+struct compute_local_value_displacements_and_global_value_t {
+  raft::device_span<value_t const> gathered_local_values{};
+  raft::device_span<value_t>
+    partitioned_local_value_displacements{};  // one partition per gpu in the same minor_comm
+  raft::device_span<value_t> global_values{};
+  int minor_comm_size{};
+
+  __device__ void operator()(size_t i) const
+  {
+    constexpr int chunk_capacity = 8;  // tuning parameter
+    value_t staged[chunk_capacity];    // displacements of the ranks covered by the current chunk
+    value_t running_total{0};
+    auto const num_entries = global_values.size();
+    auto output_first      = partitioned_local_value_displacements.begin() + i * minor_comm_size;
+    for (int rank_first = 0; rank_first < minor_comm_size; rank_first += chunk_capacity) {
+      auto chunk_size = std::min(chunk_capacity, minor_comm_size - rank_first);
+      for (int k = 0; k < chunk_size; ++k) {
+        staged[k] = running_total;
+        running_total += gathered_local_values[i + (rank_first + k) * num_entries];
+      }
+      thrust::copy(thrust::seq, staged, staged + chunk_size, output_first + rank_first);
+    }
+    global_values[i] = running_total;
+  }
+};
+
+// Map a (neighbor value, key index) pair to a (minor_comm_rank, intra-partition offset, local
+// neighbor value, key index) quadruplet; an invalid neighbor value yields minor_comm_rank == -1
+// and is passed through unchanged.
+template <typename value_t>
+struct convert_pair_to_quadruplet_t {
+  raft::device_span<value_t const>
+    partitioned_local_value_displacements{};  // one partition per gpu in the same minor_comm
+  raft::device_span<size_t> tx_counts{};
+  size_t stride{};
+  int minor_comm_size{};
+  value_t invalid_value{};
+
+  __device__ thrust::tuple<int, size_t, value_t, size_t> operator()(
+    thrust::tuple<value_t, size_t> pair) const
+  {
+    auto const global_value = thrust::get<0>(pair);
+    auto const key_idx      = thrust::get<1>(pair);
+    if (global_value == invalid_value) {  // nothing to locate, no tx_counts entry is touched
+      return thrust::make_tuple(int{-1}, size_t{0}, global_value, key_idx);
+    }
+
+    // the owning rank is the last one whose displacement does not exceed the global value
+    // (upper_bound relies on the displacements of a key being non-decreasing)
+    auto const key_displacement_first =
+      partitioned_local_value_displacements.begin() + key_idx * minor_comm_size;
+    auto const key_displacement_last = key_displacement_first + minor_comm_size;
+    auto const upper =
+      thrust::upper_bound(thrust::seq, key_displacement_first, key_displacement_last, global_value);
+    auto const rank = static_cast<int>(thrust::distance(key_displacement_first, upper)) - 1;
+    auto const local_value = static_cast<value_t>(global_value - *(key_displacement_first + rank));
+    // atomically take the next intra-partition offset of the owning rank
+    cuda::atomic_ref<size_t, cuda::thread_scope_device> tx_counter(tx_counts[rank]);
+    auto const slot = tx_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed);
+    return thrust::make_tuple(rank, slot, local_value, key_idx);
+  }
+};
+
+struct shuffle_index_compute_offset_t {
+  raft::device_span<int const> minor_comm_ranks{};  // rank per element (-1 is rejected below)
+  raft::device_span<size_t const> intra_partition_displacements{};  // per-element offset
+  raft::device_span<size_t const> tx_displacements{};               // per-rank base offset
+
+  __device__ size_t operator()(size_t i) const
+  {
+    auto const minor_comm_rank = minor_comm_ranks[i];
+    assert(minor_comm_rank != -1);
+    return intra_partition_displacements[i] + tx_displacements[minor_comm_rank];
+  }
+};
+
+// converts a neighbor index that counts only unmasked edges to an index that ignores the edge mask
+template <typename GraphViewType, typename EdgePartitionEdgeMaskWrapper, typename VertexIterator>
+struct find_nth_valid_nbr_idx_t {
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+
+  edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu> edge_partition{};
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask;
+  VertexIterator major_first{};
+  thrust::tuple<raft::device_span<size_t const>, raft::device_span<edge_t const>>
+    major_valid_local_nbr_count_inclusive_sums{};  // (per-major offsets, per-word inclusive sums)
+  // pair: (neighbor index counting unmasked edges only, index of the major relative to major_first)
+  __device__ edge_t operator()(thrust::tuple<edge_t, size_t> pair) const
+  {
+    edge_t local_nbr_idx = thrust::get<0>(pair);
+    size_t major_idx     = thrust::get<1>(pair);
+    auto major           = *(major_first + major_idx);
+    auto major_offset    = edge_partition.major_offset_from_major_nocheck(major);
+    vertex_t const* indices{nullptr};
+    edge_t edge_offset{0};
+    [[maybe_unused]] edge_t local_degree{0};
+    if constexpr (GraphViewType::is_multi_gpu) {
+      auto major_hypersparse_first = edge_partition.major_hypersparse_first();
+      if (major_hypersparse_first && (major >= *major_hypersparse_first)) {
+        auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
+        if (major_hypersparse_idx) {  // otherwise the defaults above (local_degree 0) are kept
+          thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(
+            edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) +
+            *major_hypersparse_idx);
+        }
+      } else {
+        thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+      }
+    } else {
+      thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+    }
+    // below the threshold: search the mask bits directly; otherwise pick the word via the sums first
+    if (local_degree < compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold) {
+      local_nbr_idx = find_nth_set_bits(
+        (*edge_partition_e_mask).value_first(), edge_offset, local_degree, local_nbr_idx + 1);
+    } else {
+      auto inclusive_sum_first = thrust::get<1>(major_valid_local_nbr_count_inclusive_sums).begin();
+      auto start_offset = thrust::get<0>(major_valid_local_nbr_count_inclusive_sums)[major_idx];
+      auto end_offset   = thrust::get<0>(major_valid_local_nbr_count_inclusive_sums)[major_idx + 1];
+      auto word_idx =  // position of the first inclusive sum greater than local_nbr_idx
+        static_cast<edge_t>(thrust::distance(inclusive_sum_first + start_offset,
+                                             thrust::upper_bound(thrust::seq,
+                                                                 inclusive_sum_first + start_offset,
+                                                                 inclusive_sum_first + end_offset,
+                                                                 local_nbr_idx)));
+      local_nbr_idx =  // bits skipped in earlier words + result of the search started at word_idx
+        word_idx * packed_bools_per_word() +
+        find_nth_set_bits(
+          (*edge_partition_e_mask).value_first(),
+          edge_offset + word_idx * packed_bools_per_word(),
+          local_degree - word_idx * packed_bools_per_word(),
+          (local_nbr_idx + 1) -
+            ((word_idx > 0) ? *(inclusive_sum_first + start_offset + word_idx - 1) : edge_t{0}));
+    }
+    return local_nbr_idx;
+  }
+};
+
+// One warp per frontier vertex listed in frontier_indices: for each packed edge-mask word of the
+// vertex's local edges, store the running (inclusive) count of set bits in inclusive_sums.
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+__global__ static void compute_valid_local_nbr_count_inclusive_sums_mid_local_degree(
+  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
+  edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
+  vertex_t const* edge_partition_frontier_major_first,
+  raft::device_span<size_t const> inclusive_sum_offsets,
+  raft::device_span<size_t const> frontier_indices,
+  raft::device_span<edge_t> inclusive_sums)
+{
+  static_assert(sample_and_compute_local_nbr_indices_block_size % raft::warp_size() == 0);
+  auto const tid     = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const lane_id = tid % raft::warp_size();
+  auto idx = static_cast<size_t>(tid / raft::warp_size());  // frontier entry handled by this warp
+
+  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
+  __shared__ typename WarpScan::TempStorage temp_storage;
+
+  while (idx < frontier_indices.size()) {
+    auto frontier_idx = frontier_indices[idx];
+    auto major        = *(edge_partition_frontier_major_first + frontier_idx);
+    vertex_t major_idx{};
+    if constexpr (multi_gpu) {
+      major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
+    } else {
+      major_idx = edge_partition.major_offset_from_major_nocheck(major);
+    }
+    auto edge_offset  = edge_partition.local_offset(major_idx);
+    auto local_degree = edge_partition.local_degree(major_idx);
+    auto start_offset       = inclusive_sum_offsets[frontier_idx];
+    auto end_offset         = inclusive_sum_offsets[frontier_idx + 1];
+    auto num_inclusive_sums = end_offset - start_offset;
+    auto rounded_up_num_inclusive_sums =
+      ((num_inclusive_sums + raft::warp_size() - 1) / raft::warp_size()) * raft::warp_size();
+    edge_t sum{0};  // total of all previous rounds (same value on every lane)
+    // '<' (not '<='): all lanes must run the same number of rounds for the warp scan & shuffle
+    for (size_t j = lane_id; j < rounded_up_num_inclusive_sums; j += raft::warp_size()) {
+      auto inc =
+        (j < num_inclusive_sums)
+          ? static_cast<edge_t>(count_set_bits(
+              edge_partition_e_mask.value_first(),
+              edge_offset + packed_bools_per_word() * j,
+              cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j)))
+          : edge_t{0};
+      WarpScan(temp_storage).InclusiveSum(inc, inc);
+      if (j < num_inclusive_sums) { inclusive_sums[start_offset + j] = sum + inc; }  // skip padding
+      sum += __shfl_sync(raft::warp_full_mask(), inc, raft::warp_size() - 1);
+    }
+
+    idx += gridDim.x * (blockDim.x / raft::warp_size());
+  }
+}
+
+// One thread block per frontier vertex listed in frontier_indices: for each packed edge-mask word
+// of the vertex's local edges, store the running (inclusive) count of set bits in inclusive_sums.
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+__global__ static void compute_valid_local_nbr_count_inclusive_sums_high_local_degree(
+  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
+  edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
+  vertex_t const* edge_partition_frontier_major_first,
+  raft::device_span<size_t const> inclusive_sum_offsets,
+  raft::device_span<size_t const> frontier_indices,
+  raft::device_span<edge_t> inclusive_sums)
+{
+  static_assert(sample_and_compute_local_nbr_indices_block_size % raft::warp_size() == 0);
+  auto idx = static_cast<size_t>(blockIdx.x);  // frontier entry handled by this block
+  using BlockScan = cub::BlockScan<edge_t, sample_and_compute_local_nbr_indices_block_size>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  __shared__ edge_t sum;  // total of all previous rounds, written by the last thread only
+
+  while (idx < frontier_indices.size()) {
+    auto frontier_idx = frontier_indices[idx];
+    auto major        = *(edge_partition_frontier_major_first + frontier_idx);
+    vertex_t major_idx{};
+    if constexpr (multi_gpu) {
+      major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
+    } else {
+      major_idx = edge_partition.major_offset_from_major_nocheck(major);
+    }
+    auto edge_offset  = edge_partition.local_offset(major_idx);
+    auto local_degree = edge_partition.local_degree(major_idx);
+    auto start_offset       = inclusive_sum_offsets[frontier_idx];
+    auto end_offset         = inclusive_sum_offsets[frontier_idx + 1];
+    auto num_inclusive_sums = end_offset - start_offset;
+    auto rounded_up_num_inclusive_sums =
+      ((num_inclusive_sums + sample_and_compute_local_nbr_indices_block_size - 1) /
+       sample_and_compute_local_nbr_indices_block_size) *
+      sample_and_compute_local_nbr_indices_block_size;
+    if (threadIdx.x == sample_and_compute_local_nbr_indices_block_size - 1) { sum = 0; }
+    // '<' (not '<='): all threads must run the same number of rounds for BlockScan & __syncthreads
+    for (size_t j = threadIdx.x; j < rounded_up_num_inclusive_sums; j += blockDim.x) {
+      auto inc =
+        (j < num_inclusive_sums)
+          ? static_cast<edge_t>(count_set_bits(
+              edge_partition_e_mask.value_first(),
+              edge_offset + packed_bools_per_word() * j,
+              cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j)))
+          : edge_t{0};
+      BlockScan(temp_storage).InclusiveSum(inc, inc);
+      if (j < num_inclusive_sums) { inclusive_sums[start_offset + j] = sum + inc; }  // skip padding
+      __syncthreads();  // everyone has read sum (and temp_storage) before the update below
+      if (threadIdx.x == sample_and_compute_local_nbr_indices_block_size - 1) { sum += inc; }
+    }
+
+    idx += gridDim.x;
+  }
+}
+
+// Exchanges the local value sums within minor_comm (shuffle_values) and reduces what this rank
+// receives: for each of the local_frontier_sizes[minor_comm_rank] frontier entries it returns
+// the sum over all ranks (first output) and the per-rank exclusive prefix sums, minor_comm_size
+// values per entry (second output).
+// local_frontier_displacements is not used by this function.
+template <typename value_t>
+std::tuple<rmm::device_uvector<value_t>, rmm::device_uvector<value_t>>
+compute_frontier_value_sums_and_partitioned_local_value_sum_displacements(
+  raft::handle_t const& handle,
+  raft::device_span<value_t const> aggregate_local_frontier_local_value_sums,
+  std::vector<size_t> const& local_frontier_displacements,
+  std::vector<size_t> const& local_frontier_sizes)
+{
+  auto const stream    = handle.get_stream();
+  auto& minor_comm     = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+  auto const num_ranks = minor_comm.get_size();
+  auto const num_local = local_frontier_sizes[minor_comm.get_rank()];
+
+  // keep only the received values; the second element of the returned tuple is discarded
+  auto gathered_sums = std::get<0>(shuffle_values(
+    minor_comm, aggregate_local_frontier_local_value_sums.begin(), local_frontier_sizes, stream));
+
+  // outputs: one sum per entry and num_ranks displacements per entry
+  rmm::device_uvector<value_t> value_sums(num_local, stream);
+  rmm::device_uvector<value_t> displacements(num_local * num_ranks, stream);
+
+  // the functor is invoked once per frontier entry
+  compute_local_value_displacements_and_global_value_t<value_t> reduce_op{
+    raft::device_span<value_t const>(gathered_sums.data(), gathered_sums.size()),
+    raft::device_span<value_t>(displacements.data(), displacements.size()),
+    raft::device_span<value_t>(value_sums.data(), value_sums.size()),
+    num_ranks};
+  thrust::for_each(handle.get_thrust_policy(),
+                   thrust::make_counting_iterator(size_t{0}),
+                   thrust::make_counting_iterator(value_sums.size()),
+                   reduce_op);
+
+  return std::make_tuple(std::move(value_sums), std::move(displacements));
+}
+
+template <typename GraphViewType, typename VertexIterator>
+std::vector<
+  std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<typename GraphViewType::edge_type>>>
+compute_valid_local_nbr_count_inclusive_sums(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  VertexIterator aggregate_local_frontier_major_first,
+  std::vector<size_t> const& local_frontier_displacements,
+  std::vector<size_t> const& local_frontier_sizes)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<VertexIterator>::value_type, vertex_t>);
+  // output: one (offsets, inclusive counts of set edge-mask bits) pair per local edge partition
+  auto edge_mask_view = graph_view.edge_mask_view();
+  // NOTE: the optional mask views are dereferenced unconditionally below (an edge mask must exist)
+  std::vector<std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<edge_t>>>
+    local_frontier_valid_local_nbr_count_inclusive_sums{};
+  local_frontier_valid_local_nbr_count_inclusive_sums.reserve(
+    graph_view.number_of_local_edge_partitions());
+
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+    // frontier majors of the i'th local edge partition
+    auto edge_partition_frontier_major_first =
+      aggregate_local_frontier_major_first + local_frontier_displacements[i];
+    // offsets: exclusive scan of ceil(local degree / packed_bools_per_word()) over the majors
+    auto edge_partition_local_degrees = edge_partition.compute_local_degrees(
+      edge_partition_frontier_major_first,
+      edge_partition_frontier_major_first + local_frontier_sizes[i],
+      handle.get_stream());
+    auto inclusive_sum_offsets =
+      rmm::device_uvector<size_t>(local_frontier_sizes[i] + 1, handle.get_stream());
+    inclusive_sum_offsets.set_element_to_zero_async(0, handle.get_stream());
+    auto size_first = thrust::make_transform_iterator(
+      edge_partition_local_degrees.begin(),
+      cuda::proclaim_return_type<size_t>([] __device__(edge_t local_degree) {
+        return static_cast<size_t>((local_degree + packed_bools_per_word() - 1) /
+                                   packed_bools_per_word());
+      }));
+    thrust::inclusive_scan(handle.get_thrust_policy(),
+                           size_first,
+                           size_first + edge_partition_local_degrees.size(),
+                           inclusive_sum_offsets.begin() + 1);
+    // split frontier indices by local degree; partition 0 (below the first threshold) gets no sums
+    auto [edge_partition_frontier_indices, frontier_partition_offsets] = partition_v_frontier(
+      handle,
+      edge_partition_local_degrees.begin(),
+      edge_partition_local_degrees.end(),
+      std::vector<edge_t>{
+        static_cast<edge_t>(compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold),
+        static_cast<edge_t>(compute_valid_local_nbr_count_inclusive_sum_mid_local_degree_threshold),
+        static_cast<edge_t>(
+          compute_valid_local_nbr_count_inclusive_sum_high_local_degree_threshold)});
+    // the last offset is the total number of sums to store
+    rmm::device_uvector<edge_t> inclusive_sums(
+      inclusive_sum_offsets.back_element(handle.get_stream()), handle.get_stream());
+    // partition 1: each major's sums are accumulated sequentially by one functor invocation
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      edge_partition_frontier_indices.begin() + frontier_partition_offsets[1],
+      edge_partition_frontier_indices.begin() + frontier_partition_offsets[2],
+      [edge_partition,
+       edge_partition_e_mask,
+       edge_partition_frontier_major_first,
+       inclusive_sum_offsets = raft::device_span<size_t const>(inclusive_sum_offsets.data(),
+                                                               inclusive_sum_offsets.size()),
+       inclusive_sums        = raft::device_span<edge_t>(inclusive_sums.data(),
+                                                  inclusive_sums.size())] __device__(size_t i) {
+        auto major = *(edge_partition_frontier_major_first + i);  // i: frontier index here
+        vertex_t major_idx{};
+        if constexpr (GraphViewType::is_multi_gpu) {
+          major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
+        } else {
+          major_idx = edge_partition.major_offset_from_major_nocheck(major);
+        }
+        auto edge_offset  = edge_partition.local_offset(major_idx);
+        auto local_degree = edge_partition.local_degree(major_idx);
+        edge_t sum{0};
+        auto start_offset = inclusive_sum_offsets[i];
+        auto end_offset   = inclusive_sum_offsets[i + 1];
+        for (size_t j = 0; j < end_offset - start_offset; ++j) {
+          sum += count_set_bits(
+            (*edge_partition_e_mask).value_first(),
+            edge_offset + packed_bools_per_word() * j,
+            cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j));
+          inclusive_sums[start_offset + j] = sum;
+        }
+      });
+    // partition 2: kernel that assigns one warp per major
+    auto mid_partition_size = frontier_partition_offsets[3] - frontier_partition_offsets[2];
+    if (mid_partition_size > 0) {
+      raft::grid_1d_warp_t update_grid(mid_partition_size,
+                                       sample_and_compute_local_nbr_indices_block_size,
+                                       handle.get_device_properties().maxGridSize[0]);
+      compute_valid_local_nbr_count_inclusive_sums_mid_local_degree<<<update_grid.num_blocks,
+                                                                      update_grid.block_size,
+                                                                      0,
+                                                                      handle.get_stream()>>>(
+        edge_partition,
+        *edge_partition_e_mask,
+        edge_partition_frontier_major_first,
+        raft::device_span<size_t const>(inclusive_sum_offsets.data(), inclusive_sum_offsets.size()),
+        raft::device_span<size_t const>(
+          edge_partition_frontier_indices.data() + frontier_partition_offsets[2],
+          frontier_partition_offsets[3] - frontier_partition_offsets[2]),
+        raft::device_span<edge_t>(inclusive_sums.data(), inclusive_sums.size()));
+    }
+    // partition 3: kernel that assigns one thread block per major
+    auto high_partition_size = frontier_partition_offsets[4] - frontier_partition_offsets[3];
+    if (high_partition_size > 0) {
+      raft::grid_1d_block_t update_grid(high_partition_size,
+                                        sample_and_compute_local_nbr_indices_block_size,
+                                        handle.get_device_properties().maxGridSize[0]);
+      compute_valid_local_nbr_count_inclusive_sums_high_local_degree<<<update_grid.num_blocks,
+                                                                       update_grid.block_size,
+                                                                       0,
+                                                                       handle.get_stream()>>>(
+        edge_partition,
+        *edge_partition_e_mask,
+        edge_partition_frontier_major_first,
+        raft::device_span<size_t const>(inclusive_sum_offsets.data(), inclusive_sum_offsets.size()),
+        raft::device_span<size_t const>(
+          edge_partition_frontier_indices.data() + frontier_partition_offsets[3],
+          frontier_partition_offsets[4] - frontier_partition_offsets[3]),
+        raft::device_span<edge_t>(inclusive_sums.data(), inclusive_sums.size()));
+    }
+
+    local_frontier_valid_local_nbr_count_inclusive_sums.push_back(
+      std::make_tuple(std::move(inclusive_sum_offsets), std::move(inclusive_sums)));
+  }
+
+  return local_frontier_valid_local_nbr_count_inclusive_sums;
+}
+
+template <typename edge_t>
+rmm::device_uvector<edge_t> compute_uniform_sampling_index_without_replacement(
+  raft::handle_t const& handle,
+  rmm::device_uvector<edge_t>&& frontier_degrees,
+  raft::random::RngState& rng_state,
+  size_t K)
+{
+#ifndef NO_CUGRAPH_OPS
+  edge_t mid_partition_degree_range_last = static_cast<edge_t>(K * 10);  // tuning parameter
+  assert(mid_partition_degree_range_last > K);
+  size_t high_partition_oversampling_K = K * 2;  // tuning parameter
+  assert(high_partition_oversampling_K > K);
+
+  auto [frontier_indices, frontier_partition_offsets] = partition_v_frontier(
+    handle,
+    frontier_degrees.begin(),
+    frontier_degrees.end(),
+    std::vector<edge_t>{static_cast<edge_t>(K + 1), mid_partition_degree_range_last + 1});
+
+  rmm::device_uvector<edge_t> nbr_indices(frontier_degrees.size() * K, handle.get_stream());
+
+  auto low_partition_size = frontier_partition_offsets[1];
+  if (low_partition_size > 0) {
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_counting_iterator(size_t{0}),
+      thrust::make_counting_iterator(low_partition_size * K),
+      [K,
+       frontier_indices =
+         raft::device_span<size_t const>(frontier_indices.data(), low_partition_size),
+       frontier_degrees =
+         raft::device_span<edge_t const>(frontier_degrees.data(), frontier_degrees.size()),
+       nbr_indices = raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+       invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+        auto frontier_idx = frontier_indices[i / K];
+        auto degree       = frontier_degrees[frontier_idx];
+        auto sample_idx   = static_cast<edge_t>(i % K);
+        nbr_indices[frontier_idx * K + sample_idx] =
+          (sample_idx < degree) ? sample_idx : invalid_idx;
+      });
+  }
+
+  auto mid_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1];
+  if (mid_partition_size > 0) {
+    // FIXME: tmp_degrees & tmp_nbr_indices can be avoided if we customize
+    // cugraph::ops::get_sampling_index
+    rmm::device_uvector<edge_t> tmp_degrees(mid_partition_size, handle.get_stream());
+    rmm::device_uvector<edge_t> tmp_nbr_indices(mid_partition_size * K, handle.get_stream());
+    thrust::gather(handle.get_thrust_policy(),
+                   frontier_indices.begin() + frontier_partition_offsets[1],
+                   frontier_indices.begin() + frontier_partition_offsets[2],
+                   frontier_degrees.begin(),
+                   tmp_degrees.begin());
+    cugraph::ops::graph::get_sampling_index(tmp_nbr_indices.data(),
+                                            rng_state,
+                                            tmp_degrees.data(),
+                                            mid_partition_size,
+                                            static_cast<int32_t>(K),
+                                            false,
+                                            handle.get_stream());
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_counting_iterator(size_t{0}),
+      thrust::make_counting_iterator(mid_partition_size * K),
+      [K,
+       frontier_indices = raft::device_span<size_t const>(
+         frontier_indices.data() + frontier_partition_offsets[1], mid_partition_size),
+       tmp_nbr_indices = tmp_nbr_indices.data(),
+       nbr_indices     = nbr_indices.data()] __device__(size_t i) {
+        auto frontier_idx                          = frontier_indices[i / K];
+        auto sample_idx                            = static_cast<edge_t>(i % K);
+        nbr_indices[frontier_idx * K + sample_idx] = tmp_nbr_indices[i];
+      });
+  }
+
+  auto high_partition_size = frontier_partition_offsets[3] - frontier_partition_offsets[2];
+  if (high_partition_size > 0) {
+    // to limit memory footprint ((1 << 20) is a tuning parameter), std::max for forward progress
+    // guarantee when high_partition_oversampling_K is exorbitantly large
+    auto seeds_to_sort_per_iteration =
+      std::max(static_cast<size_t>(handle.get_device_properties().multiProcessorCount * (1 << 20)) /
+                 high_partition_oversampling_K,
+               size_t{1});
+
+    rmm::device_uvector<edge_t> tmp_nbr_indices(
+      seeds_to_sort_per_iteration * high_partition_oversampling_K, handle.get_stream());
+    assert(high_partition_oversampling_K * 2 <=
+           static_cast<size_t>(std::numeric_limits<int32_t>::max()));
+    rmm::device_uvector<int32_t> tmp_sample_indices(
+      tmp_nbr_indices.size(),
+      handle.get_stream());  // sample indices ([0, high_partition_oversampling_K)) within a segment
+                             // (one segment per seed)
+
+    rmm::device_uvector<edge_t> segment_sorted_tmp_nbr_indices(tmp_nbr_indices.size(),
+                                                               handle.get_stream());
+    rmm::device_uvector<int32_t> segment_sorted_tmp_sample_indices(tmp_nbr_indices.size(),
+                                                                   handle.get_stream());
+
+    rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
+    size_t tmp_storage_bytes{0};
+
+    auto num_chunks =
+      (high_partition_size + seeds_to_sort_per_iteration - 1) / seeds_to_sort_per_iteration;
+    for (size_t i = 0; i < num_chunks; ++i) {
+      size_t num_segments = std::min(seeds_to_sort_per_iteration,
+                                     high_partition_size - seeds_to_sort_per_iteration * i);
+
+      rmm::device_uvector<edge_t> unique_counts(num_segments, handle.get_stream());
+
+      std::optional<rmm::device_uvector<size_t>> retry_segment_indices{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> retry_degrees{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> retry_nbr_indices{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> retry_sample_indices{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> retry_segment_sorted_nbr_indices{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> retry_segment_sorted_sample_indices{std::nullopt};
+      while (true) {
+        auto segment_frontier_index_first = frontier_indices.begin() +
+                                            frontier_partition_offsets[2] +
+                                            seeds_to_sort_per_iteration * i;
+        auto segment_frontier_degree_first = thrust::make_transform_iterator(
+          segment_frontier_index_first,
+          indirection_t<size_t, decltype(frontier_degrees.begin())>{frontier_degrees.begin()});
+
+        if (retry_segment_indices) {
+          retry_degrees =
+            rmm::device_uvector<edge_t>((*retry_segment_indices).size(), handle.get_stream());
+          thrust::gather(handle.get_thrust_policy(),
+                         (*retry_segment_indices).begin(),
+                         (*retry_segment_indices).end(),
+                         segment_frontier_degree_first,
+                         (*retry_degrees).begin());
+          retry_nbr_indices = rmm::device_uvector<edge_t>(
+            (*retry_segment_indices).size() * high_partition_oversampling_K, handle.get_stream());
+          retry_sample_indices =
+            rmm::device_uvector<int32_t>((*retry_nbr_indices).size(), handle.get_stream());
+          retry_segment_sorted_nbr_indices =
+            rmm::device_uvector<edge_t>((*retry_nbr_indices).size(), handle.get_stream());
+          retry_segment_sorted_sample_indices =
+            rmm::device_uvector<int32_t>((*retry_nbr_indices).size(), handle.get_stream());
+        }
+
+        if (retry_segment_indices) {
+          cugraph::ops::graph::get_sampling_index(
+            (*retry_nbr_indices).data(),
+            rng_state,
+            (*retry_degrees).begin(),
+            (*retry_degrees).size(),
+            static_cast<int32_t>(high_partition_oversampling_K),
+            true,
+            handle.get_stream());
+        } else {
+          // FIXME: this temporary is unnecessary if we update get_sampling_index to take a thrust
+          // iterator
+          rmm::device_uvector<edge_t> tmp_degrees(num_segments, handle.get_stream());
+          thrust::copy(handle.get_thrust_policy(),
+                       segment_frontier_degree_first,
+                       segment_frontier_degree_first + num_segments,
+                       tmp_degrees.begin());
+          cugraph::ops::graph::get_sampling_index(
+            tmp_nbr_indices.data(),
+            rng_state,
+            tmp_degrees.data(),
+            num_segments,
+            static_cast<int32_t>(high_partition_oversampling_K),
+            true,
+            handle.get_stream());
+        }
+
+        if (retry_segment_indices) {
+          thrust::for_each(
+            handle.get_thrust_policy(),
+            thrust::make_counting_iterator(size_t{0}),
+            thrust::make_counting_iterator((*retry_segment_indices).size() *
+                                           high_partition_oversampling_K),
+            [high_partition_oversampling_K,
+             unique_counts                  = unique_counts.data(),
+             segment_sorted_tmp_nbr_indices = segment_sorted_tmp_nbr_indices.data(),
+             retry_segment_indices          = (*retry_segment_indices).data(),
+             retry_nbr_indices              = (*retry_nbr_indices).data(),
+             retry_sample_indices           = (*retry_sample_indices).data()] __device__(size_t i) {
+              auto segment_idx  = retry_segment_indices[i / high_partition_oversampling_K];
+              auto sample_idx   = static_cast<edge_t>(i % high_partition_oversampling_K);
+              auto unique_count = unique_counts[segment_idx];
+              auto output_first = thrust::make_zip_iterator(
+                thrust::make_tuple(retry_nbr_indices, retry_sample_indices));
+              // sample index for the previously selected neighbor indices should be smaller than
+              // the new candidates to ensure that the previously selected neighbor indices will be
+              // selected again
+              if (sample_idx < unique_count) {
+                *(output_first + i) = thrust::make_tuple(
+                  segment_sorted_tmp_nbr_indices[segment_idx * high_partition_oversampling_K +
+                                                 sample_idx],
+                  static_cast<int32_t>(sample_idx));
+              } else {
+                *(output_first + i) =
+                  thrust::make_tuple(retry_nbr_indices[i],
+                                     high_partition_oversampling_K + (sample_idx - unique_count));
+              }
+            });
+        } else {
+          thrust::tabulate(
+            handle.get_thrust_policy(),
+            tmp_sample_indices.begin(),
+            tmp_sample_indices.begin() + num_segments * high_partition_oversampling_K,
+            [high_partition_oversampling_K] __device__(size_t i) {
+              return static_cast<int32_t>(i % high_partition_oversampling_K);
+            });
+        }
+
+        // sort the (sample neighbor index, sample index) pairs (key: sample neighbor index)
+
+        cub::DeviceSegmentedSort::SortPairs(
+          static_cast<void*>(nullptr),
+          tmp_storage_bytes,
+          retry_segment_indices ? (*retry_nbr_indices).data() : tmp_nbr_indices.data(),
+          retry_segment_indices ? (*retry_segment_sorted_nbr_indices).data()
+                                : segment_sorted_tmp_nbr_indices.data(),
+          retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(),
+          retry_segment_indices ? (*retry_segment_sorted_sample_indices).data()
+                                : segment_sorted_tmp_sample_indices.data(),
+          (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) *
+            high_partition_oversampling_K,
+          retry_segment_indices ? (*retry_segment_indices).size() : num_segments,
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                          multiplier_t<size_t>{high_partition_oversampling_K}),
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
+                                          multiplier_t<size_t>{high_partition_oversampling_K}),
+          handle.get_stream());
+        if (tmp_storage_bytes > d_tmp_storage.size()) {
+          d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
+        }
+        cub::DeviceSegmentedSort::SortPairs(
+          d_tmp_storage.data(),
+          tmp_storage_bytes,
+          retry_segment_indices ? (*retry_nbr_indices).data() : tmp_nbr_indices.data(),
+          retry_segment_indices ? (*retry_segment_sorted_nbr_indices).data()
+                                : segment_sorted_tmp_nbr_indices.data(),
+          retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(),
+          retry_segment_indices ? (*retry_segment_sorted_sample_indices).data()
+                                : segment_sorted_tmp_sample_indices.data(),
+          (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) *
+            high_partition_oversampling_K,
+          retry_segment_indices ? (*retry_segment_indices).size() : num_segments,
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                          multiplier_t<size_t>{high_partition_oversampling_K}),
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
+                                          multiplier_t<size_t>{high_partition_oversampling_K}),
+          handle.get_stream());
+
+        // count the number of unique neighbor indices
+
+        if (retry_segment_indices) {
+          thrust::for_each(
+            handle.get_thrust_policy(),
+            thrust::make_counting_iterator(size_t{0}),
+            thrust::make_counting_iterator((*retry_segment_indices).size()),
+            [high_partition_oversampling_K,
+             unique_counts                   = unique_counts.data(),
+             retry_segment_indices           = (*retry_segment_indices).data(),
+             retry_segment_sorted_pair_first = thrust::make_zip_iterator(
+               thrust::make_tuple((*retry_segment_sorted_nbr_indices).begin(),
+                                  (*retry_segment_sorted_sample_indices).begin())),
+             segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple(
+               segment_sorted_tmp_nbr_indices.begin(),
+               segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) {
+              auto unique_count          = static_cast<edge_t>(thrust::distance(
+                retry_segment_sorted_pair_first + high_partition_oversampling_K * i,
+                thrust::unique(
+                  thrust::seq,
+                  retry_segment_sorted_pair_first + high_partition_oversampling_K * i,
+                  retry_segment_sorted_pair_first + high_partition_oversampling_K * (i + 1),
+                  [] __device__(auto lhs, auto rhs) {
+                    return thrust::get<0>(lhs) == thrust::get<0>(rhs);
+                  })));
+              auto segment_idx           = retry_segment_indices[i];
+              unique_counts[segment_idx] = unique_count;
+              thrust::copy(
+                thrust::seq,
+                retry_segment_sorted_pair_first + high_partition_oversampling_K * i,
+                retry_segment_sorted_pair_first + high_partition_oversampling_K * i + unique_count,
+                segment_sorted_pair_first + high_partition_oversampling_K * segment_idx);
+            });
+        } else {
+          thrust::tabulate(
+            handle.get_thrust_policy(),
+            unique_counts.begin(),
+            unique_counts.end(),
+            [high_partition_oversampling_K,
+             segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple(
+               segment_sorted_tmp_nbr_indices.begin(),
+               segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) {
+              return static_cast<edge_t>(thrust::distance(
+                segment_sorted_pair_first + high_partition_oversampling_K * i,
+                thrust::unique(thrust::seq,
+                               segment_sorted_pair_first + high_partition_oversampling_K * i,
+                               segment_sorted_pair_first + high_partition_oversampling_K * (i + 1),
+                               [] __device__(auto lhs, auto rhs) {
+                                 return thrust::get<0>(lhs) == thrust::get<0>(rhs);
+                               })));
+            });
+        }
+
+        auto num_retry_segments =
+          thrust::count_if(handle.get_thrust_policy(),
+                           unique_counts.begin(),
+                           unique_counts.end(),
+                           [K] __device__(auto count) { return count < K; });
+        if (num_retry_segments > 0) {
+          retry_segment_indices =
+            rmm::device_uvector<size_t>(num_retry_segments, handle.get_stream());
+          thrust::copy_if(handle.get_thrust_policy(),
+                          thrust::make_counting_iterator(size_t{0}),
+                          thrust::make_counting_iterator(num_segments),
+                          (*retry_segment_indices).begin(),
+                          [K, unique_counts = unique_counts.data()] __device__(size_t i) {
+                            return unique_counts[i] < K;
+                          });
+        } else {
+          break;
+        }
+      }
+
+      // sort the segment-sorted (sample index, sample neighbor index) pairs (key: sample index)
+
+      cub::DeviceSegmentedSort::SortPairs(
+        static_cast<void*>(nullptr),
+        tmp_storage_bytes,
+        segment_sorted_tmp_sample_indices.data(),
+        tmp_sample_indices.data(),
+        segment_sorted_tmp_nbr_indices.data(),
+        tmp_nbr_indices.data(),
+        num_segments * high_partition_oversampling_K,
+        num_segments,
+        thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                        multiplier_t<size_t>{high_partition_oversampling_K}),
+        thrust::make_transform_iterator(
+          thrust::make_counting_iterator(size_t{0}),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_oversampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_oversampling_K + unique_counts[i]; })),
+        handle.get_stream());
+      if (tmp_storage_bytes > d_tmp_storage.size()) {
+        d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
+      }
+      cub::DeviceSegmentedSort::SortPairs(
+        d_tmp_storage.data(),
+        tmp_storage_bytes,
+        segment_sorted_tmp_sample_indices.data(),
+        tmp_sample_indices.data(),
+        segment_sorted_tmp_nbr_indices.data(),
+        tmp_nbr_indices.data(),
+        num_segments * high_partition_oversampling_K,
+        num_segments,
+        thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                        multiplier_t<size_t>{high_partition_oversampling_K}),
+        thrust::make_transform_iterator(
+          thrust::make_counting_iterator(size_t{0}),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_oversampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_oversampling_K + unique_counts[i]; })),
+        handle.get_stream());
+
+      // copy the neighbor indices back to nbr_indices
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(num_segments * K),
+        [K,
+         high_partition_oversampling_K,
+         frontier_indices = frontier_indices.begin() + frontier_partition_offsets[2] +
+                            seeds_to_sort_per_iteration * i,
+         tmp_nbr_indices = tmp_nbr_indices.data(),
+         nbr_indices     = nbr_indices.data()] __device__(size_t i) {
+          auto seed_idx   = *(frontier_indices + i / K);
+          auto sample_idx = static_cast<edge_t>(i % K);
+          *(nbr_indices + seed_idx * K + sample_idx) =
+            *(tmp_nbr_indices + (i / K) * high_partition_oversampling_K + sample_idx);
+        });
+    }
+  }
+
+  frontier_degrees.resize(0, handle.get_stream());
+  frontier_degrees.shrink_to_fit(handle.get_stream());
+
+  return nbr_indices;
+#else
+  CUGRAPH_FAIL("unimplemented.");
+#endif
+}
+
+template <typename edge_t, typename bias_t>
+void compute_biased_sampling_index_without_replacement(  // per segment, pick up to K neighbor indices using bias-derived random keys
+  raft::handle_t const& handle,
+  std::optional<raft::device_span<size_t const>>
+    input_frontier_indices,  // input_biases & input_degree_offsets
+                             // are already packed if std::nullopt
+  raft::device_span<size_t const> input_degree_offsets,  // segment boundaries within input_biases
+  raft::device_span<bias_t const> input_biases,  // bias 0 edges can't be selected
+  std::optional<raft::device_span<size_t const>>
+    output_frontier_indices,  // if set, segment s writes to slots starting at output_frontier_indices[s] * K; else at s * K
+  raft::device_span<edge_t> output_nbr_indices,  // K slots per segment; unfilled slots get INVALID_ID<edge_t>
+  std::optional<raft::device_span<bias_t>> output_keys,  // if set, receives the key of each selected neighbor (inf for unfilled slots)
+  raft::random::RngState& rng_state,
+  size_t K,  // number of output slots per segment
+  bool jump)  // true requests Algorithm A-ExpJ, which currently fails as unimplemented
+{
+  if (jump) {  // Algorithm A-ExpJ
+    CUGRAPH_FAIL(
+      "unimplemented.");  // FIXME: this could be faster especially for high-degree vertices
+  } else {                // Algorithm A-Res
+    // update packed input degree offsets if input_frontier_indices.has_value() is true
+
+    auto packed_input_degree_offsets =
+      input_frontier_indices ? std::make_optional<rmm::device_uvector<size_t>>(
+                                 (*input_frontier_indices).size() + 1, handle.get_stream())
+                             : std::nullopt;
+    if (packed_input_degree_offsets) {
+      (*packed_input_degree_offsets).set_element_to_zero_async(0, handle.get_stream());  // offsets[0] = 0
+      auto degree_first = thrust::make_transform_iterator(
+        (*input_frontier_indices).begin(),
+        cuda::proclaim_return_type<size_t>([input_degree_offsets] __device__(size_t i) {
+          return input_degree_offsets[i + 1] - input_degree_offsets[i];  // degree of frontier entry i
+        }));
+      thrust::inclusive_scan(handle.get_thrust_policy(),  // running degree sums become offsets[1..]
+                             degree_first,
+                             degree_first + (*input_frontier_indices).size(),
+                             (*packed_input_degree_offsets).begin() + 1);
+    }
+
+    // generate (key, nbr_index) pairs; one pair per bias in the selected segments
+
+    size_t num_pairs{};
+    raft::update_host(  // last offset == total number of (key, nbr_index) pairs
+      &num_pairs,
+      packed_input_degree_offsets
+        ? (*packed_input_degree_offsets).data() + ((*packed_input_degree_offsets).size() - 1)
+        : input_degree_offsets.data() + (input_degree_offsets.size() - 1),
+      1,
+      handle.get_stream());
+    handle.sync_stream();  // num_pairs is read on the host right below, so the copy must have completed
+    rmm::device_uvector<bias_t> keys(num_pairs, handle.get_stream());
+
+    cugraph::detail::uniform_random_fill(  // keys initially hold uniform random numbers r (bounds 0.0 and 1.0)
+      handle.get_stream(), keys.data(), keys.size(), bias_t{0.0}, bias_t{1.0}, rng_state);
+
+    if (input_frontier_indices) {
+      auto bias_first = thrust::make_transform_iterator(  // maps packed position i to its bias in the unpacked input_biases
+        thrust::make_counting_iterator(size_t{0}),
+        cuda::proclaim_return_type<bias_t>(
+          [input_biases,
+           input_degree_offsets,
+           frontier_indices            = *input_frontier_indices,
+           packed_input_degree_offsets = raft::device_span<size_t const>(
+             (*packed_input_degree_offsets).data(),
+             (*packed_input_degree_offsets).size())] __device__(size_t i) {
+            auto it           = thrust::upper_bound(thrust::seq,  // locate the packed segment containing i
+                                          packed_input_degree_offsets.begin() + 1,
+                                          packed_input_degree_offsets.end(),
+                                          i);
+            auto idx          = thrust::distance(packed_input_degree_offsets.begin() + 1, it);
+            auto frontier_idx = frontier_indices[idx];
+            return input_biases[input_degree_offsets[frontier_idx] +
+                                (i - packed_input_degree_offsets[idx])];  // same intra-segment position, original segment start
+          }));
+      thrust::transform(  // in place: r -> key
+        handle.get_thrust_policy(),
+        keys.begin(),
+        keys.end(),
+        bias_first,
+        keys.begin(),
+        cuda::proclaim_return_type<bias_t>([] __device__(bias_t r, bias_t b) {
+          return b > 0.0  // key = -log(r) / b, clamped to max() so every valid key stays below inf
+                   ? cuda::std::min(-log(r) / b, std::numeric_limits<bias_t>::max())
+                   : std::numeric_limits<
+                       bias_t>::infinity() /* inf used as invalid value (can't be selected) */;
+        }));
+    } else {
+      thrust::transform(handle.get_thrust_policy(),  // already packed: biases line up one-to-one with keys
+                        keys.begin(),
+                        keys.end(),
+                        input_biases.begin(),
+                        keys.begin(),
+                        cuda::proclaim_return_type<bias_t>([] __device__(bias_t r, bias_t b) {
+                          return b > 0.0  // same key formula as the unpacked branch above
+                                   ? cuda::std::min(-log(r) / b, std::numeric_limits<bias_t>::max())
+                                   : std::numeric_limits<bias_t>::infinity()
+                            /* inf used as invalid value (can't be selected) */;
+                        }));
+    }
+
+    rmm::device_uvector<edge_t> nbr_indices(keys.size(), handle.get_stream());
+    thrust::tabulate(  // nbr_indices[i] = position of pair i within its own segment
+      handle.get_thrust_policy(),
+      nbr_indices.begin(),
+      nbr_indices.end(),
+      [offsets = packed_input_degree_offsets
+                   ? raft::device_span<size_t const>((*packed_input_degree_offsets).data(),
+                                                     (*packed_input_degree_offsets).size())
+                   : input_degree_offsets] __device__(size_t i) {
+        auto it  = thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i);
+        auto idx = thrust::distance(offsets.begin() + 1, it);
+        return static_cast<edge_t>(i - offsets[idx]);
+      });
+
+    // pick top K for each frontier index (segmented sort by key, then take the first K of each segment)
+
+    rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
+    size_t tmp_storage_bytes{0};
+
+    rmm::device_uvector<bias_t> segment_sorted_keys(keys.size(), handle.get_stream());
+    rmm::device_uvector<edge_t> segment_sorted_nbr_indices(nbr_indices.size(), handle.get_stream());
+
+    cub::DeviceSegmentedSort::SortPairs(  // first call (null storage) only queries tmp_storage_bytes
+      static_cast<void*>(nullptr),
+      tmp_storage_bytes,
+      keys.data(),
+      segment_sorted_keys.data(),
+      nbr_indices.data(),
+      segment_sorted_nbr_indices.data(),
+      keys.size(),
+      input_frontier_indices ? (*input_frontier_indices).size() : (input_degree_offsets.size() - 1),
+      packed_input_degree_offsets ? (*packed_input_degree_offsets).begin()
+                                  : input_degree_offsets.begin(),
+      (packed_input_degree_offsets ? (*packed_input_degree_offsets).begin()
+                                   : input_degree_offsets.begin()) +
+        1,
+      handle.get_stream());
+    if (tmp_storage_bytes > d_tmp_storage.size()) {
+      d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
+    }
+    cub::DeviceSegmentedSort::SortPairs(  // second call performs the sort; segment s spans [offsets[s], offsets[s + 1])
+      d_tmp_storage.data(),
+      tmp_storage_bytes,
+      keys.data(),
+      segment_sorted_keys.data(),
+      nbr_indices.data(),
+      segment_sorted_nbr_indices.data(),
+      keys.size(),
+      input_frontier_indices ? (*input_frontier_indices).size() : input_degree_offsets.size() - 1,
+      packed_input_degree_offsets ? (*packed_input_degree_offsets).begin()
+                                  : input_degree_offsets.begin(),
+      (packed_input_degree_offsets ? (*packed_input_degree_offsets).begin()
+                                   : input_degree_offsets.begin()) +
+        1,
+      handle.get_stream());
+
+    if (output_frontier_indices) {
+      thrust::for_each(  // scatter: one work item per (segment, rank) with rank in [0, K)
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator((*output_frontier_indices).size() * K),
+        [input_degree_offsets =
+           packed_input_degree_offsets
+             ? raft::device_span<size_t const>((*packed_input_degree_offsets).data(),
+                                               (*packed_input_degree_offsets).size())
+             : input_degree_offsets,
+         output_frontier_indices = *output_frontier_indices,
+         output_keys,
+         output_nbr_indices,
+         segment_sorted_keys =
+           raft::device_span<bias_t const>(segment_sorted_keys.data(), segment_sorted_keys.size()),
+         segment_sorted_nbr_indices = raft::device_span<edge_t const>(
+           segment_sorted_nbr_indices.data(), segment_sorted_nbr_indices.size()),
+         K,
+         invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+          auto output_frontier_idx = output_frontier_indices[i / K];
+          auto output_idx          = output_frontier_idx * K + (i % K);
+          auto degree              = input_degree_offsets[i / K + 1] - input_degree_offsets[i / K];  // NOTE(review): presumably output_frontier_indices has one entry per input segment - confirm
+          auto input_idx           = input_degree_offsets[i / K] + (i % K);  // rank (i % K) within the sorted segment
+          if ((i % K < degree) &&  // slot is filled only if the segment has this many entries and the key is not the inf marker
+              (segment_sorted_keys[input_idx] < std::numeric_limits<bias_t>::infinity())) {
+            if (output_keys) { (*output_keys)[output_idx] = segment_sorted_keys[input_idx]; }
+            output_nbr_indices[output_idx] = segment_sorted_nbr_indices[input_idx];
+          } else {
+            if (output_keys) {
+              (*output_keys)[output_idx] = std::numeric_limits<bias_t>::infinity();
+            }
+            output_nbr_indices[output_idx] = invalid_idx;
+          }
+        });
+    } else {
+      thrust::for_each(  // dense output: slot i belongs to segment i / K, rank i % K
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(output_nbr_indices.size()),
+        [input_degree_offsets =
+           packed_input_degree_offsets
+             ? raft::device_span<size_t const>((*packed_input_degree_offsets).data(),
+                                               (*packed_input_degree_offsets).size())
+             : input_degree_offsets,
+         output_keys,
+         output_nbr_indices,
+         segment_sorted_keys =
+           raft::device_span<bias_t const>(segment_sorted_keys.data(), segment_sorted_keys.size()),
+         segment_sorted_nbr_indices = raft::device_span<edge_t const>(
+           segment_sorted_nbr_indices.data(), segment_sorted_nbr_indices.size()),
+         K,
+         invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+          auto degree    = input_degree_offsets[i / K + 1] - input_degree_offsets[i / K];
+          auto input_idx = input_degree_offsets[i / K] + (i % K);
+          if ((i % K < degree) &&  // same fill rule as the output_frontier_indices branch
+              (segment_sorted_keys[input_idx] < std::numeric_limits<bias_t>::infinity())) {
+            if (output_keys) { (*output_keys)[i] = segment_sorted_keys[input_idx]; }
+            output_nbr_indices[i] = segment_sorted_nbr_indices[input_idx];
+          } else {
+            if (output_keys) { (*output_keys)[i] = std::numeric_limits<bias_t>::infinity(); }
+            output_nbr_indices[i] = invalid_idx;
+          }
+        });
+    }
+  }
+
+  return;
+}
+
+template <typename GraphViewType, typename VertexIterator>
+rmm::device_uvector<typename GraphViewType::edge_type>
+compute_aggregate_local_frontier_local_degrees(  // local degree of every frontier major, concatenated over the local edge partitions
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  VertexIterator aggregate_local_frontier_major_first,  // frontier majors of all local edge partitions, back to back
+  std::vector<size_t> const& local_frontier_displacements,  // start of partition i's range in the aggregate arrays
+  std::vector<size_t> const& local_frontier_sizes)  // number of frontier majors for partition i
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<VertexIterator>::value_type, vertex_t>);
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
+  auto aggregate_local_frontier_local_degrees = rmm::device_uvector<edge_t>(  // sized to the end of the last partition's range
+    local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream());
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =  // per-partition view of the edge mask; nullopt if the graph view has no mask
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
+    auto edge_partition_frontier_major_first =
+      aggregate_local_frontier_major_first + local_frontier_displacements[i];
+    auto edge_partition_frontier_local_degrees =  // use the mask-aware degree computation only when a mask is present
+      !edge_partition_e_mask ? edge_partition.compute_local_degrees(
+                                 edge_partition_frontier_major_first,
+                                 edge_partition_frontier_major_first + local_frontier_sizes[i],
+                                 handle.get_stream())
+                             : edge_partition.compute_local_degrees_with_mask(
+                                 (*edge_partition_e_mask).value_first(),
+                                 edge_partition_frontier_major_first,
+                                 edge_partition_frontier_major_first + local_frontier_sizes[i],
+                                 handle.get_stream());
+
+    // FIXME: this copy is unnecessary if edge_partition.compute_local_degrees() takes a pointer
+    // to the output array
+    thrust::copy(handle.get_thrust_policy(),  // place partition i's degrees at its displacement in the aggregate array
+                 edge_partition_frontier_local_degrees.begin(),
+                 edge_partition_frontier_local_degrees.end(),
+                 aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i]);
+  }
+
+  return aggregate_local_frontier_local_degrees;
+}
+
+// return the (biases, local degree offsets) pair that transform_v_frontier_e() produces for the keys
+// in the aggregate local frontier (the biases are validated first if do_expensive_check is true)
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename EdgeBiasOp>
+std::tuple<rmm::device_uvector<
+             typename edge_op_result_type<typename thrust::iterator_traits<KeyIterator>::value_type,
+                                          typename GraphViewType::vertex_type,
+                                          typename EdgeSrcValueInputWrapper::value_type,
+                                          typename EdgeDstValueInputWrapper::value_type,
+                                          typename EdgeValueInputWrapper::value_type,
+                                          EdgeBiasOp>::type>,
+           rmm::device_uvector<size_t>>
+compute_aggregate_local_frontier_biases(raft::handle_t const& handle,
+                                        GraphViewType const& graph_view,
+                                        KeyIterator aggregate_local_frontier_key_first,
+                                        EdgeSrcValueInputWrapper edge_src_value_input,
+                                        EdgeDstValueInputWrapper edge_dst_value_input,
+                                        EdgeValueInputWrapper edge_value_input,
+                                        EdgeBiasOp e_bias_op,  // its result type defines bias_t
+                                        std::vector<size_t> const& local_frontier_displacements,
+                                        std::vector<size_t> const& local_frontier_sizes,
+                                        bool do_expensive_check)  // if true, range-check every bias and fail on invalid ones
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  using bias_t = typename edge_op_result_type<key_t,
+                                              vertex_t,
+                                              typename EdgeSrcValueInputWrapper::value_type,
+                                              typename EdgeDstValueInputWrapper::value_type,
+                                              typename EdgeValueInputWrapper::value_type,
+                                              EdgeBiasOp>::type;
+
+  auto [aggregate_local_frontier_biases, aggregate_local_frontier_local_degree_offsets] =
+    transform_v_frontier_e(handle,  // applies e_bias_op; both returned arrays are forwarded to the caller unchanged
+                           graph_view,
+                           aggregate_local_frontier_key_first,
+                           edge_src_value_input,
+                           edge_dst_value_input,
+                           edge_value_input,
+                           e_bias_op,
+                           local_frontier_displacements,
+                           local_frontier_sizes);
+
+  if (do_expensive_check) {
+    auto num_invalid_biases = thrust::count_if(  // presumably counts biases outside [0, max()] - see check_out_of_range_t
+      handle.get_thrust_policy(),
+      aggregate_local_frontier_biases.begin(),
+      aggregate_local_frontier_biases.end(),
+      check_out_of_range_t<bias_t>{bias_t{0.0}, std::numeric_limits<bias_t>::max()});
+    if constexpr (GraphViewType::is_multi_gpu) {
+      num_invalid_biases = host_scalar_allreduce(  // sum the counts over handle.get_comms() so every rank sees the same total
+        handle.get_comms(), num_invalid_biases, raft::comms::op_t::SUM, handle.get_stream());
+    }
+    CUGRAPH_EXPECTS(num_invalid_biases == 0,
+                    "invalid_input_argument: e_bias_op return values should be non-negative and "
+                    "should not exceed std::numeric_limits<bias_t>::max().");
+  }
+
+  return std::make_tuple(std::move(aggregate_local_frontier_biases),
+                         std::move(aggregate_local_frontier_local_degree_offsets));
+}
+
+// Route per-sample values to the GPUs of the minor communicator and report how the resulting
+// local value array is segmented.
+//
+// Elements are dropped only on the (minor_comm_size > 1) path: entries whose computed destination
+// rank is -1 are not scattered into the send buffer. Presumably these are the elements equal to
+// invalid_value -- confirm against convert_pair_to_quadruplet_t (defined outside this function).
+//
+// Parameters:
+//   sample_nbr_values   per-sample input values; consumed (moved from).
+//   frontier_partitioned_value_local_sum_displacements
+//                       dereferenced only when minor_comm_size > 1, so it must hold a value on
+//                       that path.
+//   local_frontier_displacements
+//                       not referenced in the body of this function.
+//   local_frontier_sizes
+//                       indexed with this GPU's minor_comm_rank (0 when multi_gpu is false).
+//   K                   divisor applied to the sample position and multiplier for the sample
+//                       count (presumably the number of samples per frontier key -- confirm
+//                       against the callers).
+//   invalid_value       forwarded to convert_pair_to_quadruplet_t.
+//
+// Returns a tuple of
+//   (0) the local values (the input unchanged when minor_comm_size == 1, the values received from
+//       the shuffle otherwise),
+//   (1) the key indices received together with the values (std::nullopt when
+//       minor_comm_size == 1),
+//   (2) offsets into (0): {0, local_frontier_sizes[minor_comm_rank] * K} when
+//       minor_comm_size == 1, otherwise 0 followed by the inclusive sums of the receive counts.
+template <typename value_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<value_t>,
+           std::optional<rmm::device_uvector<size_t>>,
+           std::vector<size_t>>
+shuffle_and_compute_local_nbr_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<value_t>&& sample_nbr_values,
+                                     std::optional<raft::device_span<value_t const>>
+                                       frontier_partitioned_value_local_sum_displacements,
+                                     std::vector<size_t> const& local_frontier_displacements,
+                                     std::vector<size_t> const& local_frontier_sizes,
+                                     size_t K,
+                                     value_t invalid_value)
+{
+  // defaults describe the single-GPU case; overwritten from the minor communicator if multi_gpu
+  int minor_comm_rank{0};
+  int minor_comm_size{1};
+  if constexpr (multi_gpu) {
+    auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    minor_comm_rank  = minor_comm.get_rank();
+    minor_comm_size  = minor_comm.get_size();
+  }
+
+  auto sample_local_nbr_values = std::move(
+    sample_nbr_values);  // neighbor value within an edge partition (note that each vertex's
+                         // neighbors are distributed in minor_comm_size partitions)
+  std::optional<rmm::device_uvector<size_t>> key_indices{
+    std::nullopt};  // relevant only when (minor_comm_size > 1)
+  std::vector<size_t> local_frontier_sample_offsets{};
+  if (minor_comm_size > 1) {
+    auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+
+    // per-sample scratch arrays filled by the transform below
+    key_indices = rmm::device_uvector<size_t>(sample_local_nbr_values.size(), handle.get_stream());
+    auto minor_comm_ranks =
+      rmm::device_uvector<int>(sample_local_nbr_values.size(), handle.get_stream());
+    auto intra_partition_displacements =
+      rmm::device_uvector<size_t>(sample_local_nbr_values.size(), handle.get_stream());
+    // one counter per destination rank, zero-initialized and handed to the functor as a mutable
+    // span (presumably incremented there once per sample routed to that rank -- confirm)
+    rmm::device_uvector<size_t> d_tx_counts(minor_comm_size, handle.get_stream());
+    thrust::fill(handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), size_t{0});
+    // input: (sample value, divider_t<size_t>{K} applied to the sample position)
+    auto input_pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_local_nbr_values.begin(),
+                         thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                                         divider_t<size_t>{K})));
+    // output: (destination rank, displacement within that destination, value, key index);
+    // note that the value array is overwritten in place
+    thrust::transform(
+      handle.get_thrust_policy(),
+      input_pair_first,
+      input_pair_first + sample_local_nbr_values.size(),
+      thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(),
+                                                   intra_partition_displacements.begin(),
+                                                   sample_local_nbr_values.begin(),
+                                                   (*key_indices).begin())),
+      convert_pair_to_quadruplet_t<value_t>{
+        raft::device_span<value_t const>(
+          (*frontier_partitioned_value_local_sum_displacements).data(),
+          (*frontier_partitioned_value_local_sum_displacements).size()),
+        raft::device_span<size_t>(d_tx_counts.data(), d_tx_counts.size()),
+        local_frontier_sizes[minor_comm_rank],
+        minor_comm_size,
+        invalid_value});
+    // start position of each destination rank's segment in the send buffer
+    rmm::device_uvector<size_t> tx_displacements(minor_comm_size, handle.get_stream());
+    thrust::exclusive_scan(
+      handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin());
+    // send buffer size = last displacement + last count, i.e. the sum of all d_tx_counts
+    auto tmp_sample_local_nbr_values =
+      rmm::device_uvector<value_t>(tx_displacements.back_element(handle.get_stream()) +
+                                     d_tx_counts.back_element(handle.get_stream()),
+                                   handle.get_stream());
+    auto tmp_key_indices =
+      rmm::device_uvector<size_t>(tmp_sample_local_nbr_values.size(), handle.get_stream());
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_local_nbr_values.begin(), (*key_indices).begin()));
+    // scatter (value, key index) pairs to the positions computed by
+    // shuffle_index_compute_offset_t; pairs whose destination rank is -1 are skipped
+    thrust::scatter_if(
+      handle.get_thrust_policy(),
+      pair_first,
+      pair_first + sample_local_nbr_values.size(),
+      thrust::make_transform_iterator(
+        thrust::make_counting_iterator(size_t{0}),
+        shuffle_index_compute_offset_t{
+          raft::device_span<int const>(minor_comm_ranks.data(), minor_comm_ranks.size()),
+          raft::device_span<size_t const>(intra_partition_displacements.data(),
+                                          intra_partition_displacements.size()),
+          raft::device_span<size_t const>(tx_displacements.data(), tx_displacements.size())}),
+      minor_comm_ranks.begin(),
+      thrust::make_zip_iterator(
+        thrust::make_tuple(tmp_sample_local_nbr_values.begin(), tmp_key_indices.begin())),
+      is_not_equal_t<int>{-1});
+
+    // replace the per-sample arrays with the compacted send buffers
+    sample_local_nbr_values = std::move(tmp_sample_local_nbr_values);
+    key_indices             = std::move(tmp_key_indices);
+
+    // copy the per-destination send counts to the host; the stream is synchronized before
+    // h_tx_counts is read
+    std::vector<size_t> h_tx_counts(d_tx_counts.size());
+    raft::update_host(
+      h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream());
+    handle.sync_stream();
+
+    // rebuild the zip iterator as the underlying buffers were replaced above
+    pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_local_nbr_values.begin(), (*key_indices).begin()));
+    auto [rx_value_buffer, rx_counts] =
+      shuffle_values(minor_comm, pair_first, h_tx_counts, handle.get_stream());
+
+    // keep the received buffers; offsets = {0, inclusive sums of the receive counts}
+    sample_local_nbr_values          = std::move(std::get<0>(rx_value_buffer));
+    key_indices                      = std::move(std::get<1>(rx_value_buffer));
+    local_frontier_sample_offsets    = std::vector<size_t>(rx_counts.size() + 1);
+    local_frontier_sample_offsets[0] = size_t{0};
+    std::inclusive_scan(
+      rx_counts.begin(), rx_counts.end(), local_frontier_sample_offsets.begin() + 1);
+  } else {
+    // nothing to shuffle: a single segment covering K entries per local frontier element
+    local_frontier_sample_offsets =
+      std::vector<size_t>{size_t{0}, local_frontier_sizes[minor_comm_rank] * K};
+  }
+
+  return std::make_tuple(std::move(sample_local_nbr_values),
+                         std::move(key_indices),
+                         std::move(local_frontier_sample_offsets));
+}
+
+// Rewrite, in place, every entry of local_nbr_indices through find_nth_valid_nbr_idx_t, one local
+// edge partition at a time, and return the updated array. The caller visible in this file uses
+// this to turn neighbor indices that count only unmasked edges into indices in the neighbor list
+// ignoring the edge mask.
+//
 // skip conversion if local neighbor index is cugraph::ops::graph::INVALID_ID<edge_t>
+//
+// Parameters:
+//   aggregate_local_frontier_major_first
+//                       iterator over the frontier majors of all local edge partitions; its
+//                       value_type must be vertex_t (enforced by the static_assert below).
+//   local_nbr_indices   indices to convert; consumed and returned.
+//   key_indices         if set, key_indices[i] is the frontier index paired with sample i;
+//                       otherwise i / K is used.
+//   local_frontier_sample_offsets
+//                       entries [i] and [i + 1] delimit the samples processed for local edge
+//                       partition i.
+//   local_frontier_displacements
+//                       entry [i] is the offset of local edge partition i's majors from
+//                       aggregate_local_frontier_major_first.
+//   local_frontier_sizes
+//                       forwarded to compute_valid_local_nbr_count_inclusive_sums.
 template <typename GraphViewType, typename VertexIterator>
 rmm::device_uvector<typename GraphViewType::edge_type> convert_to_unmasked_local_nbr_idx(
   raft::handle_t const& handle,
   GraphViewType const& graph_view,
   VertexIterator aggregate_local_frontier_major_first,
   rmm::device_uvector<typename GraphViewType::edge_type>&& local_nbr_indices,
   std::optional<raft::device_span<size_t const>> key_indices,
   std::vector<size_t> const& local_frontier_sample_offsets,
   std::vector<size_t> const& local_frontier_displacements,
   std::vector<size_t> const& local_frontier_sizes,
   size_t K)
 {
   using vertex_t = typename GraphViewType::vertex_type;
   using edge_t   = typename GraphViewType::edge_type;
   static_assert(
     std::is_same_v<vertex_t, typename thrust::iterator_traits<VertexIterator>::value_type>);
 
   auto edge_mask_view = graph_view.edge_mask_view();
 
   // to avoid searching the entire neighbor list K times for high degree vertices with edge masking
   auto local_frontier_valid_local_nbr_count_inclusive_sums =
     compute_valid_local_nbr_count_inclusive_sums(handle,
                                                  graph_view,
                                                  aggregate_local_frontier_major_first,
                                                  local_frontier_displacements,
                                                  local_frontier_sizes);
 
+  // maps a sample position to its frontier index: key_indices[i] if provided, i / K otherwise
+  // (the std::optional span is converted to a thrust::optional span for the device lambda)
   auto sample_major_idx_first = thrust::make_transform_iterator(
     thrust::make_counting_iterator(size_t{0}),
     cuda::proclaim_return_type<size_t>(
       [K,
        key_indices = key_indices ? thrust::make_optional<raft::device_span<size_t const>>(
                                      (*key_indices).data(), (*key_indices).size())
                                  : thrust::nullopt] __device__(size_t i) {
         return key_indices ? (*key_indices)[i] : i / K;
       }));
+  // (local neighbor index, frontier index) pairs fed to find_nth_valid_nbr_idx_t
   auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), sample_major_idx_first);
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    // device view of this partition's edge mask (nullopt if the graph view has no edge mask)
     auto edge_partition_e_mask =
       edge_mask_view
         ? thrust::make_optional<
             detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
             *edge_mask_view, i)
         : thrust::nullopt;
 
     auto edge_partition_frontier_major_first =
       aggregate_local_frontier_major_first + local_frontier_displacements[i];
+    // in-place conversion: local_nbr_indices serves both as the stencil and as the output, so
+    // entries equal to INVALID_ID<edge_t> fail the predicate and are left untouched
     thrust::transform_if(
       handle.get_thrust_policy(),
       pair_first + local_frontier_sample_offsets[i],
       pair_first + local_frontier_sample_offsets[i + 1],
       local_nbr_indices.begin() + local_frontier_sample_offsets[i],
       local_nbr_indices.begin() + local_frontier_sample_offsets[i],
       find_nth_valid_nbr_idx_t<GraphViewType,
                                decltype(edge_partition_e_mask),
                                decltype(edge_partition_frontier_major_first)>{
         edge_partition,
         edge_partition_e_mask,
         edge_partition_frontier_major_first,
+        // non-owning views of the two arrays precomputed above for partition i
         thrust::make_tuple(
           raft::device_span<size_t const>(
             std::get<0>(local_frontier_valid_local_nbr_count_inclusive_sums[i]).data(),
             std::get<0>(local_frontier_valid_local_nbr_count_inclusive_sums[i]).size()),
           raft::device_span<edge_t const>(
             std::get<1>(local_frontier_valid_local_nbr_count_inclusive_sums[i]).data(),
             std::get<1>(local_frontier_valid_local_nbr_count_inclusive_sums[i]).size()))},
       is_not_equal_t<edge_t>{cugraph::ops::graph::INVALID_ID<edge_t>});
   }
 
+  // local_nbr_indices is a named rvalue-reference parameter; std::move hands it to the caller
+  // instead of copying
   return std::move(local_nbr_indices);
 }
+
+// Uniformly sample K neighbor indices per frontier key and return them as local (per edge
+// partition) neighbor indices.
+//
+// Steps: (1) compute the degree of every frontier major, (2) draw neighbor indices with or
+// without replacement, (3) shuffle the indices with shuffle_and_compute_local_nbr_values, and
+// (4) if the graph view has an edge mask, convert the indices with
+// convert_to_unmasked_local_nbr_idx.
+//
+// Returns a tuple of (local neighbor indices, optional key indices, local frontier sample
+// offsets), i.e. the outputs of step (3) with the first element replaced by the result of step
+// (4) when an edge mask is attached.
+template <typename GraphViewType, typename KeyIterator>
+std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
+           std::optional<rmm::device_uvector<size_t>>,
+           std::vector<size_t>>
+uniform_sample_and_compute_local_nbr_indices(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  KeyIterator aggregate_local_frontier_key_first,
+  std::vector<size_t> const& local_frontier_displacements,
+  std::vector<size_t> const& local_frontier_sizes,
+  raft::random::RngState& rng_state,
+  size_t K,
+  bool with_replacement)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  // size of the minor communicator (stays 1 for a single-GPU graph view)
+  int minor_comm_size = 1;
+  if constexpr (GraphViewType::is_multi_gpu) {
+    minor_comm_size =
+      handle.get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size();
+  }
+  bool const spans_multiple_partitions = (minor_comm_size > 1);
+
+  // only the major (element 0 of a tuple key, or the key itself) is needed here
+  auto major_first =
+    thrust_tuple_get_or_identity<KeyIterator, 0>(aggregate_local_frontier_key_first);
+
+  auto mask_view = graph_view.edge_mask_view();
+
+  // 1. compute degrees
+
+  rmm::device_uvector<edge_t> degrees(0, handle.get_stream());
+  std::optional<rmm::device_uvector<edge_t>> partitioned_degree_displacements{std::nullopt};
+  {
+    auto local_degrees = compute_aggregate_local_frontier_local_degrees(
+      handle, graph_view, major_first, local_frontier_displacements, local_frontier_sizes);
+
+    if (!spans_multiple_partitions) {
+      // the local degrees are used directly as the degrees
+      degrees = std::move(local_degrees);
+    } else {
+      std::tie(degrees, partitioned_degree_displacements) =
+        compute_frontier_value_sums_and_partitioned_local_value_sum_displacements(
+          handle,
+          raft::device_span<edge_t const>(local_degrees.data(), local_degrees.size()),
+          local_frontier_displacements,
+          local_frontier_sizes);
+      // the local degrees are no longer needed; release the memory right away
+      local_degrees.resize(0, handle.get_stream());
+      local_degrees.shrink_to_fit(handle.get_stream());
+    }
+  }
+
+  // 2. sample neighbor indices
+
+  rmm::device_uvector<edge_t> sampled_indices(0, handle.get_stream());
+
+  if (!with_replacement) {
+    sampled_indices = compute_uniform_sampling_index_without_replacement(
+      handle, std::move(degrees), rng_state, K);
+  } else if (degrees.size() > 0) {
+    sampled_indices.resize(degrees.size() * K, handle.get_stream());
+    cugraph::ops::graph::get_sampling_index(sampled_indices.data(),
+                                            rng_state,
+                                            degrees.data(),
+                                            static_cast<edge_t>(degrees.size()),
+                                            static_cast<int32_t>(K),
+                                            with_replacement,
+                                            handle.get_stream());
+    degrees.resize(0, handle.get_stream());
+    degrees.shrink_to_fit(handle.get_stream());
+  }
+
+  // 3. shuffle neighbor indices
+
+  // non-owning view of the displacements (empty optional if they were not computed)
+  std::optional<raft::device_span<edge_t const>> displacement_span{std::nullopt};
+  if (partitioned_degree_displacements) {
+    displacement_span = raft::device_span<edge_t const>(
+      partitioned_degree_displacements->data(), partitioned_degree_displacements->size());
+  }
+
+  rmm::device_uvector<edge_t> local_indices(0, handle.get_stream());
+  std::optional<rmm::device_uvector<size_t>> sample_key_indices{std::nullopt};
+  std::vector<size_t> sample_offsets{};
+  std::tie(local_indices, sample_key_indices, sample_offsets) =
+    shuffle_and_compute_local_nbr_values<edge_t, GraphViewType::is_multi_gpu>(
+      handle,
+      std::move(sampled_indices),
+      displacement_span,
+      local_frontier_displacements,
+      local_frontier_sizes,
+      K,
+      cugraph::ops::graph::INVALID_ID<edge_t>);
+
+  // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in
+  // the neighbor list ignoring edge mask
+
+  if (mask_view) {
+    std::optional<raft::device_span<size_t const>> key_index_span{std::nullopt};
+    if (sample_key_indices) {
+      key_index_span = raft::device_span<size_t const>(sample_key_indices->data(),
+                                                       sample_key_indices->size());
+    }
+    local_indices = convert_to_unmasked_local_nbr_idx(handle,
+                                                      graph_view,
+                                                      major_first,
+                                                      std::move(local_indices),
+                                                      key_index_span,
+                                                      sample_offsets,
+                                                      local_frontier_displacements,
+                                                      local_frontier_sizes,
+                                                      K);
+  }
+
+  return std::make_tuple(
+    std::move(local_indices), std::move(sample_key_indices), std::move(sample_offsets));
+}
+
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename EdgeBiasOp>
+std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
+           std::optional<rmm::device_uvector<size_t>>,
+           std::vector<size_t>>
+biased_sample_and_compute_local_nbr_indices(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  KeyIterator aggregate_local_frontier_key_first,
+  EdgeSrcValueInputWrapper edge_src_value_input,
+  EdgeDstValueInputWrapper edge_dst_value_input,
+  EdgeValueInputWrapper edge_value_input,
+  EdgeBiasOp e_bias_op,
+  std::vector<size_t> const& local_frontier_displacements,
+  std::vector<size_t> const& local_frontier_sizes,
+  raft::random::RngState& rng_state,
+  size_t K,
+  bool with_replacement,
+  bool do_expensive_check /* check e_bias_op return values */)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  using bias_t = typename edge_op_result_type<key_t,
+                                              vertex_t,
+                                              typename EdgeSrcValueInputWrapper::value_type,
+                                              typename EdgeDstValueInputWrapper::value_type,
+                                              typename EdgeValueInputWrapper::value_type,
+                                              EdgeBiasOp>::type;
+
+  int minor_comm_rank{0};
+  int minor_comm_size{1};
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    minor_comm_rank  = minor_comm.get_rank();
+    minor_comm_size  = minor_comm.get_size();
+  }
+  assert(minor_comm_size == graph_view.number_of_local_edge_partitions());
+
+  auto aggregate_local_frontier_major_first =
+    thrust_tuple_get_or_identity<KeyIterator, 0>(aggregate_local_frontier_key_first);
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
+  // 1. compute biases
+
+  auto [aggregate_local_frontier_biases, aggregate_local_frontier_local_degree_offsets] =
+    compute_aggregate_local_frontier_biases(handle,
+                                            graph_view,
+                                            aggregate_local_frontier_key_first,
+                                            edge_src_value_input,
+                                            edge_dst_value_input,
+                                            edge_value_input,
+                                            e_bias_op,
+                                            local_frontier_displacements,
+                                            local_frontier_sizes,
+                                            do_expensive_check);
+
+  // 2. sample neighbor indices and shuffle neighbor indices
+
+  rmm::device_uvector<edge_t> local_nbr_indices(0, handle.get_stream());
+  std::optional<rmm::device_uvector<size_t>> key_indices{std::nullopt};
+  std::vector<size_t> local_frontier_sample_offsets{};
+  if (with_replacement) {
+    // compute segmented inclusive sums (one segment per seed)
+
+    auto key_first = thrust::make_transform_iterator(
+      thrust::make_counting_iterator(size_t{0}),
+      cuda::proclaim_return_type<size_t>(
+        [offsets = raft::device_span<size_t const>(
+           aggregate_local_frontier_local_degree_offsets.data(),
+           aggregate_local_frontier_local_degree_offsets.size())] __device__(size_t i) {
+          return static_cast<size_t>(thrust::distance(
+            offsets.begin() + 1,
+            thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
+        }));
+    thrust::inclusive_scan_by_key(handle.get_thrust_policy(),
+                                  key_first,
+                                  key_first + aggregate_local_frontier_biases.size(),
+                                  get_dataframe_buffer_begin(aggregate_local_frontier_biases),
+                                  get_dataframe_buffer_begin(aggregate_local_frontier_biases));
+
+    auto aggregate_local_frontier_bias_segmented_local_inclusive_sums =
+      std::move(aggregate_local_frontier_biases);
+
+    auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector<bias_t>(
+      local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream());
+    thrust::tabulate(
+      handle.get_thrust_policy(),
+      get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums),
+      get_dataframe_buffer_end(aggregate_local_frontier_bias_local_sums),
+      [offsets =
+         raft::device_span<size_t const>(aggregate_local_frontier_local_degree_offsets.data(),
+                                         aggregate_local_frontier_local_degree_offsets.size()),
+       aggregate_local_frontier_bias_segmented_local_inclusive_sums =
+         raft::device_span<bias_t const>(
+           aggregate_local_frontier_bias_segmented_local_inclusive_sums.data(),
+           aggregate_local_frontier_bias_segmented_local_inclusive_sums
+             .size())] __device__(size_t i) {
+        auto degree = offsets[i + 1] - offsets[i];
+        if (degree > 0) {
+          return aggregate_local_frontier_bias_segmented_local_inclusive_sums[offsets[i] + degree -
+                                                                              1];
+        } else {
+          return bias_t{0.0};
+        }
+      });
+
+    rmm::device_uvector<bias_t> frontier_bias_sums(0, handle.get_stream());
+    std::optional<rmm::device_uvector<bias_t>> frontier_partitioned_bias_local_sum_displacements{
+      std::nullopt};
+    if (minor_comm_size > 1) {
+      std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) =
+        compute_frontier_value_sums_and_partitioned_local_value_sum_displacements(
+          handle,
+          raft::device_span<bias_t const>(aggregate_local_frontier_bias_local_sums.data(),
+                                          aggregate_local_frontier_bias_local_sums.size()),
+          local_frontier_displacements,
+          local_frontier_sizes);
+      aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream());
+      aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream());
+    } else {
+      frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums);
+    }
+
+    rmm::device_uvector<bias_t> sample_random_numbers(frontier_bias_sums.size() * K,
+                                                      handle.get_stream());
+    cugraph::detail::uniform_random_fill(handle.get_stream(),
+                                         sample_random_numbers.data(),
+                                         sample_random_numbers.size(),
+                                         bias_t{0.0},
+                                         bias_t{1.0},
+                                         rng_state);
+    thrust::transform(
+      handle.get_thrust_policy(),
+      sample_random_numbers.begin(),
+      sample_random_numbers.end(),
+      thrust::make_counting_iterator(size_t{0}),
+      sample_random_numbers.begin(),
+      [frontier_bias_sums =
+         raft::device_span<bias_t const>(frontier_bias_sums.data(), frontier_bias_sums.size()),
+       K,
+       invalid_value = std::numeric_limits<bias_t>::infinity()] __device__(bias_t r, size_t i) {
+        // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias
+        return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value;
+      });
+
+    rmm::device_uvector<bias_t> sample_local_random_numbers(0, handle.get_stream());
+    std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) =
+      shuffle_and_compute_local_nbr_values<bias_t, GraphViewType::is_multi_gpu>(
+        handle,
+        std::move(sample_random_numbers),
+        frontier_partitioned_bias_local_sum_displacements
+          ? std::make_optional<raft::device_span<bias_t const>>(
+              (*frontier_partitioned_bias_local_sum_displacements).data(),
+              (*frontier_partitioned_bias_local_sum_displacements).size())
+          : std::nullopt,
+        local_frontier_displacements,
+        local_frontier_sizes,
+        K,
+        std::numeric_limits<bias_t>::infinity());
+
+    local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream());
+    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+      thrust::tabulate(
+        handle.get_thrust_policy(),
+        local_nbr_indices.begin() + local_frontier_sample_offsets[i],
+        local_nbr_indices.begin() + local_frontier_sample_offsets[i + 1],
+        [K,
+         sample_local_random_numbers = raft::device_span<bias_t>(
+           sample_local_random_numbers.data() + local_frontier_sample_offsets[i],
+           local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]),
+         key_indices =
+           key_indices ? thrust::make_optional<raft::device_span<size_t const>>(
+                           (*key_indices).data() + local_frontier_sample_offsets[i],
+                           local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i])
+                       : thrust::nullopt,
+         aggregate_local_frontier_bias_segmented_local_inclusive_sums = raft::device_span<bias_t>(
+           aggregate_local_frontier_bias_segmented_local_inclusive_sums.data(),
+           aggregate_local_frontier_bias_segmented_local_inclusive_sums.size()),
+         local_degree_offsets = raft::device_span<size_t const>(
+           aggregate_local_frontier_local_degree_offsets.data() + local_frontier_displacements[i],
+           local_frontier_sizes[i] + 1),
+         invalid_random_number = std::numeric_limits<bias_t>::infinity(),
+         invalid_idx           = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+          auto key_idx             = key_indices ? (*key_indices)[i] : (i / K);
+          auto local_random_number = sample_local_random_numbers[i];
+          if (local_random_number != invalid_random_number) {
+            auto local_degree = static_cast<edge_t>(local_degree_offsets[key_idx + 1] -
+                                                    local_degree_offsets[key_idx]);
+            auto inclusive_sum_first =
+              aggregate_local_frontier_bias_segmented_local_inclusive_sums.begin() +
+              local_degree_offsets[key_idx];
+            auto inclusive_sum_last = inclusive_sum_first + local_degree;
+            auto local_nbr_idx      = static_cast<edge_t>(thrust::distance(
+              inclusive_sum_first,
+              thrust::upper_bound(
+                thrust::seq, inclusive_sum_first, inclusive_sum_last, local_random_number)));
+            return cuda::std::min(local_nbr_idx, local_degree - 1);
+          } else {
+            return invalid_idx;
+          }
+        });
+    }
+  } else {
+    rmm::device_uvector<edge_t> frontier_degrees(0, handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> frontier_partitioned_local_degree_displacements{
+      std::nullopt};
+    {
+      rmm::device_uvector<edge_t> aggregate_local_frontier_local_degrees(
+        local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream());
+      thrust::adjacent_difference(handle.get_thrust_policy(),
+                                  aggregate_local_frontier_local_degree_offsets.begin() + 1,
+                                  aggregate_local_frontier_local_degree_offsets.end(),
+                                  aggregate_local_frontier_local_degrees.begin());
+      if (minor_comm_size > 1) {
+        std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) =
+          compute_frontier_value_sums_and_partitioned_local_value_sum_displacements(
+            handle,
+            raft::device_span<edge_t const>(aggregate_local_frontier_local_degrees.data(),
+                                            aggregate_local_frontier_local_degrees.size()),
+            local_frontier_displacements,
+            local_frontier_sizes);
+      } else {
+        frontier_degrees = std::move(aggregate_local_frontier_local_degrees);
+      }
+    }
+
+    auto [frontier_indices, frontier_partition_offsets] =
+      partition_v_frontier(handle,
+                           frontier_degrees.begin(),
+                           frontier_degrees.end(),
+                           std::vector<edge_t>{static_cast<edge_t>(K + 1),
+                                               static_cast<edge_t>(minor_comm_size * K * 2)});
+
+    rmm::device_uvector<edge_t> nbr_indices(frontier_degrees.size() * K, handle.get_stream());
+
+    if (minor_comm_size > 1) {
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+
+      std::vector<size_t> low_local_frontier_sizes{};
+      low_local_frontier_sizes =
+        host_scalar_allgather(minor_comm, frontier_partition_offsets[1], handle.get_stream());
+      std::vector<size_t> low_local_frontier_displacements(low_local_frontier_sizes.size());
+      std::exclusive_scan(low_local_frontier_sizes.begin(),
+                          low_local_frontier_sizes.end(),
+                          low_local_frontier_displacements.begin(),
+                          size_t{0});
+
+      if (low_local_frontier_displacements.back() + low_local_frontier_sizes.back() > 0) {
+        // aggregate frontier indices with their degrees in the low range
+
+        auto aggregate_low_local_frontier_indices = rmm::device_uvector<size_t>(
+          low_local_frontier_displacements.back() + low_local_frontier_sizes.back(),
+          handle.get_stream());
+        device_allgatherv(minor_comm,
+                          frontier_indices.begin(),
+                          aggregate_low_local_frontier_indices.begin(),
+                          low_local_frontier_sizes,
+                          low_local_frontier_displacements,
+                          handle.get_stream());
+
+        // collect 0 bias value neighbor indices
+
+        rmm::device_uvector<size_t> zero_bias_frontier_indices(
+          aggregate_low_local_frontier_indices.size() * K /* generous upper bound */,
+          handle.get_stream());
+        rmm::device_uvector<edge_t> zero_bias_local_nbr_indices(zero_bias_frontier_indices.size(),
+                                                                handle.get_stream());
+        rmm::device_scalar<size_t> counter(0, handle.get_stream());
+        std::vector<size_t> zero_bias_count_inclusive_sums(low_local_frontier_sizes.size());
+        for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+          thrust::for_each(
+            handle.get_thrust_policy(),
+            aggregate_low_local_frontier_indices.begin() + low_local_frontier_displacements[i],
+            aggregate_low_local_frontier_indices.begin() +
+              (low_local_frontier_displacements[i] + low_local_frontier_sizes[i]),
+            [aggregate_local_frontier_biases = raft::device_span<bias_t const>(
+               aggregate_local_frontier_biases.data(), aggregate_local_frontier_biases.size()),
+             aggregate_local_frontier_local_degree_offsets = raft::device_span<size_t const>(
+               aggregate_local_frontier_local_degree_offsets.data(),
+               aggregate_local_frontier_local_degree_offsets.size()),
+             zero_bias_frontier_indices = raft::device_span<size_t>(
+               zero_bias_frontier_indices.data(), zero_bias_frontier_indices.size()),
+             zero_bias_local_nbr_indices = raft::device_span<edge_t>(
+               zero_bias_local_nbr_indices.data(), zero_bias_local_nbr_indices.size()),
+             input_offset = local_frontier_displacements[i],
+             counter      = counter.data()] __device__(size_t i) {
+              auto start_offset = aggregate_local_frontier_local_degree_offsets[input_offset + i];
+              auto end_offset = aggregate_local_frontier_local_degree_offsets[input_offset + i + 1];
+              cuda::atomic_ref<size_t, cuda::thread_scope_device> atomic_counter(*counter);
+              for (auto j = start_offset; j < end_offset; ++j) {
+                if (aggregate_local_frontier_biases[j] == 0.0) {
+                  auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed);
+                  zero_bias_frontier_indices[idx]  = i;
+                  zero_bias_local_nbr_indices[idx] = j - start_offset;
+                }
+              }
+            });
+          zero_bias_count_inclusive_sums[i] = counter.value(handle.get_stream());
+        }
+        zero_bias_frontier_indices.resize(zero_bias_count_inclusive_sums.back(),
+                                          handle.get_stream());
+        zero_bias_frontier_indices.shrink_to_fit(handle.get_stream());
+        zero_bias_local_nbr_indices.resize(frontier_indices.size(), handle.get_stream());
+        zero_bias_local_nbr_indices.shrink_to_fit(handle.get_stream());
+        std::vector<size_t> zero_bias_counts(zero_bias_count_inclusive_sums.size());
+        std::adjacent_difference(zero_bias_count_inclusive_sums.begin(),
+                                 zero_bias_count_inclusive_sums.end(),
+                                 zero_bias_counts.begin());
+
+        rmm::device_uvector<size_t> low_frontier_gathered_zero_bias_frontier_indices(
+          0, handle.get_stream());
+        rmm::device_uvector<edge_t> low_frontier_gathered_zero_bias_nbr_indices(
+          0, handle.get_stream());
+        std::vector<size_t> rx_counts{};
+        std::forward_as_tuple(std::tie(low_frontier_gathered_zero_bias_frontier_indices,
+                                       low_frontier_gathered_zero_bias_nbr_indices),
+                              rx_counts) =
+          shuffle_values(minor_comm,
+                         thrust::make_zip_iterator(zero_bias_frontier_indices.begin(),
+                                                   zero_bias_local_nbr_indices.begin()),
+                         zero_bias_counts,
+                         handle.get_stream());
+        std::vector<size_t> rx_displacements(rx_counts.size());
+        std::exclusive_scan(
+          rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0});
+
+        // convert local neighbor indices to global neighbor indices and sort
+
+        auto pair_first =
+          thrust::make_zip_iterator(low_frontier_gathered_zero_bias_frontier_indices.begin(),
+                                    low_frontier_gathered_zero_bias_nbr_indices.begin());
+        for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+          thrust::transform(
+            handle.get_thrust_policy(),
+            pair_first + rx_displacements[i],
+            pair_first + rx_displacements[i] + rx_counts[i],
+            low_frontier_gathered_zero_bias_nbr_indices.begin() + rx_displacements[i],
+            cuda::proclaim_return_type<edge_t>(
+              [frontier_partitioned_local_degree_displacements = raft::device_span<edge_t const>(
+                 (*frontier_partitioned_local_degree_displacements).data(),
+                 (*frontier_partitioned_local_degree_displacements).size()),
+               minor_comm_size,
+               minor_comm_rank = i] __device__(auto pair) {
+                auto frontier_idx  = thrust::get<0>(pair);
+                auto local_nbr_idx = thrust::get<1>(pair);
+                return frontier_partitioned_local_degree_displacements[frontier_idx *
+                                                                         minor_comm_size +
+                                                                       minor_comm_rank] +
+                       local_nbr_idx;
+              }));
+        }
+
+        thrust::sort(handle.get_thrust_policy(),
+                     pair_first,
+                     pair_first + low_frontier_gathered_zero_bias_frontier_indices.size());
+
+        // update neighbor indices excluding zero bias neighbor indices
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          frontier_indices.begin(),
+          frontier_indices.begin() + frontier_partition_offsets[1],
+          [sorted_zero_bias_frontier_indices = raft::device_span<size_t const>(
+             low_frontier_gathered_zero_bias_frontier_indices.data(),
+             low_frontier_gathered_zero_bias_frontier_indices.size()),
+           sorted_zero_bias_nbr_indices =
+             raft::device_span<edge_t const>(low_frontier_gathered_zero_bias_nbr_indices.data(),
+                                             low_frontier_gathered_zero_bias_nbr_indices.size()),
+           frontier_degrees =
+             raft::device_span<edge_t const>(frontier_degrees.data(), frontier_degrees.size()),
+           nbr_indices = raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+           K,
+           invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+            auto first = thrust::lower_bound(thrust::seq,
+                                             sorted_zero_bias_frontier_indices.begin(),
+                                             sorted_zero_bias_frontier_indices.end(),
+                                             i);
+            auto last =
+              thrust::upper_bound(thrust::seq, first, sorted_zero_bias_frontier_indices.end(), i);
+            auto degree      = frontier_degrees[i];
+            edge_t num_valid = 0;
+            if (thrust::distance(first, last) == 0) {
+              thrust::sequence(thrust::seq,
+                               nbr_indices.begin() + i * K,
+                               nbr_indices.begin() + i * K + degree,
+                               edge_t{0});
+              num_valid = degree;
+            } else {
+              auto start_offset =
+                thrust::distance(sorted_zero_bias_frontier_indices.begin(), first);
+              auto end_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), last);
+              for (size_t j = 0; j < degree; ++j) {
+                if (!thrust::binary_search(thrust::seq,
+                                           sorted_zero_bias_nbr_indices.begin() + start_offset,
+                                           sorted_zero_bias_nbr_indices.begin() + end_offset,
+                                           j)) {
+                  *(nbr_indices.begin() + i * K + num_valid) = j;
+                  ++num_valid;
+                }
+              }
+            }
+            thrust::fill(thrust::seq,
+                         nbr_indices.begin() + i * K + num_valid,
+                         nbr_indices.begin() + (i + 1) * K,
+                         invalid_idx);
+          });
+      }
+
+      auto mid_frontier_size = frontier_partition_offsets[2] - frontier_partition_offsets[1];
+      std::vector<size_t> mid_local_frontier_sizes{};
+      mid_local_frontier_sizes =
+        host_scalar_allgather(minor_comm, mid_frontier_size, handle.get_stream());
+      std::vector<size_t> mid_local_frontier_displacements(mid_local_frontier_sizes.size());
+      std::exclusive_scan(mid_local_frontier_sizes.begin(),
+                          mid_local_frontier_sizes.end(),
+                          mid_local_frontier_displacements.begin(),
+                          size_t{0});
+
+      if (mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back() > 0) {
+        // aggregate frontier indices with their degrees in the medium range
+
+        auto aggregate_mid_local_frontier_indices = rmm::device_uvector<size_t>(
+          mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back(),
+          handle.get_stream());
+        device_allgatherv(minor_comm,
+                          frontier_indices.begin() + frontier_partition_offsets[1],
+                          aggregate_mid_local_frontier_indices.begin(),
+                          mid_local_frontier_sizes,
+                          mid_local_frontier_displacements,
+                          handle.get_stream());
+
+        // compute local degrees for the aggregated frontier indices
+
+        rmm::device_uvector<edge_t> aggregate_mid_local_frontier_local_degrees(
+          aggregate_mid_local_frontier_indices.size(), handle.get_stream());
+        for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+          thrust::transform(
+            handle.get_thrust_policy(),
+            aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i],
+            aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i] +
+              mid_local_frontier_sizes[i],
+            aggregate_mid_local_frontier_local_degrees.begin() +
+              mid_local_frontier_displacements[i],
+            cuda::proclaim_return_type<edge_t>(
+              [offsets = raft::device_span<size_t const>(
+                 aggregate_local_frontier_local_degree_offsets.data() +
+                   local_frontier_displacements[i],
+                 local_frontier_sizes[i] + 1)] __device__(size_t i) {
+                return static_cast<edge_t>(offsets[i + 1] - offsets[i]);
+              }));
+        }
+
+        // gather biases for the aggregated frontier indices
+
+        rmm::device_uvector<bias_t> aggregate_mid_local_frontier_biases(0, handle.get_stream());
+        std::vector<size_t> mid_local_frontier_degree_sums(mid_local_frontier_sizes.size());
+        {
+          rmm::device_uvector<size_t> aggregate_mid_local_frontier_local_degree_offsets(
+            aggregate_mid_local_frontier_local_degrees.size() + 1, handle.get_stream());
+          aggregate_mid_local_frontier_local_degree_offsets.set_element_to_zero_async(
+            0, handle.get_stream());
+          thrust::inclusive_scan(handle.get_thrust_policy(),
+                                 aggregate_mid_local_frontier_local_degrees.begin(),
+                                 aggregate_mid_local_frontier_local_degrees.end(),
+                                 aggregate_mid_local_frontier_local_degree_offsets.begin() + 1);
+          aggregate_mid_local_frontier_biases.resize(
+            aggregate_mid_local_frontier_local_degree_offsets.back_element(handle.get_stream()),
+            handle.get_stream());
+
+          std::vector<size_t> mid_local_frontier_degree_sum_lasts(
+            mid_local_frontier_degree_sums.size());
+          for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+            thrust::for_each(
+              handle.get_thrust_policy(),
+              thrust::make_counting_iterator(size_t{0}),
+              thrust::make_counting_iterator(mid_local_frontier_sizes[i]),
+              [aggregate_local_frontier_biases = raft::device_span<bias_t>(
+                 aggregate_local_frontier_biases.data(), aggregate_local_frontier_biases.size()),
+               aggregate_local_frontier_local_degree_offsets =
+                 raft::device_span<size_t>(aggregate_local_frontier_local_degree_offsets.data(),
+                                           aggregate_local_frontier_local_degree_offsets.size()),
+               mid_local_frontier_indices = raft::device_span<size_t const>(
+                 aggregate_mid_local_frontier_indices.data() + mid_local_frontier_displacements[i],
+                 mid_local_frontier_sizes[i]),
+               aggregate_mid_local_frontier_biases =
+                 raft::device_span<bias_t>(aggregate_mid_local_frontier_biases.data(),
+                                           aggregate_mid_local_frontier_biases.size()),
+               aggregate_mid_local_frontier_local_degree_offsets = raft::device_span<size_t>(
+                 aggregate_mid_local_frontier_local_degree_offsets.data(),
+                 aggregate_mid_local_frontier_local_degree_offsets.size()),
+               input_offset  = local_frontier_displacements[i],
+               output_offset = mid_local_frontier_displacements[i]] __device__(size_t i) {
+                thrust::copy(
+                  thrust::seq,
+                  aggregate_local_frontier_biases.begin() +
+                    aggregate_local_frontier_local_degree_offsets[input_offset +
+                                                                  mid_local_frontier_indices[i]],
+                  aggregate_local_frontier_biases.begin() +
+                    aggregate_local_frontier_local_degree_offsets
+                      [input_offset + (mid_local_frontier_indices[i] + 1)],
+                  aggregate_mid_local_frontier_biases.begin() +
+                    aggregate_mid_local_frontier_local_degree_offsets[output_offset + i]);
+              });
+            mid_local_frontier_degree_sum_lasts[i] =
+              aggregate_mid_local_frontier_local_degree_offsets.element(
+                mid_local_frontier_displacements[i] + mid_local_frontier_sizes[i],
+                handle.get_stream());
+          }
+          std::adjacent_difference(mid_local_frontier_degree_sum_lasts.begin(),
+                                   mid_local_frontier_degree_sum_lasts.end(),
+                                   mid_local_frontier_degree_sums.begin());
+        }
+        aggregate_mid_local_frontier_indices.resize(0, handle.get_stream());
+        aggregate_mid_local_frontier_indices.shrink_to_fit(handle.get_stream());
+
+        // shuffle local degrees & biases
+
+        rmm::device_uvector<size_t> mid_frontier_gathered_local_degree_offsets(0,
+                                                                               handle.get_stream());
+        {
+          rmm::device_uvector<edge_t> mid_frontier_gathered_local_degrees(0, handle.get_stream());
+          std::tie(mid_frontier_gathered_local_degrees, std::ignore) =
+            shuffle_values(minor_comm,
+                           aggregate_mid_local_frontier_local_degrees.data(),
+                           mid_local_frontier_sizes,
+                           handle.get_stream());
+          aggregate_mid_local_frontier_local_degrees.resize(0, handle.get_stream());
+          aggregate_mid_local_frontier_local_degrees.shrink_to_fit(handle.get_stream());
+          mid_frontier_gathered_local_degree_offsets.resize(
+            mid_frontier_gathered_local_degrees.size() + 1, handle.get_stream());
+          mid_frontier_gathered_local_degree_offsets.set_element_to_zero_async(0,
+                                                                               handle.get_stream());
+          thrust::inclusive_scan(handle.get_thrust_policy(),
+                                 mid_frontier_gathered_local_degrees.begin(),
+                                 mid_frontier_gathered_local_degrees.end(),
+                                 mid_frontier_gathered_local_degree_offsets.begin() + 1);
+        }
+
+        rmm::device_uvector<bias_t> mid_frontier_gathered_biases(0, handle.get_stream());
+        std::tie(mid_frontier_gathered_biases, std::ignore) =
+          shuffle_values(minor_comm,
+                         aggregate_mid_local_frontier_biases.data(),
+                         mid_local_frontier_degree_sums,
+                         handle.get_stream());
+        aggregate_mid_local_frontier_biases.resize(0, handle.get_stream());
+        aggregate_mid_local_frontier_biases.shrink_to_fit(handle.get_stream());
+
+        auto mid_frontier_degree_first = thrust::make_transform_iterator(
+          frontier_indices.begin() + frontier_partition_offsets[1],
+          cuda::proclaim_return_type<edge_t>(
+            [frontier_degrees = raft::device_span<edge_t>(
+               frontier_degrees.data(), frontier_degrees.size())] __device__(size_t i) {
+              return frontier_degrees[i];
+            }));
+        rmm::device_uvector<size_t> mid_frontier_degree_offsets(mid_frontier_size + 1,
+                                                                handle.get_stream());
+        mid_frontier_degree_offsets.set_element_to_zero_async(0, handle.get_stream());
+        thrust::inclusive_scan(handle.get_thrust_policy(),
+                               mid_frontier_degree_first,
+                               mid_frontier_degree_first + mid_frontier_size,
+                               mid_frontier_degree_offsets.begin() + 1);
+        rmm::device_uvector<bias_t> mid_frontier_biases(mid_frontier_gathered_biases.size(),
+                                                        handle.get_stream());
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(mid_frontier_size),
+          [mid_frontier_gathered_local_degree_offsets =
+             raft::device_span<size_t>(mid_frontier_gathered_local_degree_offsets.data(),
+                                       mid_frontier_gathered_local_degree_offsets.size()),
+           mid_frontier_gathered_biases = raft::device_span<bias_t const>(
+             mid_frontier_gathered_biases.data(), mid_frontier_gathered_biases.size()),
+           mid_frontier_degree_offsets = raft::device_span<size_t>(
+             mid_frontier_degree_offsets.data(), mid_frontier_degree_offsets.size()),
+           mid_frontier_biases =
+             raft::device_span<bias_t>(mid_frontier_biases.data(), mid_frontier_biases.size()),
+           minor_comm_size,
+           mid_frontier_size] __device__(size_t i) {
+            auto output_offset = mid_frontier_degree_offsets[i];
+            for (int j = 0; j < minor_comm_size; ++j) {
+              auto input_offset =
+                mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i];
+              auto input_size =
+                mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i + 1] -
+                input_offset;
+              thrust::copy(thrust::seq,
+                           mid_frontier_gathered_biases.begin() + input_offset,
+                           mid_frontier_gathered_biases.begin() + input_offset + input_size,
+                           mid_frontier_biases.begin() + output_offset);
+              output_offset += input_size;
+            }
+          });
+
+        // now sample and update indices
+
+        compute_biased_sampling_index_without_replacement<edge_t, bias_t>(
+          handle,
+          std::nullopt,
+          raft::device_span<size_t const>(mid_frontier_degree_offsets.data(),
+                                          mid_frontier_degree_offsets.size()),
+          raft::device_span<bias_t const>(mid_frontier_biases.data(), mid_frontier_biases.size()),
+          std::make_optional<raft::device_span<size_t const>>(
+            frontier_indices.begin() + frontier_partition_offsets[1], mid_frontier_size),
+          raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+          std::nullopt,
+          rng_state,
+          K,
+          false);
+      }
+
+      auto high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[2];
+      std::vector<size_t> high_local_frontier_sizes{};
+      high_local_frontier_sizes =
+        host_scalar_allgather(minor_comm, high_frontier_size, handle.get_stream());
+
+      std::vector<size_t> high_local_frontier_displacements(high_local_frontier_sizes.size());
+      std::exclusive_scan(high_local_frontier_sizes.begin(),
+                          high_local_frontier_sizes.end(),
+                          high_local_frontier_displacements.begin(),
+                          size_t{0});
+      if (high_local_frontier_displacements.back() + high_local_frontier_sizes.back() > 0) {
+        // aggregate frontier indices with their degrees in the high range
+
+        auto aggregate_high_local_frontier_indices = rmm::device_uvector<size_t>(
+          high_local_frontier_displacements.back() + high_local_frontier_sizes.back(),
+          handle.get_stream());
+        device_allgatherv(minor_comm,
+                          frontier_indices.begin() + frontier_partition_offsets[2],
+                          aggregate_high_local_frontier_indices.begin(),
+                          high_local_frontier_sizes,
+                          high_local_frontier_displacements,
+                          handle.get_stream());
+
+        // local sample and update indices
+
+        rmm::device_uvector<edge_t> aggregate_high_local_frontier_local_nbr_indices(
+          (high_local_frontier_displacements.back() + high_local_frontier_sizes.back()) * K,
+          handle.get_stream());
+        rmm::device_uvector<bias_t> aggregate_high_local_frontier_keys(
+          aggregate_high_local_frontier_local_nbr_indices.size(), handle.get_stream());
+        for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+          compute_biased_sampling_index_without_replacement<edge_t, bias_t>(
+            handle,
+            std::make_optional<raft::device_span<size_t const>>(
+              aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i],
+              high_local_frontier_sizes[i]),
+            raft::device_span<size_t const>(aggregate_local_frontier_local_degree_offsets.data() +
+                                              local_frontier_displacements[i],
+                                            local_frontier_sizes[i] + 1),
+            raft::device_span<bias_t>(aggregate_local_frontier_biases.data(),
+                                      aggregate_local_frontier_biases.size()),
+            std::nullopt,
+            raft::device_span<edge_t>(aggregate_high_local_frontier_local_nbr_indices.data() +
+                                        high_local_frontier_displacements[i] * K,
+                                      high_local_frontier_sizes[i] * K),
+            std::make_optional<raft::device_span<bias_t>>(
+              aggregate_high_local_frontier_keys.data() + high_local_frontier_displacements[i] * K,
+              high_local_frontier_sizes[i] * K),
+            rng_state,
+            K,
+            false);
+        }
+
+        // shuffle local sampling outputs
+
+        std::vector<size_t> tx_counts(high_local_frontier_sizes);
+        std::transform(high_local_frontier_sizes.begin(),
+                       high_local_frontier_sizes.end(),
+                       tx_counts.begin(),
+                       [K](size_t size) { return size * K; });
+        rmm::device_uvector<edge_t> high_frontier_gathered_local_nbr_indices(0,
+                                                                             handle.get_stream());
+        std::tie(high_frontier_gathered_local_nbr_indices, std::ignore) =
+          shuffle_values(minor_comm,
+                         aggregate_high_local_frontier_local_nbr_indices.data(),
+                         tx_counts,
+                         handle.get_stream());
+        rmm::device_uvector<bias_t> high_frontier_gathered_keys(0, handle.get_stream());
+        std::tie(high_frontier_gathered_keys, std::ignore) = shuffle_values(
+          minor_comm, aggregate_high_local_frontier_keys.data(), tx_counts, handle.get_stream());
+        aggregate_high_local_frontier_local_nbr_indices.resize(0, handle.get_stream());
+        aggregate_high_local_frontier_local_nbr_indices.shrink_to_fit(handle.get_stream());
+        aggregate_high_local_frontier_keys.resize(0, handle.get_stream());
+        aggregate_high_local_frontier_keys.shrink_to_fit(handle.get_stream());
+
+        // merge local sampling outputs
+
+        rmm::device_uvector<edge_t> high_frontier_nbr_indices(
+          high_frontier_size * minor_comm_size * K, handle.get_stream());
+        rmm::device_uvector<bias_t> high_frontier_keys(high_frontier_nbr_indices.size(),
+                                                       handle.get_stream());
+        auto index_first = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(size_t{0}),
+          cuda::proclaim_return_type<size_t>(
+            [K, minor_comm_rank, minor_comm_size, high_frontier_size] __device__(size_t i) {
+              auto idx             = i / (K * minor_comm_size);
+              auto minor_comm_rank = (i % (K * minor_comm_size)) / K;
+              return minor_comm_rank * (high_frontier_size * K) + idx * K + (i % K);
+            }));
+        auto high_frontier_gathered_nbr_idx_first = thrust::make_transform_iterator(
+          thrust::counting_iterator(size_t{0}),
+          cuda::proclaim_return_type<edge_t>(
+            [frontier_partitioned_local_degree_displacements = raft::device_span<edge_t const>(
+               (*frontier_partitioned_local_degree_displacements).data(),
+               (*frontier_partitioned_local_degree_displacements).size()),
+             high_frontier_indices = raft::device_span<size_t const>(
+               frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size),
+             high_frontier_gathered_local_nbr_indices =
+               raft::device_span<edge_t const>(high_frontier_gathered_local_nbr_indices.data(),
+                                               high_frontier_gathered_local_nbr_indices.size()),
+             K,
+             minor_comm_size,
+             high_frontier_size] __device__(size_t i) {
+              auto minor_comm_rank = static_cast<int>(i / (high_frontier_size * K));
+              auto frontier_idx    = high_frontier_indices[(i % (high_frontier_size * K)) / K];
+              return frontier_partitioned_local_degree_displacements[frontier_idx *
+                                                                       minor_comm_size +
+                                                                     minor_comm_rank] +
+                     high_frontier_gathered_local_nbr_indices[i];
+            }));
+        thrust::gather(
+          handle.get_thrust_policy(),
+          index_first,
+          index_first + high_frontier_nbr_indices.size(),
+          thrust::make_zip_iterator(high_frontier_gathered_nbr_idx_first,
+                                    high_frontier_gathered_keys.begin()),
+          thrust::make_zip_iterator(high_frontier_nbr_indices.begin(), high_frontier_keys.begin()));
+        high_frontier_gathered_local_nbr_indices.resize(0, handle.get_stream());
+        high_frontier_gathered_local_nbr_indices.shrink_to_fit(handle.get_stream());
+        high_frontier_gathered_keys.resize(0, handle.get_stream());
+        high_frontier_gathered_keys.shrink_to_fit(handle.get_stream());
+
+        rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
+        size_t tmp_storage_bytes{0};
+
+        rmm::device_uvector<edge_t> high_frontier_segment_sorted_nbr_indices(
+          high_frontier_nbr_indices.size(), handle.get_stream());
+        rmm::device_uvector<bias_t> high_frontier_segment_sorted_keys(high_frontier_keys.size(),
+                                                                      handle.get_stream());
+        cub::DeviceSegmentedSort::SortPairs(
+          static_cast<void*>(nullptr),
+          tmp_storage_bytes,
+          high_frontier_keys.data(),
+          high_frontier_segment_sorted_keys.data(),
+          high_frontier_nbr_indices.data(),
+          high_frontier_segment_sorted_nbr_indices.data(),
+          high_frontier_size * K * minor_comm_size,
+          high_frontier_size,
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                          multiplier_t<size_t>{minor_comm_size * K}),
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
+                                          multiplier_t<size_t>{minor_comm_size * K}),
+          handle.get_stream());
+        if (tmp_storage_bytes > d_tmp_storage.size()) {
+          d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
+        }
+        cub::DeviceSegmentedSort::SortPairs(
+          d_tmp_storage.data(),
+          tmp_storage_bytes,
+          high_frontier_keys.data(),
+          high_frontier_segment_sorted_keys.data(),
+          high_frontier_nbr_indices.data(),
+          high_frontier_segment_sorted_nbr_indices.data(),
+          high_frontier_size * K * minor_comm_size,
+          high_frontier_size,
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                          multiplier_t<size_t>{minor_comm_size * K}),
+          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
+                                          multiplier_t<size_t>{minor_comm_size * K}),
+          handle.get_stream());
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(high_frontier_size),
+          [high_frontier_indices = raft::device_span<size_t const>(
+             frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size),
+           high_frontier_segment_sorted_nbr_indices =
+             raft::device_span<edge_t const>(high_frontier_segment_sorted_nbr_indices.data(),
+                                             high_frontier_segment_sorted_nbr_indices.size()),
+           nbr_indices = raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+           K,
+           minor_comm_size] __device__(size_t i) {
+            thrust::copy(
+              thrust::seq,
+              high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size),
+              high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size + K),
+              nbr_indices.begin() + high_frontier_indices[i] * K);
+          });
+      }
+    } else {
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        frontier_indices.begin(),
+        frontier_indices.begin() + frontier_partition_offsets[1],
+        [aggregate_local_frontier_biases = raft::device_span<bias_t>(
+           aggregate_local_frontier_biases.data(), aggregate_local_frontier_biases.size()),
+         aggregate_local_frontier_local_degree_offsets =
+           raft::device_span<size_t>(aggregate_local_frontier_local_degree_offsets.data(),
+                                     aggregate_local_frontier_local_degree_offsets.size()),
+         nbr_indices = raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+         K,
+         invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+          auto start_offset = aggregate_local_frontier_local_degree_offsets[i];
+          auto degree       = aggregate_local_frontier_local_degree_offsets[i + 1] - start_offset;
+          edge_t num_valid  = 0;
+          for (size_t j = 0; j < degree; ++j) {
+            auto bias = aggregate_local_frontier_biases[start_offset + j];
+            if (bias > 0.0) {
+              *(nbr_indices.begin() + i * K + num_valid) = j;
+              ++num_valid;
+            }
+          }
+          thrust::fill(thrust::seq,
+                       nbr_indices.begin() + i * K + num_valid,
+                       nbr_indices.begin() + (i + 1) * K,
+                       invalid_idx);
+        });
+
+      auto mid_and_high_frontier_size =
+        frontier_partition_offsets[3] - frontier_partition_offsets[1];
+      compute_biased_sampling_index_without_replacement<edge_t, bias_t>(
+        handle,
+        std::make_optional<raft::device_span<size_t const>>(
+          frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size),
+        raft::device_span<size_t const>(aggregate_local_frontier_local_degree_offsets.data(),
+                                        aggregate_local_frontier_local_degree_offsets.size()),
+        raft::device_span<bias_t const>(aggregate_local_frontier_biases.data(),
+                                        aggregate_local_frontier_biases.size()),
+        std::make_optional<raft::device_span<size_t const>>(
+          frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size),
+        raft::device_span<edge_t>(nbr_indices.data(), nbr_indices.size()),
+        std::nullopt,
+        rng_state,
+        K,
+        false);
+    }
+
+    std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
+      shuffle_and_compute_local_nbr_values<edge_t, GraphViewType::is_multi_gpu>(
+        handle,
+        std::move(nbr_indices),
+        frontier_partitioned_local_degree_displacements
+          ? std::make_optional<raft::device_span<edge_t const>>(
+              (*frontier_partitioned_local_degree_displacements).data(),
+              (*frontier_partitioned_local_degree_displacements).size())
+          : std::nullopt,
+        local_frontier_displacements,
+        local_frontier_sizes,
+        K,
+        cugraph::ops::graph::INVALID_ID<edge_t>);
+  }
+
+  // 3. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in
+  // the neighbor list ignoring edge mask
+
+  if (edge_mask_view) {
+    local_nbr_indices = convert_to_unmasked_local_nbr_idx(
+      handle,
+      graph_view,
+      aggregate_local_frontier_major_first,
+      std::move(local_nbr_indices),
+      key_indices ? std::make_optional<raft::device_span<size_t const>>((*key_indices).data(),
+                                                                        (*key_indices).size())
+                  : std::nullopt,
+      local_frontier_sample_offsets,
+      local_frontier_displacements,
+      local_frontier_sizes,
+      K);
+  }
+
+  return std::make_tuple(
+    std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets));
+}
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/detail/transform_v_frontier_e.cuh b/cpp/src/prims/detail/transform_v_frontier_e.cuh
new file mode 100644
index 00000000000..7d8824849f0
--- /dev/null
+++ b/cpp/src/prims/detail/transform_v_frontier_e.cuh
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/detail/partition_v_frontier.cuh"
+#include "prims/property_op_utils.cuh"
+
+#include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_edge_property_device_view.cuh>
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/core/handle.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/tuple.h>
+
+#include <type_traits>
+
+namespace cugraph {
+
+namespace detail {
+
+// Threads per block for every kernel launched from this file. FIXME: block size requires tuning
+int32_t constexpr transform_v_frontier_e_kernel_block_size = 128;
+
+// Evaluates e_op for one edge of a frontier key and stores the result through value_iter. key is
+// passed on the major side and minor on the other; GraphViewType::is_storage_transposed decides
+// which of the two is the source and which is the destination.
+template <typename key_t,
+          typename GraphViewType,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgeOp,
+          typename ValueIterator>
+__device__ void transform_v_frontier_e_update_buffer_element(
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu>& edge_partition,
+  key_t key,
+  typename GraphViewType::vertex_type major_offset,
+  typename GraphViewType::vertex_type minor,
+  typename GraphViewType::edge_type edge_offset,
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  EdgeOp e_op,
+  ValueIterator value_iter)
+{
+  auto const offset_of_minor = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+  if constexpr (GraphViewType::is_storage_transposed) {
+    // transposed storage: the key (major side) is the destination, the minor is the source
+    *value_iter = e_op(minor,
+                       key,
+                       edge_partition_src_value_input.get(offset_of_minor),
+                       edge_partition_dst_value_input.get(major_offset),
+                       edge_partition_e_value_input.get(edge_offset));
+  } else {
+    // non-transposed storage: the key (major side) is the source, the minor is the destination
+    *value_iter = e_op(key,
+                       minor,
+                       edge_partition_src_value_input.get(major_offset),
+                       edge_partition_dst_value_input.get(offset_of_minor),
+                       edge_partition_e_value_input.get(edge_offset));
+  }
+}
+
+// Kernel for frontier keys handled one key per thread (hypersparse or low-degree majors).
+//
+// Each thread takes the key indices at positions idx, idx + (total thread count), ... of
+// [edge_partition_frontier_key_index_first, edge_partition_frontier_key_index_last). A key index
+// selects the key (relative to edge_partition_frontier_key_first) and, through
+// edge_partition_frontier_local_degree_offsets, the key's first output slot (relative to
+// value_first). The thread walks the key's local edges in order and stores one e_op result per
+// edge; if edge_partition_e_mask holds a mask, edges for which it returns false are skipped and
+// the results of the remaining edges are stored contiguously.
+//
+// hypersparse = true: the major is looked up with major_idx_from_major_nocheck() and a major
+// that is not found contributes no edges. hypersparse = false: local edges are fetched by major
+// offset.
+template <bool hypersparse,
+          typename GraphViewType,
+          typename KeyIterator,
+          typename IndexIterator,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
+          typename EdgeOp,
+          typename ValueIterator>
+__global__ static void transform_v_frontier_e_hypersparse_or_low_degree(
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu> edge_partition,
+  KeyIterator edge_partition_frontier_key_first,
+  IndexIterator edge_partition_frontier_key_index_first,
+  IndexIterator edge_partition_frontier_key_index_last,
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  raft::device_span<size_t const> edge_partition_frontier_local_degree_offsets,
+  EdgeOp e_op,
+  ValueIterator value_first)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  // number of key indices to process and number of threads in the grid
+  auto const num_keys = static_cast<size_t>(thrust::distance(
+    edge_partition_frontier_key_index_first, edge_partition_frontier_key_index_last));
+  auto const stride = gridDim.x * blockDim.x;
+
+  for (auto idx = static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x); idx < num_keys;
+       idx += stride) {
+    auto key_idx      = *(edge_partition_frontier_key_index_first + idx);
+    auto key          = *(edge_partition_frontier_key_first + key_idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+
+    // a hypersparse major that is not found keeps the defaults below (no local edges)
+    vertex_t const* nbrs{nullptr};
+    edge_t first_edge{};
+    edge_t degree{};
+    if constexpr (hypersparse) {
+      auto major_idx = edge_partition.major_idx_from_major_nocheck(major);
+      if (major_idx) {
+        thrust::tie(nbrs, first_edge, degree) = edge_partition.local_edges(*major_idx);
+      }
+    } else {
+      thrust::tie(nbrs, first_edge, degree) = edge_partition.local_edges(major_offset);
+    }
+
+    // out_pos: number of results stored so far for this key
+    auto out_first = value_first + edge_partition_frontier_local_degree_offsets[key_idx];
+    edge_t out_pos{0};
+    for (edge_t e = 0; e < degree; ++e) {
+      // with an edge mask, only edges accepted by the mask produce a result; without one,
+      // out_pos always equals e
+      if (edge_partition_e_mask && !(*edge_partition_e_mask).get(first_edge + e)) { continue; }
+      transform_v_frontier_e_update_buffer_element<key_t, GraphViewType>(
+        edge_partition,
+        key,
+        major_offset,
+        nbrs[e],
+        first_edge + e,
+        edge_partition_src_value_input,
+        edge_partition_dst_value_input,
+        edge_partition_e_value_input,
+        e_op,
+        out_first + out_pos);
+      ++out_pos;
+    }
+  }
+}
+
+// Kernel for mid-degree frontier keys: one warp per key; lane k of the warp handles the k-th
+// edge of every round of warp_size() consecutive local edges. With an edge mask, the results of
+// the unmasked edges are compacted with a per-round warp scan, so they are stored in edge order
+// (the same order the one-thread-per-key kernel produces). The block size must be
+// transform_v_frontier_e_kernel_block_size (it sizes the per-warp scan storage).
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename IndexIterator,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
+          typename EdgeOp,
+          typename ValueIterator>
+__global__ static void transform_v_frontier_e_mid_degree(
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu> edge_partition,
+  KeyIterator edge_partition_frontier_key_first,
+  IndexIterator edge_partition_frontier_key_index_first,
+  IndexIterator edge_partition_frontier_key_index_last,
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  raft::device_span<size_t const> edge_partition_frontier_local_degree_offsets,
+  EdgeOp e_op,
+  ValueIterator value_first)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  static_assert(transform_v_frontier_e_kernel_block_size % raft::warp_size() == 0);
+  auto const lane_id = tid % raft::warp_size();
+  size_t idx         = static_cast<size_t>(tid / raft::warp_size());
+
+  // cub::WarpScan expects a separate TempStorage for every warp of the thread block
+  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
+  __shared__ typename WarpScan::TempStorage
+    temp_storage[transform_v_frontier_e_kernel_block_size / raft::warp_size()];
+  auto const warp_in_block = threadIdx.x / raft::warp_size();
+
+  while (idx < static_cast<size_t>(thrust::distance(edge_partition_frontier_key_index_first,
+                                                    edge_partition_frontier_key_index_last))) {
+    auto key_idx      = *(edge_partition_frontier_key_index_first + idx);
+    auto key          = *(edge_partition_frontier_key_first + key_idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    vertex_t const* indices{nullptr};
+    [[maybe_unused]] edge_t edge_offset{};
+    edge_t local_degree{};
+    thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+    auto this_key_value_first = value_first + edge_partition_frontier_local_degree_offsets[key_idx];
+    // edges are handled in rounds of warp_size() consecutive edges (lane k takes edge base + k)
+    edge_t num_stored{0};  // masked case only: results stored for this key in earlier rounds
+    for (edge_t base = 0; base < local_degree; base += raft::warp_size()) {
+      auto const i = base + static_cast<edge_t>(lane_id);
+      auto valid   = i < local_degree;
+      auto offset  = i;
+      if (edge_partition_e_mask) {
+        valid = valid && (*edge_partition_e_mask).get(edge_offset + i);
+        edge_t offset_within_round{};
+        edge_t round_count{};
+        // all lanes of the warp (valid or not) take part in the scan
+        WarpScan(temp_storage[warp_in_block])
+          .ExclusiveSum(valid ? edge_t{1} : edge_t{0}, offset_within_round, round_count);
+        __syncwarp();  // temp_storage is reused in the next round
+        offset = num_stored + offset_within_round;
+        num_stored += round_count;
+      }
+      if (valid) {
+        transform_v_frontier_e_update_buffer_element<key_t, GraphViewType>(
+          edge_partition,
+          key,
+          major_offset,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          e_op,
+          this_key_value_first + offset);
+      }
+    }
+
+    idx += gridDim.x * (blockDim.x / raft::warp_size());
+  }
+}
+
+// Kernel for high-degree frontier keys: one thread block per key; thread t of the block handles
+// the t-th edge of every round of blockDim.x consecutive local edges. With an edge mask, the
+// results of the unmasked edges are compacted with a per-round block scan, so they are stored in
+// edge order (the same order the one-thread-per-key kernel produces).
+//
+// The block size must be transform_v_frontier_e_kernel_block_size (BlockScan template argument).
+// A __syncthreads() follows every scan because cub requires a barrier before the scan's
+// TempStorage is reused.
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename IndexIterator,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
+          typename EdgeOp,
+          typename ValueIterator>
+__global__ static void transform_v_frontier_e_high_degree(
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu> edge_partition,
+  KeyIterator edge_partition_frontier_key_first,
+  IndexIterator edge_partition_frontier_key_index_first,
+  IndexIterator edge_partition_frontier_key_index_last,
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  raft::device_span<size_t const> edge_partition_frontier_local_degree_offsets,
+  EdgeOp e_op,
+  ValueIterator value_first)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  auto idx = static_cast<size_t>(blockIdx.x);
+
+  using BlockScan = cub::BlockScan<edge_t, transform_v_frontier_e_kernel_block_size>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  while (idx < static_cast<size_t>(thrust::distance(edge_partition_frontier_key_index_first,
+                                                    edge_partition_frontier_key_index_last))) {
+    auto key_idx      = *(edge_partition_frontier_key_index_first + idx);
+    auto key          = *(edge_partition_frontier_key_first + key_idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    vertex_t const* indices{nullptr};
+    [[maybe_unused]] edge_t edge_offset{};
+    edge_t local_degree{};
+    thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+    auto this_key_value_first = value_first + edge_partition_frontier_local_degree_offsets[key_idx];
+    // edges are handled in rounds of blockDim.x consecutive edges (thread t takes edge base + t)
+    edge_t num_stored{0};  // masked case only: results stored for this key in earlier rounds
+    for (edge_t base = 0; base < local_degree; base += static_cast<edge_t>(blockDim.x)) {
+      auto const i = base + static_cast<edge_t>(threadIdx.x);
+      auto valid   = i < local_degree;
+      auto offset  = i;
+      if (edge_partition_e_mask) {
+        valid = valid && (*edge_partition_e_mask).get(edge_offset + i);
+        edge_t offset_within_round{};
+        edge_t round_count{};
+        // all threads of the block (valid or not) take part in the scan
+        BlockScan(temp_storage)
+          .ExclusiveSum(valid ? edge_t{1} : edge_t{0}, offset_within_round, round_count);
+        __syncthreads();  // temp_storage is reused by the next scan (next round or next key)
+        offset = num_stored + offset_within_round;
+        num_stored += round_count;
+      }
+      if (valid) {
+        transform_v_frontier_e_update_buffer_element<key_t, GraphViewType>(
+          edge_partition,
+          key,
+          major_offset,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          e_op,
+          this_key_value_first + offset);
+      }
+    }
+
+    idx += gridDim.x;
+  }
+}
+
+// Applies e_op to every local edge (edges disabled by the graph's edge mask are skipped) of every
+// key in the aggregate local frontier. Returns std::tuple of e_op results and offsets: the
+// results for the key at position k of the aggregate frontier occupy
+// [offsets[k], offsets[k + 1]) of the result buffer.
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename EdgeOp>
+auto transform_v_frontier_e(raft::handle_t const& handle,
+                            GraphViewType const& graph_view,
+                            KeyIterator aggregate_local_frontier_key_first,
+                            EdgeSrcValueInputWrapper edge_src_value_input,
+                            EdgeDstValueInputWrapper edge_dst_value_input,
+                            EdgeValueInputWrapper edge_value_input,
+                            EdgeOp e_op,
+                            std::vector<size_t> const& local_frontier_displacements,
+                            std::vector<size_t> const& local_frontier_sizes)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+
+  using e_op_result_t =
+    typename detail::edge_op_result_type<key_t,
+                                         vertex_t,
+                                         typename EdgeSrcValueInputWrapper::value_type,
+                                         typename EdgeDstValueInputWrapper::value_type,
+                                         typename EdgeValueInputWrapper::value_type,
+                                         EdgeOp>::type;
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<e_op_result_t>::value);
+
+  using edge_partition_src_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
+  using edge_partition_dst_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
+  using edge_partition_e_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_edge_property_device_view_t<
+      edge_t,
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
+  // 1. update aggregate_local_frontier_local_degree_offsets
+
+  auto aggregate_local_frontier_local_degree_offsets = rmm::device_uvector<size_t>(
+    local_frontier_displacements.back() + local_frontier_sizes.back() + 1, handle.get_stream());
+  aggregate_local_frontier_local_degree_offsets.set_element_to_zero_async(
+    aggregate_local_frontier_local_degree_offsets.size() - 1, handle.get_stream());
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
+    auto edge_partition_frontier_key_first =
+      aggregate_local_frontier_key_first + local_frontier_displacements[i];
+    auto edge_partition_frontier_major_first =
+      thrust_tuple_get_or_identity<KeyIterator, 0>(edge_partition_frontier_key_first);
+
+    auto edge_partition_frontier_local_degrees =
+      edge_partition_e_mask ? edge_partition.compute_local_degrees_with_mask(
+                                (*edge_partition_e_mask).value_first(),
+                                edge_partition_frontier_major_first,
+                                edge_partition_frontier_major_first + local_frontier_sizes[i],
+                                handle.get_stream())
+                            : edge_partition.compute_local_degrees(
+                                edge_partition_frontier_major_first,
+                                edge_partition_frontier_major_first + local_frontier_sizes[i],
+                                handle.get_stream());
+
+    // FIXME: this copy is unnecessary if edge_partition.compute_local_degrees() takes a pointer
+    // to the output array
+    thrust::copy(
+      handle.get_thrust_policy(),
+      edge_partition_frontier_local_degrees.begin(),
+      edge_partition_frontier_local_degrees.end(),
+      aggregate_local_frontier_local_degree_offsets.begin() + local_frontier_displacements[i]);
+  }
+  thrust::exclusive_scan(handle.get_thrust_policy(),
+                         aggregate_local_frontier_local_degree_offsets.begin(),
+                         aggregate_local_frontier_local_degree_offsets.end(),
+                         aggregate_local_frontier_local_degree_offsets.begin());
+
+  // 2. update aggregate_value_buffer
+
+  auto aggregate_value_buffer = allocate_dataframe_buffer<e_op_result_t>(
+    aggregate_local_frontier_local_degree_offsets.back_element(handle.get_stream()),
+    handle.get_stream());
+
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    // nothing to compute for an empty frontier (the raft grid helpers expect a non-zero size)
+    if (local_frontier_sizes[i] == 0) { continue; }
+
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
+    auto edge_partition_frontier_key_first =
+      aggregate_local_frontier_key_first + local_frontier_displacements[i];
+    auto edge_partition_frontier_major_first =
+      thrust_tuple_get_or_identity<KeyIterator, 0>(edge_partition_frontier_key_first);
+
+    auto edge_partition_frontier_local_degree_offsets = raft::device_span<size_t const>(
+      aggregate_local_frontier_local_degree_offsets.data() + local_frontier_displacements[i],
+      local_frontier_sizes[i] + 1);
+
+    edge_partition_src_input_device_view_t edge_partition_src_value_input{};
+    edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input);
+      edge_partition_dst_value_input =
+        edge_partition_dst_input_device_view_t(edge_dst_value_input, i);
+    } else {
+      edge_partition_src_value_input =
+        edge_partition_src_input_device_view_t(edge_src_value_input, i);
+      edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input);
+    }
+    auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i);
+
+    auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
+    if (segment_offsets) {
+      auto [edge_partition_key_indices, edge_partition_v_frontier_partition_offsets] =
+        partition_v_frontier(
+          handle,
+          edge_partition_frontier_major_first,
+          edge_partition_frontier_major_first + local_frontier_sizes[i],
+          std::vector<vertex_t>{edge_partition.major_range_first() + (*segment_offsets)[1],
+                                edge_partition.major_range_first() + (*segment_offsets)[2],
+                                edge_partition.major_range_first() + (*segment_offsets)[3]});
+
+      // FIXME: we may further improve performance by 1) concurrently running kernels on different
+      // segments; 2) individually tuning block sizes for different segments; and 3) adding one
+      // more segment for very high degree vertices and running segmented reduction
+      static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
+      auto high_size = edge_partition_v_frontier_partition_offsets[1] -
+                       edge_partition_v_frontier_partition_offsets[0];
+      if (high_size > 0) {
+        raft::grid_1d_block_t update_grid(high_size,
+                                          detail::transform_v_frontier_e_kernel_block_size,
+                                          handle.get_device_properties().maxGridSize[0]);
+        detail::transform_v_frontier_e_high_degree<GraphViewType>
+          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+            edge_partition,
+            edge_partition_frontier_key_first,
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[0],
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[1],
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_e_mask,
+            edge_partition_frontier_local_degree_offsets,
+            e_op,
+            get_dataframe_buffer_begin(aggregate_value_buffer));
+      }
+      auto mid_size = edge_partition_v_frontier_partition_offsets[2] -
+                      edge_partition_v_frontier_partition_offsets[1];
+      if (mid_size > 0) {
+        raft::grid_1d_warp_t update_grid(mid_size,
+                                         detail::transform_v_frontier_e_kernel_block_size,
+                                         handle.get_device_properties().maxGridSize[0]);
+        detail::transform_v_frontier_e_mid_degree<GraphViewType>
+          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+            edge_partition,
+            edge_partition_frontier_key_first,
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[1],
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[2],
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_e_mask,
+            edge_partition_frontier_local_degree_offsets,
+            e_op,
+            get_dataframe_buffer_begin(aggregate_value_buffer));
+      }
+      auto low_size = edge_partition_v_frontier_partition_offsets[3] -
+                      edge_partition_v_frontier_partition_offsets[2];
+      if (low_size > 0) {
+        raft::grid_1d_thread_t update_grid(low_size,
+                                           detail::transform_v_frontier_e_kernel_block_size,
+                                           handle.get_device_properties().maxGridSize[0]);
+        detail::transform_v_frontier_e_hypersparse_or_low_degree<false, GraphViewType>
+          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+            edge_partition,
+            edge_partition_frontier_key_first,
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[2],
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[3],
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_e_mask,
+            edge_partition_frontier_local_degree_offsets,
+            e_op,
+            get_dataframe_buffer_begin(aggregate_value_buffer));
+      }
+      auto hypersparse_size = edge_partition_v_frontier_partition_offsets[4] -
+                              edge_partition_v_frontier_partition_offsets[3];
+      if (hypersparse_size > 0) {
+        raft::grid_1d_thread_t update_grid(hypersparse_size,
+                                           detail::transform_v_frontier_e_kernel_block_size,
+                                           handle.get_device_properties().maxGridSize[0]);
+        detail::transform_v_frontier_e_hypersparse_or_low_degree<true, GraphViewType>
+          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+            edge_partition,
+            edge_partition_frontier_key_first,
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[3],
+            edge_partition_key_indices.begin() + edge_partition_v_frontier_partition_offsets[4],
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_e_mask,
+            edge_partition_frontier_local_degree_offsets,
+            e_op,
+            get_dataframe_buffer_begin(aggregate_value_buffer));
+      }
+    } else {
+      raft::grid_1d_thread_t update_grid(local_frontier_sizes[i],
+                                         detail::transform_v_frontier_e_kernel_block_size,
+                                         handle.get_device_properties().maxGridSize[0]);
+
+      detail::transform_v_frontier_e_hypersparse_or_low_degree<false, GraphViewType>
+        <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+          edge_partition,
+          edge_partition_frontier_key_first,
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(local_frontier_sizes[i]),
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_e_mask,
+          edge_partition_frontier_local_degree_offsets,
+          e_op,
+          get_dataframe_buffer_begin(aggregate_value_buffer));
+    }
+  }
+
+  return std::make_tuple(std::move(aggregate_value_buffer),
+                         std::move(aggregate_local_frontier_local_degree_offsets));
+}
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 5240c49cb80..a004741f719 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include "prims/detail/sample_and_compute_local_nbr_indices.cuh"
 #include "prims/property_op_utils.cuh"
 
 #include <cugraph/edge_partition_device_view.cuh>
@@ -53,98 +54,19 @@ namespace cugraph {
 
 namespace detail {
 
-int32_t constexpr per_v_random_select_transform_outgoing_e_block_size = 256;
-
-size_t constexpr compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold =
-  packed_bools_per_word() *
-  size_t{4} /* tuning parameter */;  // minimum local degree to compute inclusive sums of valid
-                                     // local neighbors per word to accelerate finding n'th local
-                                     // neighbor vertex
-size_t constexpr compute_valid_local_nbr_count_inclusive_sum_mid_local_degree_threshold =
-  packed_bools_per_word() * static_cast<size_t>(raft::warp_size()) *
-  size_t{
-    4} /* tuning parameter */;  // minimum local degree to use a CUDA warp to compute inclusive sums
-size_t constexpr compute_valid_local_nbr_count_inclusive_sum_high_local_degree_threshold =
-  packed_bools_per_word() * per_v_random_select_transform_outgoing_e_block_size *
-  size_t{4} /* tuning parameter */;  // minimum local degree to use a CUDA block to compute
-                                     // inclusive sums
-
-template <typename edge_t>
-struct compute_local_degree_displacements_and_global_degree_t {
-  raft::device_span<edge_t const> gathered_local_degrees{};
-  raft::device_span<edge_t>
-    partitioned_local_degree_displacements{};  // one partition per gpu in the same minor_comm
-  raft::device_span<edge_t> global_degrees{};
-  int minor_comm_size{};
-
-  __device__ void operator()(size_t i) const
-  {
-    constexpr int buffer_size = 8;  // tuning parameter
-    edge_t displacements[buffer_size];
-    edge_t sum{0};
-    for (int round = 0; round < (minor_comm_size + buffer_size - 1) / buffer_size; ++round) {
-      auto loop_count = std::min(buffer_size, minor_comm_size - round * buffer_size);
-      for (int j = 0; j < loop_count; ++j) {
-        displacements[j] = sum;
-        sum += gathered_local_degrees[i + (round * buffer_size + j) * global_degrees.size()];
-      }
-      thrust::copy(
-        thrust::seq,
-        displacements,
-        displacements + loop_count,
-        partitioned_local_degree_displacements.begin() + i * minor_comm_size + round * buffer_size);
-    }
-    global_degrees[i] = sum;
-  }
-};
-
-// convert a (neighbor index, key index) pair  to a (minor_comm_rank, intra-partition offset,
-// neighbor index, key index) quadruplet, minor_comm_rank is set to -1 if an neighbor index is
-// invalid
-template <typename edge_t>
-struct convert_pair_to_quadruplet_t {
-  raft::device_span<edge_t const>
-    partitioned_local_degree_displacements{};  // one partition per gpu in the same minor_comm
-  raft::device_span<size_t> tx_counts{};
-  size_t stride{};
-  int minor_comm_size{};
-  edge_t invalid_idx{};
-
-  __device__ thrust::tuple<int, size_t, edge_t, size_t> operator()(
-    thrust::tuple<edge_t, size_t> index_pair) const
-  {
-    auto nbr_idx       = thrust::get<0>(index_pair);
-    auto key_idx       = thrust::get<1>(index_pair);
-    auto local_nbr_idx = nbr_idx;
-    int minor_comm_rank{-1};
-    size_t intra_partition_offset{};
-    if (nbr_idx != invalid_idx) {
-      auto displacement_first =
-        partitioned_local_degree_displacements.begin() + key_idx * minor_comm_size;
-      minor_comm_rank =
-        static_cast<int>(thrust::distance(
-          displacement_first,
-          thrust::upper_bound(
-            thrust::seq, displacement_first, displacement_first + minor_comm_size, nbr_idx))) -
-        1;
-      local_nbr_idx -= *(displacement_first + minor_comm_rank);
-      cuda::atomic_ref<size_t, cuda::thread_scope_device> counter(tx_counts[minor_comm_rank]);
-      intra_partition_offset = counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed);
-    }
-    return thrust::make_tuple(minor_comm_rank, intra_partition_offset, local_nbr_idx, key_idx);
-  }
-};
-
-struct shuffle_index_compute_offset_t {
-  raft::device_span<int const> minor_comm_ranks{};
-  raft::device_span<size_t const> intra_partition_displacements{};
-  raft::device_span<size_t const> tx_displacements{};
-
-  __device__ size_t operator()(size_t i) const
+template <typename GraphViewType,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename key_t>
+struct constant_e_bias_op_t {  // stateless functor: returns 1.0 for any input
+  __device__ float operator()(key_t,  // all five parameters are unnamed and unused
+                              typename GraphViewType::vertex_type,
+                              typename EdgeSrcValueInputWrapper::value_type,
+                              typename EdgeDstValueInputWrapper::value_type,
+                              typename EdgeValueInputWrapper::value_type) const
   {
-    auto minor_comm_rank = minor_comm_ranks[i];
-    assert(minor_comm_rank != -1);
-    return tx_displacements[minor_comm_rank] + intra_partition_displacements[i];
+    return 1.0;  // constant result, independent of the arguments
   }
 };
 
@@ -158,23 +80,12 @@ struct check_invalid_t {
   }
 };
 
-template <typename edge_t>
-struct invalid_minor_comm_rank_t {
-  int invalid_minor_comm_rank{};
-  __device__ bool operator()(thrust::tuple<edge_t, int, size_t> triplet) const
-  {
-    return thrust::get<1>(triplet) == invalid_minor_comm_rank;
-  }
-};
-
 template <typename GraphViewType,
           typename KeyIterator,
           typename LocalNbrIdxIterator,
-          typename OutputValueIterator,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
-          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename T>
 struct transform_local_nbr_indices_t {
@@ -186,28 +97,19 @@ struct transform_local_nbr_indices_t {
   thrust::optional<size_t const*> local_key_indices{thrust::nullopt};
   KeyIterator key_first{};
   LocalNbrIdxIterator local_nbr_idx_first{};
-  OutputValueIterator output_value_first{};
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input;
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input;
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input;
-  EdgePartitionEdgeMaskWrapper edge_partition_e_mask;
-  thrust::optional<thrust::tuple<raft::device_span<size_t const>, raft::device_span<edge_t const>>>
-    key_valid_local_nbr_count_inclusive_sums{};
   EdgeOp e_op{};
   edge_t invalid_idx{};
   thrust::optional<T> invalid_value{thrust::nullopt};
   size_t K{};
 
-  __device__ void operator()(size_t i) const
+  __device__ T operator()(size_t i) const
   {
-    auto key_idx = local_key_indices ? (*local_key_indices)[i] : (i / K);
-    auto key     = *(key_first + key_idx);
-    vertex_t major{};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      major = key;
-    } else {
-      major = thrust::get<0>(key);
-    }
+    auto key_idx      = local_key_indices ? (*local_key_indices)[i] : (i / K);
+    auto key          = *(key_first + key_idx);
+    auto major        = thrust_tuple_get_or_identity<key_t, 0>(key);
     auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{0};
@@ -230,31 +132,6 @@ struct transform_local_nbr_indices_t {
     auto local_nbr_idx = *(local_nbr_idx_first + i);
     if (local_nbr_idx != invalid_idx) {
       vertex_t minor{};
-      if (edge_partition_e_mask) {
-        if (local_degree < compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold) {
-          local_nbr_idx = find_nth_set_bits(
-            (*edge_partition_e_mask).value_first(), edge_offset, local_degree, local_nbr_idx + 1);
-        } else {
-          auto inclusive_sum_first =
-            thrust::get<1>(*key_valid_local_nbr_count_inclusive_sums).begin();
-          auto start_offset = thrust::get<0>(*key_valid_local_nbr_count_inclusive_sums)[key_idx];
-          auto end_offset = thrust::get<0>(*key_valid_local_nbr_count_inclusive_sums)[key_idx + 1];
-          auto word_idx   = static_cast<edge_t>(
-            thrust::distance(inclusive_sum_first + start_offset,
-                             thrust::upper_bound(thrust::seq,
-                                                 inclusive_sum_first + start_offset,
-                                                 inclusive_sum_first + end_offset,
-                                                 local_nbr_idx)));
-          local_nbr_idx = word_idx * packed_bools_per_word() +
-                          find_nth_set_bits(
-                            (*edge_partition_e_mask).value_first(),
-                            edge_offset + word_idx * packed_bools_per_word(),
-                            local_degree - word_idx * packed_bools_per_word(),
-                            (local_nbr_idx + 1) -
-                              ((word_idx > 0) ? *(inclusive_sum_first + start_offset + word_idx - 1)
-                                              : edge_t{0}));
-        }
-      }
       minor             = indices[local_nbr_idx];
       auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
 
@@ -271,14 +148,13 @@ struct transform_local_nbr_indices_t {
       }
       auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
       auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-      *(output_value_first + i) =
-        e_op(key_or_src,
-             key_or_dst,
-             edge_partition_src_value_input.get(src_offset),
-             edge_partition_dst_value_input.get(dst_offset),
-             edge_partition_e_value_input.get(edge_offset + local_nbr_idx));
+      return e_op(key_or_src,
+                  key_or_dst,
+                  edge_partition_src_value_input.get(src_offset),
+                  edge_partition_dst_value_input.get(dst_offset),
+                  edge_partition_e_value_input.get(edge_offset + local_nbr_idx));
     } else if (invalid_value) {
-      *(output_value_first + i) = *invalid_value;
+      return *invalid_value;
     }
   }
 };
@@ -327,630 +203,13 @@ struct return_value_compute_offset_t {
   }
 };
 
-template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ static void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
-  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
-  edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
-  raft::device_span<vertex_t const> edge_partition_frontier_majors,
-  raft::device_span<size_t const> inclusive_sum_offsets,
-  raft::device_span<size_t const> frontier_indices,
-  raft::device_span<edge_t> inclusive_sums)
-{
-  static_assert(per_v_random_select_transform_outgoing_e_block_size % raft::warp_size() == 0);
-
-  auto const tid     = threadIdx.x + blockIdx.x * blockDim.x;
-  auto const lane_id = tid % raft::warp_size();
-
-  auto idx = static_cast<size_t>(tid / raft::warp_size());
-
-  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
-  __shared__ typename WarpScan::TempStorage temp_storage;
-
-  while (idx < frontier_indices.size()) {
-    auto frontier_idx = frontier_indices[idx];
-    auto major        = edge_partition_frontier_majors[frontier_idx];
-    vertex_t major_idx{};
-    if constexpr (multi_gpu) {
-      major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
-    } else {
-      major_idx = edge_partition.major_offset_from_major_nocheck(major);
-    }
-    auto edge_offset  = edge_partition.local_offset(major_idx);
-    auto local_degree = edge_partition.local_degree(major_idx);
-
-    auto start_offset       = inclusive_sum_offsets[frontier_idx];
-    auto end_offset         = inclusive_sum_offsets[frontier_idx + 1];
-    auto num_inclusive_sums = end_offset - start_offset;
-    auto rounded_up_num_inclusive_sums =
-      ((num_inclusive_sums + raft::warp_size() - 1) / raft::warp_size()) * raft::warp_size();
-    edge_t sum{0};
-    for (size_t j = lane_id; j <= rounded_up_num_inclusive_sums; j += raft::warp_size()) {
-      auto inc =
-        (j < num_inclusive_sums)
-          ? static_cast<edge_t>(count_set_bits(
-              edge_partition_e_mask.value_first(),
-              edge_offset + packed_bools_per_word() * j,
-              cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j)))
-          : edge_t{0};
-      WarpScan(temp_storage).InclusiveSum(inc, inc);
-      inclusive_sums[start_offset + j] = sum + inc;
-      sum += __shfl_sync(raft::warp_full_mask(), inc, raft::warp_size() - 1);
-    }
-
-    idx += gridDim.x * (blockDim.x / raft::warp_size());
-  }
-}
-
-template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ static void compute_valid_local_nbr_inclusive_sums_high_local_degree(
-  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
-  edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
-  raft::device_span<vertex_t const> edge_partition_frontier_majors,
-  raft::device_span<size_t const> inclusive_sum_offsets,
-  raft::device_span<size_t const> frontier_indices,
-  raft::device_span<edge_t> inclusive_sums)
-{
-  static_assert(per_v_random_select_transform_outgoing_e_block_size % raft::warp_size() == 0);
-
-  auto idx = static_cast<size_t>(blockIdx.x);
-
-  using BlockScan = cub::BlockScan<edge_t, per_v_random_select_transform_outgoing_e_block_size>;
-  __shared__ typename BlockScan::TempStorage temp_storage;
-
-  __shared__ edge_t sum;
-
-  while (idx < frontier_indices.size()) {
-    auto frontier_idx = frontier_indices[idx];
-    auto major        = edge_partition_frontier_majors[frontier_idx];
-    vertex_t major_idx{};
-    if constexpr (multi_gpu) {
-      major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
-    } else {
-      major_idx = edge_partition.major_offset_from_major_nocheck(major);
-    }
-    auto edge_offset  = edge_partition.local_offset(major_idx);
-    auto local_degree = edge_partition.local_degree(major_idx);
-
-    auto start_offset       = inclusive_sum_offsets[frontier_idx];
-    auto end_offset         = inclusive_sum_offsets[frontier_idx + 1];
-    auto num_inclusive_sums = end_offset - start_offset;
-    auto rounded_up_num_inclusive_sums =
-      ((num_inclusive_sums + per_v_random_select_transform_outgoing_e_block_size - 1) /
-       per_v_random_select_transform_outgoing_e_block_size) *
-      per_v_random_select_transform_outgoing_e_block_size;
-    if (threadIdx.x == per_v_random_select_transform_outgoing_e_block_size - 1) { sum = 0; }
-    for (size_t j = threadIdx.x; j <= rounded_up_num_inclusive_sums; j += blockDim.x) {
-      auto inc =
-        (j < num_inclusive_sums)
-          ? static_cast<edge_t>(count_set_bits(
-              edge_partition_e_mask.value_first(),
-              edge_offset + packed_bools_per_word() * j,
-              cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j)))
-          : edge_t{0};
-      BlockScan(temp_storage).InclusiveSum(inc, inc);
-      inclusive_sums[start_offset + j] = sum + inc;
-      __syncthreads();
-      if (threadIdx.x == per_v_random_select_transform_outgoing_e_block_size - 1) { sum += inc; }
-    }
-
-    idx += gridDim.x;
-  }
-}
-
-template <typename vertex_t, typename edge_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<edge_t>>
-compute_valid_local_nbr_count_inclusive_sums(
-  raft::handle_t const& handle,
-  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> const& edge_partition,
-  edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> const&
-    edge_partition_e_mask,
-  raft::device_span<vertex_t const> edge_partition_frontier_majors)
-{
-  auto edge_partition_local_degrees =
-    edge_partition.compute_local_degrees(edge_partition_frontier_majors.begin(),
-                                         edge_partition_frontier_majors.end(),
-                                         handle.get_stream());
-  auto offsets =
-    rmm::device_uvector<size_t>(edge_partition_frontier_majors.size() + 1, handle.get_stream());
-  offsets.set_element_to_zero_async(0, handle.get_stream());
-  auto size_first = thrust::make_transform_iterator(
-    edge_partition_local_degrees.begin(),
-    cuda::proclaim_return_type<size_t>([] __device__(edge_t local_degree) {
-      return static_cast<size_t>((local_degree + packed_bools_per_word() - 1) /
-                                 packed_bools_per_word());
-    }));
-  thrust::inclusive_scan(handle.get_thrust_policy(),
-                         size_first,
-                         size_first + edge_partition_local_degrees.size(),
-                         offsets.begin() + 1);
-
-  rmm::device_uvector<size_t> frontier_indices(edge_partition_frontier_majors.size(),
-                                               handle.get_stream());
-  frontier_indices.resize(
-    thrust::distance(
-      frontier_indices.begin(),
-      thrust::copy_if(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator(size_t{0}),
-        thrust::make_counting_iterator(edge_partition_frontier_majors.size()),
-        frontier_indices.begin(),
-        [threshold     = compute_valid_local_nbr_count_inclusive_sum_local_degree_threshold,
-         local_degrees = raft::device_span<edge_t const>(
-           edge_partition_local_degrees.data(),
-           edge_partition_local_degrees.size())] __device__(size_t i) {
-          return local_degrees[i] >= threshold;
-        })),
-    handle.get_stream());
-
-  auto low_last = thrust::partition(
-    handle.get_thrust_policy(),
-    frontier_indices.begin(),
-    frontier_indices.end(),
-    [threshold = compute_valid_local_nbr_count_inclusive_sum_mid_local_degree_threshold,
-     local_degrees =
-       raft::device_span<edge_t const>(edge_partition_local_degrees.data(),
-                                       edge_partition_local_degrees.size())] __device__(size_t i) {
-      return local_degrees[i] < threshold;
-    });
-  auto mid_last = thrust::partition(
-    handle.get_thrust_policy(),
-    low_last,
-    frontier_indices.end(),
-    [threshold = compute_valid_local_nbr_count_inclusive_sum_high_local_degree_threshold,
-     local_degrees =
-       raft::device_span<edge_t const>(edge_partition_local_degrees.data(),
-                                       edge_partition_local_degrees.size())] __device__(size_t i) {
-      return local_degrees[i] < threshold;
-    });
-
-  rmm::device_uvector<edge_t> inclusive_sums(offsets.back_element(handle.get_stream()),
-                                             handle.get_stream());
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    frontier_indices.begin(),
-    low_last,
-    [edge_partition,
-     edge_partition_e_mask,
-     edge_partition_frontier_majors,
-     offsets        = raft::device_span<size_t const>(offsets.data(), offsets.size()),
-     inclusive_sums = raft::device_span<edge_t>(inclusive_sums.data(),
-                                                inclusive_sums.size())] __device__(size_t i) {
-      auto major = edge_partition_frontier_majors[i];
-      vertex_t major_idx{};
-      if constexpr (multi_gpu) {
-        major_idx = *(edge_partition.major_idx_from_major_nocheck(major));
-      } else {
-        major_idx = edge_partition.major_offset_from_major_nocheck(major);
-      }
-      auto edge_offset  = edge_partition.local_offset(major_idx);
-      auto local_degree = edge_partition.local_degree(major_idx);
-      edge_t sum{0};
-      auto start_offset = offsets[i];
-      auto end_offset   = offsets[i + 1];
-      for (size_t j = 0; j < end_offset - start_offset; ++j) {
-        sum += count_set_bits(
-          edge_partition_e_mask.value_first(),
-          edge_offset + packed_bools_per_word() * j,
-          cuda::std::min(packed_bools_per_word(), local_degree - packed_bools_per_word() * j));
-        inclusive_sums[start_offset + j] = sum;
-      }
-    });
-
-  if (thrust::distance(low_last, mid_last) > 0) {
-    raft::grid_1d_warp_t update_grid(thrust::distance(low_last, mid_last),
-                                     per_v_random_select_transform_outgoing_e_block_size,
-                                     handle.get_device_properties().maxGridSize[0]);
-    compute_valid_local_nbr_inclusive_sums_mid_local_degree<<<update_grid.num_blocks,
-                                                              update_grid.block_size,
-                                                              0,
-                                                              handle.get_stream()>>>(
-      edge_partition,
-      edge_partition_e_mask,
-      edge_partition_frontier_majors,
-      raft::device_span<size_t const>(offsets.data(), offsets.size()),
-      raft::device_span<size_t const>(low_last, thrust::distance(low_last, mid_last)),
-      raft::device_span<edge_t>(inclusive_sums.data(), inclusive_sums.size()));
-  }
-
-  if (thrust::distance(mid_last, frontier_indices.end()) > 0) {
-    raft::grid_1d_block_t update_grid(thrust::distance(mid_last, frontier_indices.end()),
-                                      per_v_random_select_transform_outgoing_e_block_size,
-                                      handle.get_device_properties().maxGridSize[0]);
-    compute_valid_local_nbr_inclusive_sums_high_local_degree<<<update_grid.num_blocks,
-                                                               update_grid.block_size,
-                                                               0,
-                                                               handle.get_stream()>>>(
-      edge_partition,
-      edge_partition_e_mask,
-      edge_partition_frontier_majors,
-      raft::device_span<size_t const>(offsets.data(), offsets.size()),
-      raft::device_span<size_t const>(mid_last, thrust::distance(mid_last, frontier_indices.end())),
-      raft::device_span<edge_t>(inclusive_sums.data(), inclusive_sums.size()));
-  }
-
-  return std::make_tuple(std::move(offsets), std::move(inclusive_sums));
-}
-
-template <typename edge_t>
-rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
-  raft::handle_t const& handle,
-  rmm::device_uvector<edge_t>&& frontier_degrees,
-  raft::random::RngState& rng_state,
-  size_t K)
-{
-#ifndef NO_CUGRAPH_OPS
-  edge_t mid_partition_degree_range_last = static_cast<edge_t>(K * 10);  // tuning parameter
-  assert(mid_partition_degree_range_last > K);
-  size_t high_partition_over_sampling_K = K * 2;  // tuning parameter
-  assert(high_partition_over_sampling_K > K);
-
-  rmm::device_uvector<edge_t> sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream());
-
-  rmm::device_uvector<size_t> seed_indices(frontier_degrees.size(), handle.get_stream());
-  thrust::sequence(handle.get_thrust_policy(), seed_indices.begin(), seed_indices.end(), size_t{0});
-  auto low_first =
-    thrust::make_zip_iterator(thrust::make_tuple(frontier_degrees.begin(), seed_indices.begin()));
-  auto mid_first = thrust::partition(
-    handle.get_thrust_policy(),
-    low_first,
-    low_first + frontier_degrees.size(),
-    [K] __device__(auto pair) { return thrust::get<0>(pair) <= static_cast<edge_t>(K); });
-  auto low_partition_size = static_cast<size_t>(thrust::distance(low_first, mid_first));
-  auto high_first =
-    thrust::partition(handle.get_thrust_policy(),
-                      mid_first,
-                      mid_first + (frontier_degrees.size() - low_partition_size),
-                      [mid_partition_degree_range_last] __device__(auto pair) {
-                        return thrust::get<0>(pair) < mid_partition_degree_range_last;
-                      });
-  auto mid_partition_size  = static_cast<size_t>(thrust::distance(mid_first, high_first));
-  auto high_partition_size = frontier_degrees.size() - (low_partition_size + mid_partition_size);
-
-  if (low_partition_size > 0) {
-    thrust::for_each(handle.get_thrust_policy(),
-                     thrust::make_counting_iterator(size_t{0}),
-                     thrust::make_counting_iterator(low_partition_size * K),
-                     [K,
-                      low_first,
-                      sample_nbr_indices = sample_nbr_indices.data(),
-                      invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
-                       auto pair       = *(low_first + (i / K));
-                       auto degree     = thrust::get<0>(pair);
-                       auto seed_idx   = thrust::get<1>(pair);
-                       auto sample_idx = static_cast<edge_t>(i % K);
-                       sample_nbr_indices[seed_idx * K + sample_idx] =
-                         (sample_idx < degree) ? sample_idx : invalid_idx;
-                     });
-  }
-
-  if (mid_partition_size > 0) {
-    rmm::device_uvector<edge_t> tmp_sample_nbr_indices(mid_partition_size * K, handle.get_stream());
-    // FIXME: we can avoid the follow-up copy if get_sampling_index takes output offsets for
-    // sampling output
-    cugraph::ops::graph::get_sampling_index(tmp_sample_nbr_indices.data(),
-                                            rng_state,
-                                            thrust::get<0>(mid_first.get_iterator_tuple()),
-                                            mid_partition_size,
-                                            static_cast<int32_t>(K),
-                                            false,
-                                            handle.get_stream());
-    thrust::for_each(handle.get_thrust_policy(),
-                     thrust::make_counting_iterator(size_t{0}),
-                     thrust::make_counting_iterator(mid_partition_size * K),
-                     [K,
-                      seed_index_first       = thrust::get<1>(mid_first.get_iterator_tuple()),
-                      tmp_sample_nbr_indices = tmp_sample_nbr_indices.data(),
-                      sample_nbr_indices     = sample_nbr_indices.data()] __device__(size_t i) {
-                       auto seed_idx                                 = *(seed_index_first + i / K);
-                       auto sample_idx                               = static_cast<edge_t>(i % K);
-                       sample_nbr_indices[seed_idx * K + sample_idx] = tmp_sample_nbr_indices[i];
-                     });
-  }
-
-  if (high_partition_size > 0) {
-    // to limit memory footprint ((1 << 20) is a tuning parameter), std::max for forward progress
-    // guarantee when high_partition_over_sampling_K is exorbitantly large
-    auto seeds_to_sort_per_iteration =
-      std::max(static_cast<size_t>(handle.get_device_properties().multiProcessorCount * (1 << 20)) /
-                 high_partition_over_sampling_K,
-               size_t{1});
-
-    rmm::device_uvector<edge_t> tmp_sample_nbr_indices(
-      seeds_to_sort_per_iteration * high_partition_over_sampling_K, handle.get_stream());
-    assert(high_partition_over_sampling_K * 2 <=
-           static_cast<size_t>(std::numeric_limits<int32_t>::max()));
-    rmm::device_uvector<int32_t> tmp_sample_indices(
-      seeds_to_sort_per_iteration * high_partition_over_sampling_K,
-      handle.get_stream());  // sample indices within a segment (one partition per seed)
-
-    rmm::device_uvector<edge_t> segment_sorted_tmp_sample_nbr_indices(
-      seeds_to_sort_per_iteration * high_partition_over_sampling_K, handle.get_stream());
-    rmm::device_uvector<int32_t> segment_sorted_tmp_sample_indices(
-      seeds_to_sort_per_iteration * high_partition_over_sampling_K, handle.get_stream());
-
-    rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
-    size_t tmp_storage_bytes{0};
-
-    auto num_chunks =
-      (high_partition_size + seeds_to_sort_per_iteration - 1) / seeds_to_sort_per_iteration;
-    for (size_t i = 0; i < num_chunks; ++i) {
-      size_t num_segments = std::min(seeds_to_sort_per_iteration,
-                                     high_partition_size - seeds_to_sort_per_iteration * i);
-
-      rmm::device_uvector<edge_t> unique_counts(num_segments, handle.get_stream());
-
-      std::optional<rmm::device_uvector<size_t>> retry_segment_indices{std::nullopt};
-      std::optional<rmm::device_uvector<edge_t>> retry_degrees{std::nullopt};
-      std::optional<rmm::device_uvector<edge_t>> retry_sample_nbr_indices{std::nullopt};
-      std::optional<rmm::device_uvector<int32_t>> retry_sample_indices{std::nullopt};
-      std::optional<rmm::device_uvector<edge_t>> retry_segment_sorted_sample_nbr_indices{
-        std::nullopt};
-      std::optional<rmm::device_uvector<int32_t>> retry_segment_sorted_sample_indices{std::nullopt};
-      while (true) {
-        auto segment_degree_first =
-          thrust::get<0>(high_first.get_iterator_tuple()) + seeds_to_sort_per_iteration * i;
-
-        if (retry_segment_indices) {
-          retry_degrees =
-            rmm::device_uvector<edge_t>((*retry_segment_indices).size(), handle.get_stream());
-          thrust::transform(
-            handle.get_thrust_policy(),
-            (*retry_segment_indices).begin(),
-            (*retry_segment_indices).end(),
-            (*retry_degrees).begin(),
-            indirection_t<size_t, decltype(segment_degree_first)>{segment_degree_first});
-          retry_sample_nbr_indices = rmm::device_uvector<edge_t>(
-            (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream());
-          retry_sample_indices = rmm::device_uvector<int32_t>(
-            (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream());
-          retry_segment_sorted_sample_nbr_indices = rmm::device_uvector<edge_t>(
-            (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream());
-          retry_segment_sorted_sample_indices = rmm::device_uvector<int32_t>(
-            (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream());
-        }
-
-        cugraph::ops::graph::get_sampling_index(
-          retry_segment_indices ? (*retry_sample_nbr_indices).data()
-                                : tmp_sample_nbr_indices.data(),
-          rng_state,
-          retry_segment_indices ? (*retry_degrees).begin() : segment_degree_first,
-          retry_segment_indices ? (*retry_degrees).size() : num_segments,
-          static_cast<int32_t>(high_partition_over_sampling_K),
-          true,
-          handle.get_stream());
-
-        if (retry_segment_indices) {
-          thrust::for_each(
-            handle.get_thrust_policy(),
-            thrust::make_counting_iterator(size_t{0}),
-            thrust::make_counting_iterator((*retry_segment_indices).size() *
-                                           high_partition_over_sampling_K),
-            [high_partition_over_sampling_K,
-             unique_counts                         = unique_counts.data(),
-             segment_sorted_tmp_sample_nbr_indices = segment_sorted_tmp_sample_nbr_indices.data(),
-             retry_segment_indices                 = (*retry_segment_indices).data(),
-             retry_sample_nbr_indices              = (*retry_sample_nbr_indices).data(),
-             retry_sample_indices = (*retry_sample_indices).data()] __device__(size_t i) {
-              auto segment_idx  = retry_segment_indices[i / high_partition_over_sampling_K];
-              auto sample_idx   = static_cast<edge_t>(i % high_partition_over_sampling_K);
-              auto unique_count = unique_counts[segment_idx];
-              auto output_first = thrust::make_zip_iterator(
-                thrust::make_tuple(retry_sample_nbr_indices, retry_sample_indices));
-              // sample index for the previously selected neighbor indices should be smaller than
-              // the new candidates to ensure that the previously selected neighbor indices will be
-              // selected again
-              if (sample_idx < unique_count) {
-                *(output_first + i) =
-                  thrust::make_tuple(segment_sorted_tmp_sample_nbr_indices
-                                       [segment_idx * high_partition_over_sampling_K + sample_idx],
-                                     static_cast<int32_t>(sample_idx));
-              } else {
-                *(output_first + i) =
-                  thrust::make_tuple(retry_sample_nbr_indices[i],
-                                     high_partition_over_sampling_K + (sample_idx - unique_count));
-              }
-            });
-        } else {
-          thrust::tabulate(
-            handle.get_thrust_policy(),
-            tmp_sample_indices.begin(),
-            tmp_sample_indices.begin() + num_segments * high_partition_over_sampling_K,
-            [high_partition_over_sampling_K] __device__(size_t i) {
-              return static_cast<int32_t>(i % high_partition_over_sampling_K);
-            });
-        }
-
-        // sort the (sample neighbor index, sample index) pairs (key: sample neighbor index)
-
-        cub::DeviceSegmentedSort::SortPairs(
-          static_cast<void*>(nullptr),
-          tmp_storage_bytes,
-          retry_segment_indices ? (*retry_sample_nbr_indices).data()
-                                : tmp_sample_nbr_indices.data(),
-          retry_segment_indices ? (*retry_segment_sorted_sample_nbr_indices).data()
-                                : segment_sorted_tmp_sample_nbr_indices.data(),
-          retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(),
-          retry_segment_indices ? (*retry_segment_sorted_sample_indices).data()
-                                : segment_sorted_tmp_sample_indices.data(),
-          (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) *
-            high_partition_over_sampling_K,
-          retry_segment_indices ? (*retry_segment_indices).size() : num_segments,
-          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                          multiplier_t<size_t>{high_partition_over_sampling_K}),
-          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
-                                          multiplier_t<size_t>{high_partition_over_sampling_K}),
-          handle.get_stream());
-        if (tmp_storage_bytes > d_tmp_storage.size()) {
-          d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
-        }
-        cub::DeviceSegmentedSort::SortPairs(
-          d_tmp_storage.data(),
-          tmp_storage_bytes,
-          retry_segment_indices ? (*retry_sample_nbr_indices).data()
-                                : tmp_sample_nbr_indices.data(),
-          retry_segment_indices ? (*retry_segment_sorted_sample_nbr_indices).data()
-                                : segment_sorted_tmp_sample_nbr_indices.data(),
-          retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(),
-          retry_segment_indices ? (*retry_segment_sorted_sample_indices).data()
-                                : segment_sorted_tmp_sample_indices.data(),
-          (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) *
-            high_partition_over_sampling_K,
-          retry_segment_indices ? (*retry_segment_indices).size() : num_segments,
-          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                          multiplier_t<size_t>{high_partition_over_sampling_K}),
-          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
-                                          multiplier_t<size_t>{high_partition_over_sampling_K}),
-          handle.get_stream());
-
-        // count the number of unique neighbor indices
-
-        if (retry_segment_indices) {
-          thrust::for_each(
-            handle.get_thrust_policy(),
-            thrust::make_counting_iterator(size_t{0}),
-            thrust::make_counting_iterator((*retry_segment_indices).size()),
-            [high_partition_over_sampling_K,
-             unique_counts                   = unique_counts.data(),
-             retry_segment_indices           = (*retry_segment_indices).data(),
-             retry_segment_sorted_pair_first = thrust::make_zip_iterator(
-               thrust::make_tuple((*retry_segment_sorted_sample_nbr_indices).begin(),
-                                  (*retry_segment_sorted_sample_indices).begin())),
-             segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple(
-               segment_sorted_tmp_sample_nbr_indices.begin(),
-               segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) {
-              auto unique_count          = static_cast<edge_t>(thrust::distance(
-                retry_segment_sorted_pair_first + high_partition_over_sampling_K * i,
-                thrust::unique(
-                  thrust::seq,
-                  retry_segment_sorted_pair_first + high_partition_over_sampling_K * i,
-                  retry_segment_sorted_pair_first + high_partition_over_sampling_K * (i + 1),
-                  [] __device__(auto lhs, auto rhs) {
-                    return thrust::get<0>(lhs) == thrust::get<0>(rhs);
-                  })));
-              auto segment_idx           = retry_segment_indices[i];
-              unique_counts[segment_idx] = unique_count;
-              thrust::copy(
-                thrust::seq,
-                retry_segment_sorted_pair_first + high_partition_over_sampling_K * i,
-                retry_segment_sorted_pair_first + high_partition_over_sampling_K * i + unique_count,
-                segment_sorted_pair_first + high_partition_over_sampling_K * segment_idx);
-            });
-        } else {
-          thrust::tabulate(
-            handle.get_thrust_policy(),
-            unique_counts.begin(),
-            unique_counts.end(),
-            [high_partition_over_sampling_K,
-             segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple(
-               segment_sorted_tmp_sample_nbr_indices.begin(),
-               segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) {
-              return static_cast<edge_t>(thrust::distance(
-                segment_sorted_pair_first + high_partition_over_sampling_K * i,
-                thrust::unique(thrust::seq,
-                               segment_sorted_pair_first + high_partition_over_sampling_K * i,
-                               segment_sorted_pair_first + high_partition_over_sampling_K * (i + 1),
-                               [] __device__(auto lhs, auto rhs) {
-                                 return thrust::get<0>(lhs) == thrust::get<0>(rhs);
-                               })));
-            });
-        }
-
-        auto num_retry_segments =
-          thrust::count_if(handle.get_thrust_policy(),
-                           unique_counts.begin(),
-                           unique_counts.end(),
-                           [K] __device__(auto count) { return count < K; });
-        if (num_retry_segments > 0) {
-          retry_segment_indices =
-            rmm::device_uvector<size_t>(num_retry_segments, handle.get_stream());
-          thrust::copy_if(handle.get_thrust_policy(),
-                          thrust::make_counting_iterator(size_t{0}),
-                          thrust::make_counting_iterator(num_segments),
-                          (*retry_segment_indices).begin(),
-                          [K, unique_counts = unique_counts.data()] __device__(size_t i) {
-                            return unique_counts[i] < K;
-                          });
-        } else {
-          break;
-        }
-      }
-
-      // sort the segment-sorted (sample index, sample neighbor index) pairs (key: sample index)
-
-      cub::DeviceSegmentedSort::SortPairs(
-        static_cast<void*>(nullptr),
-        tmp_storage_bytes,
-        segment_sorted_tmp_sample_indices.data(),
-        tmp_sample_indices.data(),
-        segment_sorted_tmp_sample_nbr_indices.data(),
-        tmp_sample_nbr_indices.data(),
-        num_segments * high_partition_over_sampling_K,
-        num_segments,
-        thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                        multiplier_t<size_t>{high_partition_over_sampling_K}),
-        thrust::make_transform_iterator(
-          thrust::make_counting_iterator(size_t{0}),
-          cuda::proclaim_return_type<size_t>(
-            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
-        handle.get_stream());
-      if (tmp_storage_bytes > d_tmp_storage.size()) {
-        d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
-      }
-      cub::DeviceSegmentedSort::SortPairs(
-        d_tmp_storage.data(),
-        tmp_storage_bytes,
-        segment_sorted_tmp_sample_indices.data(),
-        tmp_sample_indices.data(),
-        segment_sorted_tmp_sample_nbr_indices.data(),
-        tmp_sample_nbr_indices.data(),
-        num_segments * high_partition_over_sampling_K,
-        num_segments,
-        thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                        multiplier_t<size_t>{high_partition_over_sampling_K}),
-        thrust::make_transform_iterator(
-          thrust::make_counting_iterator(size_t{0}),
-          cuda::proclaim_return_type<size_t>(
-            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
-        handle.get_stream());
-
-      // copy the neighbor indices back to sample_nbr_indices
-
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator(size_t{0}),
-        thrust::make_counting_iterator(num_segments * K),
-        [K,
-         high_partition_over_sampling_K,
-         seed_indices =
-           thrust::get<1>(high_first.get_iterator_tuple()) + seeds_to_sort_per_iteration * i,
-         tmp_sample_nbr_indices = tmp_sample_nbr_indices.data(),
-         sample_nbr_indices     = sample_nbr_indices.data()] __device__(size_t i) {
-          auto seed_idx   = *(seed_indices + i / K);
-          auto sample_idx = static_cast<edge_t>(i % K);
-          *(sample_nbr_indices + seed_idx * K + sample_idx) =
-            *(tmp_sample_nbr_indices + (i / K) * high_partition_over_sampling_K + sample_idx);
-        });
-    }
-  }
-
-  frontier_degrees.resize(0, handle.get_stream());
-  frontier_degrees.shrink_to_fit(handle.get_stream());
-
-  return sample_nbr_indices;
-#else
-  CUGRAPH_FAIL("unimplemented.");
-#endif
-}
-
 template <bool incoming,
           typename GraphViewType,
           typename VertexFrontierBucketType,
           typename EdgeSrcValueInputWrapper,
           typename EdgeDstValueInputWrapper,
           typename EdgeValueInputWrapper,
+          typename EdgeBiasOp,
           typename EdgeOp,
           typename T>
 std::tuple<std::optional<rmm::device_uvector<size_t>>,
@@ -961,6 +220,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                 EdgeSrcValueInputWrapper edge_src_value_input,
                                 EdgeDstValueInputWrapper edge_dst_value_input,
                                 EdgeValueInputWrapper edge_value_input,
+                                EdgeBiasOp e_bias_op,
                                 EdgeOp e_op,
                                 raft::random::RngState& rng_state,
                                 size_t K,
@@ -1017,18 +277,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     GraphViewType::is_multi_gpu
       ? handle.get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size()
       : int{1};
+  assert(graph_view.number_of_local_edge_partitions() == minor_comm_size);
 
   if (do_expensive_check) {
     // FIXME: better re-factor this check function?
-    vertex_t const* frontier_vertex_first{nullptr};
-    vertex_t const* frontier_vertex_last{nullptr};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      frontier_vertex_first = frontier.begin();
-      frontier_vertex_last  = frontier.end();
-    } else {
-      frontier_vertex_first = thrust::get<0>(frontier.begin().get_iterator_tuple());
-      frontier_vertex_last  = thrust::get<0>(frontier.end().get_iterator_tuple());
-    }
+    auto frontier_vertex_first =
+      thrust_tuple_get_or_identity<decltype(frontier.begin()), 0>(frontier.begin());
+    auto frontier_vertex_last =
+      thrust_tuple_get_or_identity<decltype(frontier.end()), 0>(frontier.end());
     auto num_invalid_keys =
       frontier.size() -
       thrust::count_if(handle.get_thrust_policy(),
@@ -1044,19 +300,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                     "Invalid input argument: frontier includes out-of-range keys.");
   }
 
-  auto frontier_key_first = frontier.begin();
-  auto frontier_key_last  = frontier.end();
-
   std::vector<size_t> local_frontier_sizes{};
   if (minor_comm_size > 1) {
     auto& minor_comm     = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-    local_frontier_sizes = host_scalar_allgather(
-      minor_comm,
-      static_cast<size_t>(thrust::distance(frontier_key_first, frontier_key_last)),
-      handle.get_stream());
+    local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream());
   } else {
-    local_frontier_sizes = std::vector<size_t>{static_cast<size_t>(
-      static_cast<vertex_t>(thrust::distance(frontier_key_first, frontier_key_last)))};
+    local_frontier_sizes = std::vector<size_t>{frontier.size()};
   }
   std::vector<size_t> local_frontier_displacements(local_frontier_sizes.size());
   std::exclusive_scan(local_frontier_sizes.begin(),
@@ -1066,7 +315,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
 
   // 1. aggregate frontier
 
-  auto aggregate_local_frontier_keys =
+  auto aggregate_local_frontier =
     (minor_comm_size > 1)
       ? std::make_optional<key_buffer_t>(
           local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
@@ -1074,257 +323,78 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   if (minor_comm_size > 1) {
     auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     device_allgatherv(minor_comm,
-                      frontier_key_first,
-                      get_dataframe_buffer_begin(*aggregate_local_frontier_keys),
+                      frontier.begin(),
+                      get_dataframe_buffer_begin(*aggregate_local_frontier),
                       local_frontier_sizes,
                       local_frontier_displacements,
                       handle.get_stream());
   }
 
-  // 2. compute degrees
-
-  auto edge_mask_view = graph_view.edge_mask_view();
-
-  auto aggregate_local_frontier_local_degrees =
-    (minor_comm_size > 1)
-      ? std::make_optional<rmm::device_uvector<edge_t>>(
-          local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
-      : std::nullopt;
-  rmm::device_uvector<edge_t> frontier_degrees(frontier.size(), handle.get_stream());
-
-  std::optional<std::vector<std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<edge_t>>>>
-    local_frontier_valid_local_nbr_count_inclusive_sums{};  // to avoid searching the entire
-                                                            // neighbor list K times for high degree
-                                                            // vertices with edge masking
-  if (edge_mask_view) {
-    local_frontier_valid_local_nbr_count_inclusive_sums =
-      std::vector<std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<edge_t>>>{};
-    (*local_frontier_valid_local_nbr_count_inclusive_sums)
-      .reserve(graph_view.number_of_local_edge_partitions());
-  }
-
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition =
-      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
-        graph_view.local_edge_partition_view(i));
-    auto edge_partition_e_mask =
-      edge_mask_view
-        ? thrust::make_optional<
-            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
-            *edge_mask_view, i)
-        : thrust::nullopt;
-
-    vertex_t const* edge_partition_frontier_major_first{nullptr};
-
-    auto edge_partition_frontier_key_first =
-      ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier_keys)
-                             : frontier_key_first) +
-      local_frontier_displacements[i];
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      edge_partition_frontier_major_first = edge_partition_frontier_key_first;
-    } else {
-      edge_partition_frontier_major_first = thrust::get<0>(edge_partition_frontier_key_first);
-    }
-
-    auto edge_partition_frontier_local_degrees =
-      edge_partition_e_mask ? edge_partition.compute_local_degrees_with_mask(
-                                (*edge_partition_e_mask).value_first(),
-                                edge_partition_frontier_major_first,
-                                edge_partition_frontier_major_first + local_frontier_sizes[i],
-                                handle.get_stream())
-                            : edge_partition.compute_local_degrees(
-                                edge_partition_frontier_major_first,
-                                edge_partition_frontier_major_first + local_frontier_sizes[i],
-                                handle.get_stream());
-
-    if (minor_comm_size > 1) {
-      // FIXME: this copy is unnecessary if edge_partition.compute_local_degrees() takes a pointer
-      // to the output array
-      thrust::copy(
-        handle.get_thrust_policy(),
-        edge_partition_frontier_local_degrees.begin(),
-        edge_partition_frontier_local_degrees.end(),
-        (*aggregate_local_frontier_local_degrees).begin() + local_frontier_displacements[i]);
-    } else {
-      frontier_degrees = std::move(edge_partition_frontier_local_degrees);
-    }
-
-    if (edge_partition_e_mask) {
-      (*local_frontier_valid_local_nbr_count_inclusive_sums)
-        .push_back(compute_valid_local_nbr_count_inclusive_sums(
-          handle,
-          edge_partition,
-          *edge_partition_e_mask,
-          raft::device_span<vertex_t const>(edge_partition_frontier_major_first,
-                                            local_frontier_sizes[i])));
-    }
-  }
-
-  auto frontier_partitioned_local_degree_displacements =
-    (minor_comm_size > 1)
-      ? std::make_optional<rmm::device_uvector<edge_t>>(size_t{0}, handle.get_stream())
-      : std::nullopt;  // one partition per gpu in the same minor_comm
-  if (minor_comm_size > 1) {
-    auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-
-    rmm::device_uvector<edge_t> frontier_gathered_local_degrees(0, handle.get_stream());
-    std::tie(frontier_gathered_local_degrees, std::ignore) =
-      shuffle_values(minor_comm,
-                     (*aggregate_local_frontier_local_degrees).begin(),
-                     local_frontier_sizes,
-                     handle.get_stream());
-    aggregate_local_frontier_local_degrees = std::nullopt;
-    frontier_partitioned_local_degree_displacements =
-      rmm::device_uvector<edge_t>(frontier_degrees.size() * minor_comm_size, handle.get_stream());
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      thrust::make_counting_iterator(size_t{0}),
-      thrust::make_counting_iterator(frontier_degrees.size()),
-      compute_local_degree_displacements_and_global_degree_t<edge_t>{
-        raft::device_span<edge_t const>(frontier_gathered_local_degrees.data(),
-                                        frontier_gathered_local_degrees.size()),
-        raft::device_span<edge_t>((*frontier_partitioned_local_degree_displacements).data(),
-                                  (*frontier_partitioned_local_degree_displacements).size()),
-        raft::device_span<edge_t>(frontier_degrees.data(), frontier_degrees.size()),
-        minor_comm_size});
-  }
-
-  // 3. randomly select neighbor indices
-
-  rmm::device_uvector<edge_t> sample_nbr_indices(0, handle.get_stream());
-  if (with_replacement) {
-    if (frontier_degrees.size() > 0) {
-      sample_nbr_indices.resize(frontier.size() * K, handle.get_stream());
-      cugraph::ops::graph::get_sampling_index(sample_nbr_indices.data(),
-                                              rng_state,
-                                              frontier_degrees.data(),
-                                              static_cast<edge_t>(frontier_degrees.size()),
-                                              static_cast<int32_t>(K),
-                                              with_replacement,
-                                              handle.get_stream());
-      frontier_degrees.resize(0, handle.get_stream());
-      frontier_degrees.shrink_to_fit(handle.get_stream());
-    }
+  // 2. randomly select neighbor indices and compute local neighbor indices for every local edge
+  // partition
+
+  rmm::device_uvector<edge_t> sample_local_nbr_indices(0, handle.get_stream());
+  std::optional<rmm::device_uvector<size_t>> sample_key_indices{std::nullopt};
+  std::vector<size_t> local_frontier_sample_offsets{};
+  if constexpr (std::is_same_v<EdgeBiasOp,
+                               constant_e_bias_op_t<GraphViewType,
+                                                    EdgeSrcValueInputWrapper,
+                                                    EdgeDstValueInputWrapper,
+                                                    EdgeValueInputWrapper,
+                                                    key_t>>) {
+    std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) =
+      uniform_sample_and_compute_local_nbr_indices(
+        handle,
+        graph_view,
+        (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier)
+                              : frontier.begin(),
+        local_frontier_displacements,
+        local_frontier_sizes,
+        rng_state,
+        K,
+        with_replacement);
   } else {
-    sample_nbr_indices =
-      get_sampling_index_without_replacement(handle, std::move(frontier_degrees), rng_state, K);
+    std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) =
+      biased_sample_and_compute_local_nbr_indices(
+        handle,
+        graph_view,
+        (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier)
+                              : frontier.begin(),
+        edge_src_value_input,
+        edge_dst_value_input,
+        edge_value_input,
+        e_bias_op,
+        local_frontier_displacements,
+        local_frontier_sizes,
+        rng_state,
+        K,
+        with_replacement,
+        do_expensive_check);
   }
 
-  // 4. shuffle randomly selected indices
-
-  auto sample_local_nbr_indices = std::move(
-    sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
-                          // neighbors are distributed in minor_comm_size partitions)
-  std::optional<rmm::device_uvector<size_t>> sample_key_indices{
-    std::nullopt};  // relevant only when (minor_comm_size > 1)
-  auto local_frontier_sample_counts        = std::vector<size_t>{};
-  auto local_frontier_sample_displacements = std::vector<size_t>{};
-  if (minor_comm_size > 1) {
-    auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-
-    sample_key_indices =
-      rmm::device_uvector<size_t>(sample_local_nbr_indices.size(), handle.get_stream());
-    auto minor_comm_ranks =
-      rmm::device_uvector<int>(sample_local_nbr_indices.size(), handle.get_stream());
-    auto intra_partition_displacements =
-      rmm::device_uvector<size_t>(sample_local_nbr_indices.size(), handle.get_stream());
-    rmm::device_uvector<size_t> d_tx_counts(minor_comm_size, handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), size_t{0});
-    auto input_pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_local_nbr_indices.begin(),
-                         thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                                         divider_t<size_t>{K})));
-    thrust::transform(
-      handle.get_thrust_policy(),
-      input_pair_first,
-      input_pair_first + sample_local_nbr_indices.size(),
-      thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(),
-                                                   intra_partition_displacements.begin(),
-                                                   sample_local_nbr_indices.begin(),
-                                                   (*sample_key_indices).begin())),
-      convert_pair_to_quadruplet_t<edge_t>{
-        raft::device_span<edge_t const>((*frontier_partitioned_local_degree_displacements).data(),
-                                        (*frontier_partitioned_local_degree_displacements).size()),
-        raft::device_span<size_t>(d_tx_counts.data(), d_tx_counts.size()),
-        frontier.size(),
-        minor_comm_size,
-        cugraph::ops::graph::INVALID_ID<edge_t>});
-    rmm::device_uvector<size_t> tx_displacements(minor_comm_size, handle.get_stream());
-    thrust::exclusive_scan(
-      handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin());
-    auto tmp_sample_local_nbr_indices =
-      rmm::device_uvector<edge_t>(tx_displacements.back_element(handle.get_stream()) +
-                                    d_tx_counts.back_element(handle.get_stream()),
-                                  handle.get_stream());
-    auto tmp_sample_key_indices =
-      rmm::device_uvector<size_t>(tmp_sample_local_nbr_indices.size(), handle.get_stream());
-    auto pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_local_nbr_indices.begin(), (*sample_key_indices).begin()));
-    thrust::scatter_if(
-      handle.get_thrust_policy(),
-      pair_first,
-      pair_first + sample_local_nbr_indices.size(),
-      thrust::make_transform_iterator(
-        thrust::make_counting_iterator(size_t{0}),
-        shuffle_index_compute_offset_t{
-          raft::device_span<int const>(minor_comm_ranks.data(), minor_comm_ranks.size()),
-          raft::device_span<size_t const>(intra_partition_displacements.data(),
-                                          intra_partition_displacements.size()),
-          raft::device_span<size_t const>(tx_displacements.data(), tx_displacements.size())}),
-      minor_comm_ranks.begin(),
-      thrust::make_zip_iterator(
-        thrust::make_tuple(tmp_sample_local_nbr_indices.begin(), tmp_sample_key_indices.begin())),
-      is_not_equal_t<int>{-1});
-
-    sample_local_nbr_indices = std::move(tmp_sample_local_nbr_indices);
-    sample_key_indices       = std::move(tmp_sample_key_indices);
-
-    std::vector<size_t> h_tx_counts(d_tx_counts.size());
-    raft::update_host(
-      h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream());
-    handle.sync_stream();
-
-    pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_local_nbr_indices.begin(), (*sample_key_indices).begin()));
-    auto [rx_value_buffer, rx_counts] =
-      shuffle_values(minor_comm, pair_first, h_tx_counts, handle.get_stream());
-
-    sample_local_nbr_indices            = std::move(std::get<0>(rx_value_buffer));
-    sample_key_indices                  = std::move(std::get<1>(rx_value_buffer));
-    local_frontier_sample_displacements = std::vector<size_t>(rx_counts.size());
-    std::exclusive_scan(
-      rx_counts.begin(), rx_counts.end(), local_frontier_sample_displacements.begin(), size_t{0});
-    local_frontier_sample_counts = std::move(rx_counts);
-  } else {
-    local_frontier_sample_counts.push_back(frontier.size() * K);
-    local_frontier_sample_displacements.push_back(size_t{0});
-  }
+  std::vector<size_t> local_frontier_sample_counts(minor_comm_size);
+  std::adjacent_difference(local_frontier_sample_offsets.begin() + 1,
+                           local_frontier_sample_offsets.end(),
+                           local_frontier_sample_counts.begin());
 
-  // 5. transform
+  // 3. transform
 
-  auto sample_e_op_results = allocate_dataframe_buffer<T>(
-    local_frontier_sample_displacements.back() + local_frontier_sample_counts.back(),
-    handle.get_stream());
+  auto sample_e_op_results =
+    allocate_dataframe_buffer<T>(local_frontier_sample_offsets.back(), handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
-    auto edge_partition_e_mask =
-      edge_mask_view
-        ? thrust::make_optional<
-            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
-            *edge_mask_view, i)
-        : thrust::nullopt;
 
     auto edge_partition_frontier_key_first =
-      ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier_keys)
-                             : frontier_key_first) +
+      ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier)
+                             : frontier.begin()) +
       local_frontier_displacements[i];
     auto edge_partition_sample_local_nbr_index_first =
-      sample_local_nbr_indices.begin() + local_frontier_sample_displacements[i];
+      sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i];
 
     auto edge_partition_sample_e_op_result_first =
-      get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_displacements[i];
+      get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i];
 
     edge_partition_src_input_device_view_t edge_partition_src_value_input{};
     edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
@@ -1339,87 +409,62 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     }
     auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i);
 
-    if (minor_comm_size > 1) {
+    if (sample_key_indices) {
       auto edge_partition_sample_key_index_first =
-        (*sample_key_indices).begin() + local_frontier_sample_displacements[i];
-      thrust::for_each(
+        (*sample_key_indices).begin() + local_frontier_sample_offsets[i];
+      thrust::transform(
         handle.get_thrust_policy(),
         thrust::make_counting_iterator(size_t{0}),
         thrust::make_counting_iterator(local_frontier_sample_counts[i]),
+        edge_partition_sample_e_op_result_first,
         transform_local_nbr_indices_t<GraphViewType,
                                       decltype(edge_partition_frontier_key_first),
                                       decltype(edge_partition_sample_local_nbr_index_first),
-                                      decltype(edge_partition_sample_e_op_result_first),
                                       edge_partition_src_input_device_view_t,
                                       edge_partition_dst_input_device_view_t,
                                       edge_partition_e_input_device_view_t,
-                                      decltype(edge_partition_e_mask),
                                       EdgeOp,
                                       T>{
           edge_partition,
           thrust::make_optional(edge_partition_sample_key_index_first),
           edge_partition_frontier_key_first,
           edge_partition_sample_local_nbr_index_first,
-          edge_partition_sample_e_op_result_first,
           edge_partition_src_value_input,
           edge_partition_dst_value_input,
           edge_partition_e_value_input,
-          edge_partition_e_mask,
-          local_frontier_valid_local_nbr_count_inclusive_sums
-            ? thrust::make_optional(thrust::make_tuple(
-                raft::device_span<size_t const>(
-                  std::get<0>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).data(),
-                  std::get<0>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).size()),
-                raft::device_span<edge_t const>(
-                  std::get<1>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).data(),
-                  std::get<1>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).size())))
-            : thrust::nullopt,
           e_op,
           cugraph::ops::graph::INVALID_ID<edge_t>,
           to_thrust_optional(invalid_value),
           K});
     } else {
-      thrust::for_each(
+      thrust::transform(
         handle.get_thrust_policy(),
         thrust::make_counting_iterator(size_t{0}),
         thrust::make_counting_iterator(frontier.size() * K),
+        edge_partition_sample_e_op_result_first,
         transform_local_nbr_indices_t<GraphViewType,
                                       decltype(edge_partition_frontier_key_first),
                                       decltype(edge_partition_sample_local_nbr_index_first),
-                                      decltype(edge_partition_sample_e_op_result_first),
                                       edge_partition_src_input_device_view_t,
                                       edge_partition_dst_input_device_view_t,
                                       edge_partition_e_input_device_view_t,
-                                      decltype(edge_partition_e_mask),
                                       EdgeOp,
-                                      T>{
-          edge_partition,
-          thrust::nullopt,
-          edge_partition_frontier_key_first,
-          edge_partition_sample_local_nbr_index_first,
-          edge_partition_sample_e_op_result_first,
-          edge_partition_src_value_input,
-          edge_partition_dst_value_input,
-          edge_partition_e_value_input,
-          edge_partition_e_mask,
-          local_frontier_valid_local_nbr_count_inclusive_sums
-            ? thrust::make_optional(thrust::make_tuple(
-                raft::device_span<size_t const>(
-                  std::get<0>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).data(),
-                  std::get<0>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).size()),
-                raft::device_span<edge_t const>(
-                  std::get<1>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).data(),
-                  std::get<1>((*local_frontier_valid_local_nbr_count_inclusive_sums)[i]).size())))
-            : thrust::nullopt,
-          e_op,
-          cugraph::ops::graph::INVALID_ID<edge_t>,
-          to_thrust_optional(invalid_value),
-          K});
+                                      T>{edge_partition,
+                                         thrust::nullopt,
+                                         edge_partition_frontier_key_first,
+                                         edge_partition_sample_local_nbr_index_first,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         cugraph::ops::graph::INVALID_ID<edge_t>,
+                                         to_thrust_optional(invalid_value),
+                                         K});
     }
   }
-  aggregate_local_frontier_keys = std::nullopt;
+  aggregate_local_frontier = std::nullopt;
 
-  // 6. shuffle randomly selected & transformed results and update sample_offsets
+  // 4. shuffle randomly selected & transformed results and update sample_offsets
 
   auto sample_offsets = invalid_value ? std::nullopt
                                       : std::make_optional<rmm::device_uvector<size_t>>(
@@ -1542,6 +587,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   return std::make_tuple(std::move(sample_offsets), std::move(sample_e_op_results));
 #else
   CUGRAPH_FAIL("unimplemented.");
+  return std::make_tuple(std::nullopt,
+                         allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
 #endif
 }
 
@@ -1579,12 +626,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
  * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
  * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
  * access edge property values).
- * @param e_bias_op Quinary operator takes edge source, edge destination, property values for the
- * source, destination, and edge and returns a floating point bias value to be used in biased random
- * selection.
- * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
- * destination, and edge and returns a value to be collected in the output. This function is called
- * only for the selected edges.
+ * @param e_bias_op Quinary operator takes (tagged-)edge source, edge destination, property values
+ * for the source, destination, and edge and returns a floating point bias value to be used in
+ * biased random selection. The return value should be non-negative. The bias value of 0 indicates
+ * that the corresponding edge cannot be selected. Assuming that the return value type is bias_t,
+ * the sum of the bias values for any seed vertex should not exceed
+ * std::numeric_limits<bias_t>::max().
+ * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for
+ * the source, destination, and edge and returns a value to be collected in the output. This
+ * function is called only for the selected edges.
  * @param K Number of outgoing edges to select per (tagged-)vertex.
  * @param with_replacement A flag to specify whether a single outgoing edge can be selected multiple
  * times (if @p with_replacement = true) or can be selected only once (if @p with_replacement =
@@ -1616,7 +666,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          VertexFrontierBucketType const& frontier,
                                          EdgeSrcValueInputWrapper edge_src_value_input,
                                          EdgeDstValueInputWrapper edge_dst_value_input,
-                                         EdgeValueInputWrapper egde_value_input,
+                                         EdgeValueInputWrapper edge_value_input,
                                          EdgeBiasOp e_bias_op,
                                          EdgeOp e_op,
                                          raft::random::RngState& rng_state,
@@ -1625,10 +675,19 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
-  CUGRAPH_FAIL("unimplemented.");
-
-  return std::make_tuple(std::nullopt,
-                         allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
+  return detail::per_v_random_select_transform_e<false>(handle,
+                                                        graph_view,
+                                                        frontier,
+                                                        edge_src_value_input,
+                                                        edge_dst_value_input,
+                                                        edge_value_input,
+                                                        e_bias_op,
+                                                        e_op,
+                                                        rng_state,
+                                                        K,
+                                                        with_replacement,
+                                                        invalid_value,
+                                                        do_expensive_check);
 }
 
 /**
@@ -1664,9 +723,9 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
  * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
  * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
  * access edge property values).
- * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
- * destination, and edge and returns a value to be collected in the output. This function is called
- * only for the selected edges.
+ * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for
+ * the source, destination, and edge and returns a value to be collected in the output. This
+ * function is called only for the selected edges.
  * @param K Number of outgoing edges to select per (tagged-)vertex.
  * @param with_replacement A flag to specify whether a single outgoing edge can be selected multiple
  * times (if @p with_replacement = true) or can be selected only once (if @p with_replacement =
@@ -1705,18 +764,24 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
-  return detail::per_v_random_select_transform_e<false>(handle,
-                                                        graph_view,
-                                                        frontier,
-                                                        edge_src_value_input,
-                                                        edge_dst_value_input,
-                                                        edge_value_input,
-                                                        e_op,
-                                                        rng_state,
-                                                        K,
-                                                        with_replacement,
-                                                        invalid_value,
-                                                        do_expensive_check);
+  return detail::per_v_random_select_transform_e<false>(
+    handle,
+    graph_view,
+    frontier,
+    edge_src_value_input,
+    edge_dst_value_input,
+    edge_value_input,
+    detail::constant_e_bias_op_t<GraphViewType,
+                                 EdgeSrcValueInputWrapper,
+                                 EdgeDstValueInputWrapper,
+                                 EdgeValueInputWrapper,
+                                 typename VertexFrontierBucketType::key_type>{},
+    e_op,
+    rng_state,
+    K,
+    with_replacement,
+    invalid_value,
+    do_expensive_check);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 2fb87527c62..3abce6f8bd5 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -56,7 +56,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionSrcDstKeyInputWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__device__ void update_buffer_element(
+__device__ void transform_reduce_e_by_src_dst_key_update_buffer_element(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu>& edge_partition,
@@ -140,7 +140,8 @@ __global__ static void transform_reduce_by_src_dst_key_hypersparse(
       edge_t counter{0};
       for (edge_t i = 0; i < local_degree; ++i) {
         if ((*edge_partition_e_mask).get(edge_offset + i)) {
-          update_buffer_element<edge_partition_src_key, GraphViewType>(
+          transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                  GraphViewType>(
             edge_partition,
             major,
             indices[i],
@@ -157,7 +158,8 @@ __global__ static void transform_reduce_by_src_dst_key_hypersparse(
       }
     } else {
       for (edge_t i = 0; i < local_degree; ++i) {
-        update_buffer_element<edge_partition_src_key, GraphViewType>(
+        transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                GraphViewType>(
           edge_partition,
           major,
           indices[i],
@@ -224,7 +226,8 @@ __global__ static void transform_reduce_by_src_dst_key_low_degree(
       edge_t counter{0};
       for (edge_t i = 0; i < local_degree; ++i) {
         if ((*edge_partition_e_mask).get(edge_offset + i)) {
-          update_buffer_element<edge_partition_src_key, GraphViewType>(
+          transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                  GraphViewType>(
             edge_partition,
             major,
             indices[i],
@@ -241,7 +244,8 @@ __global__ static void transform_reduce_by_src_dst_key_low_degree(
       }
     } else {
       for (edge_t i = 0; i < local_degree; ++i) {
-        update_buffer_element<edge_partition_src_key, GraphViewType>(
+        transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                GraphViewType>(
           edge_partition,
           major,
           indices[i],
@@ -321,7 +325,8 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree(
       counter = 0;
       for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
         if ((*edge_partition_e_mask).get(edge_offset + i)) {
-          update_buffer_element<edge_partition_src_key, GraphViewType>(
+          transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                  GraphViewType>(
             edge_partition,
             major,
             indices[i],
@@ -338,7 +343,8 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree(
       }
     } else {
       for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-        update_buffer_element<edge_partition_src_key, GraphViewType>(
+        transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                GraphViewType>(
           edge_partition,
           major,
           indices[i],
@@ -415,7 +421,8 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree(
       counter = 0;
       for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
         if ((*edge_partition_e_mask).get(edge_offset + i)) {
-          update_buffer_element<edge_partition_src_key, GraphViewType>(
+          transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                  GraphViewType>(
             edge_partition,
             major,
             indices[i],
@@ -432,7 +439,8 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree(
       }
     } else {
       for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-        update_buffer_element<edge_partition_src_key, GraphViewType>(
+        transform_reduce_e_by_src_dst_key_update_buffer_element<edge_partition_src_key,
+                                                                GraphViewType>(
           edge_partition,
           major,
           indices[i],
diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
index 70787abeffb..0432e25ae86 100644
--- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
+++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
@@ -192,12 +192,8 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
 
   size_t ret{0};
 
-  vertex_t const* local_frontier_vertex_first{nullptr};
-  if constexpr (std::is_same_v<key_t, vertex_t>) {
-    local_frontier_vertex_first = frontier.begin();
-  } else {
-    local_frontier_vertex_first = thrust::get<0>(frontier.begin().get_iterator_tuple());
-  }
+  auto local_frontier_vertex_first =
+    thrust_tuple_get_or_identity<decltype(frontier.begin()), 0>(frontier.begin());
 
   std::vector<size_t> local_frontier_sizes{};
   if constexpr (GraphViewType::is_multi_gpu) {
@@ -400,12 +396,9 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
       d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream());
     rmm::device_uvector<edge_t> d_tx_buffer_last_boundaries(d_vertex_lasts.size(),
                                                             handle.get_stream());
-    vertex_t const* dst_first{nullptr};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      dst_first = get_dataframe_buffer_begin(key_buffer);
-    } else {
-      dst_first = thrust::get<0>(get_dataframe_buffer_begin(key_buffer).get_iterator_tuple());
-    }
+    auto dst_first =
+      thrust_tuple_get_or_identity<decltype(get_dataframe_buffer_begin(key_buffer)), 0>(
+        get_dataframe_buffer_begin(key_buffer));
     thrust::lower_bound(handle.get_thrust_policy(),
                         dst_first,
                         dst_first + size_dataframe_buffer(key_buffer),
diff --git a/cpp/src/prims/update_v_frontier.cuh b/cpp/src/prims/update_v_frontier.cuh
index 0e739bb4c65..a9b0a6b823b 100644
--- a/cpp/src/prims/update_v_frontier.cuh
+++ b/cpp/src/prims/update_v_frontier.cuh
@@ -60,12 +60,8 @@ struct update_v_frontier_call_v_op_t {
   {
     auto key     = thrust::get<0>(pair);
     auto payload = thrust::get<1>(pair);
-    vertex_t v_offset{};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      v_offset = key - local_vertex_partition_range_first;
-    } else {
-      v_offset = thrust::get<0>(key) - local_vertex_partition_range_first;
-    }
+    auto v_offset =
+      thrust_tuple_get_or_identity<key_t, 0>(key) - local_vertex_partition_range_first;
     auto v_val       = *(vertex_value_input_first + v_offset);
     auto v_op_result = v_op(key, v_val, payload);
     if (thrust::get<1>(v_op_result)) {
@@ -98,12 +94,8 @@ struct update_v_frontier_call_v_op_t<vertex_t,
 
   __device__ uint8_t operator()(key_t key) const
   {
-    vertex_t v_offset{};
-    if constexpr (std::is_same_v<key_t, vertex_t>) {
-      v_offset = key - local_vertex_partition_range_first;
-    } else {
-      v_offset = thrust::get<0>(key) - local_vertex_partition_range_first;
-    }
+    auto v_offset =
+      thrust_tuple_get_or_identity<key_t, 0>(key) - local_vertex_partition_range_first;
     auto v_val       = *(vertex_value_input_first + v_offset);
     auto v_op_result = v_op(key, v_val);
     if (thrust::get<1>(v_op_result)) {
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
index 2ea8635fe36..97c7333cd2e 100644
--- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -49,7 +49,15 @@
 
 #include <random>
 
-template <typename vertex_t, typename property_t>
+template <typename vertex_t, typename weight_t, typename property_t>
+struct e_bias_op_t {
+  __device__ weight_t operator()(vertex_t, vertex_t, property_t, property_t, weight_t w) const
+  {
+    return w;
+  }
+};
+
+template <typename vertex_t, typename weight_t, typename property_t>
 struct e_op_t {
   using result_t = decltype(cugraph::thrust_tuple_cat(thrust::tuple<vertex_t, vertex_t>{},
                                                       cugraph::to_thrust_tuple(property_t{}),
@@ -70,6 +78,22 @@ struct e_op_t {
       return thrust::make_tuple(src, dst, src_prop, dst_prop);
     }
   }
+
+  __device__ result_t
+  operator()(vertex_t src, vertex_t dst, property_t src_prop, property_t dst_prop, weight_t w) const
+  {
+    if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
+      static_assert(thrust::tuple_size<property_t>::value == size_t{2});
+      return thrust::make_tuple(src,
+                                dst,
+                                thrust::get<0>(src_prop),
+                                thrust::get<1>(src_prop),
+                                thrust::get<0>(dst_prop),
+                                thrust::get<1>(dst_prop));
+    } else {
+      return thrust::make_tuple(src, dst, src_prop, dst_prop);
+    }
+  }
 };
 
 struct Prims_Usecase {
@@ -77,7 +101,7 @@ struct Prims_Usecase {
   size_t K{0};
   bool with_replacement{false};
   bool use_invalid_value{false};
-  bool test_weighted{false};
+  bool use_weight_as_bias{false};
   bool edge_masking{false};
   bool check_correctness{true};
 };
@@ -112,11 +136,9 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
       hr_timer.start("MG Construct graph");
     }
 
-    cugraph::graph_t<vertex_t, edge_t, false, true> mg_graph(*handle_);
-    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
-    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
-        *handle_, input_usecase, prims_usecase.test_weighted, true);
+        *handle_, input_usecase, prims_usecase.use_weight_as_bias, true);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -126,6 +148,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
     }
 
     auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
 
     std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
     if (prims_usecase.edge_masking) {
@@ -188,17 +212,31 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
     }
 
     auto [mg_sample_offsets, mg_sample_e_op_results] =
-      cugraph::per_v_random_select_transform_outgoing_e(*handle_,
-                                                        mg_graph_view,
-                                                        mg_vertex_frontier.bucket(bucket_idx_cur),
-                                                        mg_src_prop.view(),
-                                                        mg_dst_prop.view(),
-                                                        cugraph::edge_dummy_property_t{}.view(),
-                                                        e_op_t<vertex_t, property_t>{},
-                                                        rng_state,
-                                                        prims_usecase.K,
-                                                        prims_usecase.with_replacement,
-                                                        invalid_value);
+      prims_usecase.use_weight_as_bias ? cugraph::per_v_random_select_transform_outgoing_e(
+                                           *handle_,
+                                           mg_graph_view,
+                                           mg_vertex_frontier.bucket(bucket_idx_cur),
+                                           mg_src_prop.view(),
+                                           mg_dst_prop.view(),
+                                           *mg_edge_weight_view,
+                                           e_bias_op_t<vertex_t, weight_t, property_t>{},
+                                           e_op_t<vertex_t, weight_t, property_t>{},
+                                           rng_state,
+                                           prims_usecase.K,
+                                           prims_usecase.with_replacement,
+                                           invalid_value)
+                                       : cugraph::per_v_random_select_transform_outgoing_e(
+                                           *handle_,
+                                           mg_graph_view,
+                                           mg_vertex_frontier.bucket(bucket_idx_cur),
+                                           mg_src_prop.view(),
+                                           mg_dst_prop.view(),
+                                           cugraph::edge_dummy_property_t{}.view(),
+                                           e_op_t<vertex_t, weight_t, property_t>{},
+                                           rng_state,
+                                           prims_usecase.K,
+                                           prims_usecase.with_replacement,
+                                           invalid_value);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -503,14 +541,22 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGPerVRandomSelectTransformOutgoingE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true, true}),
+    ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -520,14 +566,22 @@ INSTANTIATE_TEST_SUITE_P(
   rmat_small_test,
   Tests_MGPerVRandomSelectTransformOutgoingE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true},
-                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true, true}),
+    ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false},
+                      Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -541,12 +595,20 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(
       Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, true, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, true, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, false, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, true, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, true, false},
       Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, false},
-      Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, true, false}),
+      Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, true, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, false},
+      Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/utilities/debug_utilities_mg.cpp b/cpp/tests/utilities/debug_utilities_mg.cpp
index 622943b3c08..3423189a8ad 100644
--- a/cpp/tests/utilities/debug_utilities_mg.cpp
+++ b/cpp/tests/utilities/debug_utilities_mg.cpp
@@ -20,37 +20,37 @@ namespace test {
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int32_t const>> renumber_map);
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  cugraph::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int32_t const>> renumber_map);
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int32_t const>> renumber_map);
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  cugraph::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int32_t const>> renumber_map);
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int64_t const>> renumber_map);
 
 template void print_edges(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  cugraph::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
   std::optional<raft::device_span<int64_t const>> renumber_map);
 

From 6c333d3fe50b842f39f92f096212db3b8c74ca8b Mon Sep 17 00:00:00 2001
From: Ralph Liu <137829296+nv-rliu@users.noreply.github.com>
Date: Fri, 24 May 2024 05:13:15 -0400
Subject: [PATCH 12/23] Call New `replicate_edgelist` Function (#4441)

Closes #4440

This PR updates `enable_batch` to use the updated implementation for `replicate_edgelist`.

Authors:
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Joseph Nke (https://github.com/jnke2016)

URL: https://github.com/rapidsai/cugraph/pull/4441
---
 .../cugraph/structure/graph_implementation/simpleGraph.py | 7 ++-----
 python/cugraph/cugraph/structure/replicate_edgelist.py    | 8 ++------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
index c90607f9bf6..2b974ee3ebf 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -12,6 +12,7 @@
 # limitations under the License.
 
 from cugraph.structure import graph_primtypes_wrapper
+from cugraph.structure.replicate_edgelist import replicate_cudf_dataframe
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.structure.number_map import NumberMap
 import cugraph.dask.common.mg_utils as mg_utils
@@ -680,16 +681,12 @@ def enable_batch(self):
 
     def _replicate_edgelist(self):
         client = mg_utils.get_client()
-        comms = Comms.get_comms()
 
         # FIXME: There  might be a better way to control it
         if client is None:
             return
-        work_futures = replication.replicate_cudf_dataframe(
-            self.edgelist.edgelist_df, client=client, comms=comms
-        )
 
-        self.batch_edgelists = work_futures
+        self.batch_edgelists = replicate_cudf_dataframe(self.edgelist.edgelist_df)
 
     def _replicate_adjlist(self):
         client = mg_utils.get_client()
diff --git a/python/cugraph/cugraph/structure/replicate_edgelist.py b/python/cugraph/cugraph/structure/replicate_edgelist.py
index d413e50e485..728b247c139 100644
--- a/python/cugraph/cugraph/structure/replicate_edgelist.py
+++ b/python/cugraph/cugraph/structure/replicate_edgelist.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -269,7 +269,6 @@ def replicate_cudf_dataframe(cudf_dataframe):
         )
 
     _client = default_client()
-
     if not isinstance(cudf_dataframe, dask_cudf.DataFrame):
         if isinstance(cudf_dataframe, cudf.DataFrame):
             df = dask_cudf.from_cudf(
@@ -287,10 +286,7 @@ def replicate_cudf_dataframe(cudf_dataframe):
     df = get_persisted_df_worker_map(df, _client)
 
     ddf = _mg_call_plc_replicate(
-        _client,
-        Comms.get_session_id(),
-        df,
-        "dataframe",
+        _client, Comms.get_session_id(), df, "dataframe", cudf_dataframe.columns
     )
 
     return ddf

From 30465c2a6d053d57a8a75951656d54a416e402be Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Sat, 25 May 2024 02:48:50 +0200
Subject: [PATCH 13/23] Fix bug in kv_store_t's insertion methods (#4444)

Update size_ field of kv_cuco_store_t with correct values.

Authors:
  - Naim (https://github.com/naimnv)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/4444
---
 cpp/src/prims/kv_store.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh
index 76b64b5692b..5001a20bb83 100644
--- a/cpp/src/prims/kv_store.cuh
+++ b/cpp/src/prims/kv_store.cuh
@@ -584,7 +584,7 @@ class kv_cuco_store_t {
         store_value_offsets.end(),
         kv_cuco_insert_and_increment_t<decltype(mutable_device_ref), KeyIterator>{
           mutable_device_ref, key_first, counter.data(), std::numeric_limits<size_t>::max()});
-      size_ += counter.value(stream);
+      size_ = counter.value(stream);
       resize_optional_dataframe_buffer<value_t>(store_values_, size_, stream);
       thrust::scatter_if(rmm::exec_policy(stream),
                          value_first,
@@ -636,7 +636,7 @@ class kv_cuco_store_t {
                                                   pred_op,
                                                   counter.data(),
                                                   std::numeric_limits<size_t>::max()});
-      size_ += counter.value(stream);
+      size_ = counter.value(stream);
       resize_optional_dataframe_buffer<value_t>(store_values_, size_, stream);
       thrust::scatter_if(rmm::exec_policy(stream),
                          value_first,
@@ -688,7 +688,7 @@ class kv_cuco_store_t {
         store_value_offsets.end(),
         kv_cuco_insert_and_increment_t<decltype(mutable_device_ref), KeyIterator>{
           mutable_device_ref, key_first, counter.data(), std::numeric_limits<size_t>::max()});
-      size_ += counter.value(stream);
+      size_ = counter.value(stream);
       resize_optional_dataframe_buffer<value_t>(store_values_, size_, stream);
       thrust::scatter_if(rmm::exec_policy(stream),
                          value_first,

From 1c3f3a8ffb0e22ab0674aff79e675706bbba5f2c Mon Sep 17 00:00:00 2001
From: Joseph Nke <76006812+jnke2016@users.noreply.github.com>
Date: Tue, 28 May 2024 14:40:27 +0100
Subject: [PATCH 14/23] Move edge triangle count to the stable API (#4382)

This PR

1. Performs edge triangle count in chunks
2. Enables the k - 1 core optimization
3. Adds C++ tests for edge triangle count
4. Moves edge triangle count to the stable API
5. Implements MG edge triangle count and adds tests
6. Updates 'mg_graph_to_sg_graph' to support 'edge_ids', along with tests

closes #4370
closes #4371

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/4382
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cugraph/algorithms.hpp            |  18 +
 .../community/edge_triangle_count_impl.cuh    | 361 ++++++++++++++----
 cpp/src/community/edge_triangle_count_mg.cu   |  33 ++
 cpp/src/community/edge_triangle_count_sg.cu   |  18 +-
 cpp/src/community/k_truss_impl.cuh            | 292 +++++++-------
 cpp/tests/CMakeLists.txt                      |   9 +
 .../mg_betweenness_centrality_test.cpp        |  16 +-
 .../mg_edge_betweenness_centrality_test.cpp   |  14 +-
 .../mg_eigenvector_centrality_test.cpp        |  16 +-
 .../centrality/mg_katz_centrality_test.cpp    |  16 +-
 .../community/edge_triangle_count_test.cpp    | 260 +++++++++++++
 cpp/tests/community/mg_ecg_test.cpp           |  14 +-
 .../community/mg_edge_triangle_count_test.cpp | 253 ++++++++++++
 cpp/tests/community/mg_egonet_test.cu         |  16 +-
 cpp/tests/community/mg_leiden_test.cpp        |  14 +-
 cpp/tests/community/mg_louvain_test.cpp       |  14 +-
 .../community/mg_triangle_count_test.cpp      |  16 +-
 .../community/mg_weighted_matching_test.cpp   |  14 +-
 .../mg_weakly_connected_components_test.cpp   |  16 +-
 cpp/tests/cores/mg_core_number_test.cpp       |  16 +-
 cpp/tests/cores/mg_k_core_test.cpp            |  16 +-
 cpp/tests/link_analysis/mg_hits_test.cpp      |  16 +-
 cpp/tests/link_analysis/mg_pagerank_test.cpp  |  16 +-
 cpp/tests/mtmg/threaded_test_louvain.cu       |   4 +-
 cpp/tests/prims/mg_count_if_e.cu              |  16 +-
 cpp/tests/prims/mg_count_if_v.cu              |  16 +-
 cpp/tests/prims/mg_extract_transform_e.cu     |  16 +-
 ...extract_transform_v_frontier_outgoing_e.cu |  16 +-
 ...r_v_pair_transform_dst_nbr_intersection.cu |  16 +-
 ...transform_dst_nbr_weighted_intersection.cu |  20 +-
 ...er_v_random_select_transform_outgoing_e.cu |  16 +-
 ...rm_reduce_dst_key_aggregated_outgoing_e.cu |  16 +-
 ..._v_transform_reduce_incoming_outgoing_e.cu |  16 +-
 cpp/tests/prims/mg_reduce_v.cu                |  16 +-
 ...st_nbr_intersection_of_e_endpoints_by_v.cu |  16 +-
 cpp/tests/prims/mg_transform_reduce_e.cu      |  16 +-
 .../mg_transform_reduce_e_by_src_dst_key.cu   |  16 +-
 cpp/tests/prims/mg_transform_reduce_v.cu      |  16 +-
 ...orm_reduce_v_frontier_outgoing_e_by_dst.cu |  16 +-
 cpp/tests/structure/mg_coarsen_graph_test.cpp |  17 +-
 ..._count_self_loops_and_multi_edges_test.cpp |  16 +-
 ...has_edge_and_compute_multiplicity_test.cpp |  16 +-
 .../structure/mg_induced_subgraph_test.cu     |  14 +-
 cpp/tests/structure/mg_symmetrize_test.cpp    |  16 +-
 .../structure/mg_transpose_storage_test.cpp   |  16 +-
 cpp/tests/structure/mg_transpose_test.cpp     |  16 +-
 cpp/tests/traversal/mg_bfs_test.cpp           |  16 +-
 .../traversal/mg_extract_bfs_paths_test.cu    |  16 +-
 cpp/tests/traversal/mg_k_hop_nbrs_test.cpp    |  16 +-
 cpp/tests/traversal/mg_sssp_test.cpp          |  16 +-
 cpp/tests/utilities/conversion_utilities.hpp  |  15 +-
 .../utilities/conversion_utilities_impl.cuh   |  25 +-
 .../utilities/conversion_utilities_mg.cu      |  24 ++
 54 files changed, 1448 insertions(+), 514 deletions(-)
 create mode 100644 cpp/src/community/edge_triangle_count_mg.cu
 create mode 100644 cpp/tests/community/edge_triangle_count_test.cpp
 create mode 100644 cpp/tests/community/mg_edge_triangle_count_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 57e0aa2d078..2527599fece 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -180,6 +180,7 @@ set(CUGRAPH_SOURCES
     src/community/detail/refine_sg.cu
     src/community/detail/refine_mg.cu
     src/community/edge_triangle_count_sg.cu
+    src/community/edge_triangle_count_mg.cu
     src/community/detail/maximal_independent_moves_sg.cu
     src/community/detail/maximal_independent_moves_mg.cu
     src/detail/utility_wrappers.cu
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 7c4a978c4b4..cc42399f091 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -2007,6 +2007,24 @@ void triangle_count(raft::handle_t const& handle,
                     raft::device_span<edge_t> counts,
                     bool do_expensive_check = false);
 
+/*
+ * @brief Compute edge triangle counts.
+ *
+ * Compute edge triangle counts for the entire set of edges.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether to target single-GPU (false) or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ *
+ * @return edge_property_t containing the edge triangle count
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, edge_t> edge_triangle_count(
+  raft::handle_t const& handle, graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view);
+
 /*
  * @brief Compute K-Truss.
  *
diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh
index 1370c1a17f2..c4277e240be 100644
--- a/cpp/src/community/edge_triangle_count_impl.cuh
+++ b/cpp/src/community/edge_triangle_count_impl.cuh
@@ -17,12 +17,17 @@
 #pragma once
 
 #include "detail/graph_partition_utils.cuh"
+#include "prims/edge_bucket.cuh"
+#include "prims/transform_e.cuh"
 #include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh"
 
+#include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/error.hpp>
 
+#include <raft/util/integer_utils.hpp>
+
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
@@ -34,8 +39,9 @@ namespace detail {
 
 template <typename vertex_t, typename edge_t, typename EdgeIterator>
 struct update_edges_p_r_q_r_num_triangles {
-  size_t num_edges{};  // rename to num_edges
+  size_t num_edges{};
   const edge_t edge_first_or_second{};
+  size_t chunk_start{};
   raft::device_span<size_t const> intersection_offsets{};
   raft::device_span<vertex_t const> intersection_indices{};
   raft::device_span<edge_t> num_triangles{};
@@ -48,28 +54,22 @@ struct update_edges_p_r_q_r_num_triangles {
       thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
     auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
     if (edge_first_or_second == 0) {
-      auto p_r_pair =
-        thrust::make_tuple(thrust::get<0>(*(edge_first + idx)), intersection_indices[i]);
+      auto p_r_pair = thrust::make_tuple(thrust::get<0>(*(edge_first + chunk_start + idx)),
+                                         intersection_indices[i]);
 
       // Find its position in 'edges'
       auto itr_p_r_p_q =
-        thrust::lower_bound(thrust::seq,
-                            edge_first,
-                            edge_first + num_edges,  // pass the number of vertex pairs
-                            p_r_pair);
+        thrust::lower_bound(thrust::seq, edge_first, edge_first + num_edges, p_r_pair);
 
       assert(*itr_p_r_p_q == p_r_pair);
       idx = thrust::distance(edge_first, itr_p_r_p_q);
     } else {
-      auto p_r_pair =
-        thrust::make_tuple(thrust::get<1>(*(edge_first + idx)), intersection_indices[i]);
+      auto p_r_pair = thrust::make_tuple(thrust::get<1>(*(edge_first + chunk_start + idx)),
+                                         intersection_indices[i]);
 
       // Find its position in 'edges'
       auto itr_p_r_p_q =
-        thrust::lower_bound(thrust::seq,
-                            edge_first,
-                            edge_first + num_edges,  // pass the number of vertex pairs
-                            p_r_pair);
+        thrust::lower_bound(thrust::seq, edge_first, edge_first + num_edges, p_r_pair);
       assert(*itr_p_r_p_q == p_r_pair);
       idx = thrust::distance(edge_first, itr_p_r_p_q);
     }
@@ -78,77 +78,296 @@ struct update_edges_p_r_q_r_num_triangles {
   }
 };
 
+template <typename vertex_t, typename edge_t, typename EdgeIterator>
+struct extract_p_r_q_r {
+  size_t chunk_start{};
+  size_t p_r_or_q_r{};
+  raft::device_span<size_t const> intersection_offsets{};
+  raft::device_span<vertex_t const> intersection_indices{};
+  EdgeIterator edge_first;
+
+  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
+  {
+    auto itr = thrust::upper_bound(
+      thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
+    auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
+
+    if (p_r_or_q_r == 0) {
+      return thrust::make_tuple(thrust::get<0>(*(edge_first + chunk_start + idx)),
+                                intersection_indices[i]);
+    } else {
+      return thrust::make_tuple(thrust::get<1>(*(edge_first + chunk_start + idx)),
+                                intersection_indices[i]);
+    }
+  }
+};
+
+template <typename vertex_t, typename edge_t, typename EdgeIterator>
+struct extract_q_r {
+  size_t chunk_start{};
+  raft::device_span<size_t const> intersection_offsets{};
+  raft::device_span<vertex_t const> intersection_indices{};
+  EdgeIterator edge_first;
+
+  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
+  {
+    auto itr = thrust::upper_bound(
+      thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
+    auto idx  = thrust::distance(intersection_offsets.begin() + 1, itr);
+    auto pair = thrust::make_tuple(thrust::get<1>(*(edge_first + chunk_start + idx)),
+                                   intersection_indices[i]);
+
+    return pair;
+  }
+};
+
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-std::enable_if_t<!multi_gpu, rmm::device_uvector<edge_t>> edge_triangle_count_impl(
+edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, edge_t> edge_triangle_count_impl(
   raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
-  raft::device_span<vertex_t> edgelist_srcs,
-  raft::device_span<vertex_t> edgelist_dsts)
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view)
 {
-  auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
+  using weight_t = float;
+  rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
+  std::tie(edgelist_srcs, edgelist_dsts, std::ignore, std::ignore, std::ignore) =
+    decompress_to_edgelist<vertex_t, edge_t, weight_t, int32_t>(
+      handle, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
 
-  thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + edgelist_srcs.size());
+  auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
 
-  // FIXME: Perform 'nbr_intersection' in chunks to reduce peak memory.
-  auto [intersection_offsets, intersection_indices] =
-    detail::nbr_intersection(handle,
-                             graph_view,
-                             cugraph::edge_dummy_property_t{}.view(),
-                             edge_first,
-                             edge_first + edgelist_srcs.size(),
-                             std::array<bool, 2>{true, true},
-                             false /*FIXME: pass 'do_expensive_check' as argument*/);
+  size_t edges_to_intersect_per_iteration =
+    static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 17);
 
+  auto num_chunks =
+    raft::div_rounding_up_safe(edgelist_srcs.size(), edges_to_intersect_per_iteration);
+  size_t prev_chunk_size   = 0;
+  auto num_remaining_edges = edgelist_srcs.size();
   rmm::device_uvector<edge_t> num_triangles(edgelist_srcs.size(), handle.get_stream());
 
-  // Update the number of triangles of each (p, q) edges by looking at their intersection
-  // size
-  thrust::adjacent_difference(handle.get_thrust_policy(),
-                              intersection_offsets.begin() + 1,
-                              intersection_offsets.end(),
-                              num_triangles.begin());
-
-  // Given intersection offsets and indices that are used to update the number of
-  // triangles of (p, q) edges where `r`s are the intersection indices, update
-  // the number of triangles of the pairs (p, r) and (q, r).
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-    update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
-      edgelist_srcs.size(),
-      0,
-      raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
-      raft::device_span<vertex_t const>(intersection_indices.data(), intersection_indices.size()),
-      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-      edge_first});
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-    update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
-      edgelist_srcs.size(),
-      1,
-      raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
-      raft::device_span<vertex_t const>(intersection_indices.data(), intersection_indices.size()),
-      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-      edge_first});
-
-  return num_triangles;
+  // auto my_rank = handle.get_comms().get_rank();
+  if constexpr (multi_gpu) {
+    num_chunks = host_scalar_allreduce(
+      handle.get_comms(), num_chunks, raft::comms::op_t::MAX, handle.get_stream());
+  }
+
+  // Need to ensure that the vector has its values initialized to 0 before incrementing
+  thrust::fill(handle.get_thrust_policy(), num_triangles.begin(), num_triangles.end(), 0);
+
+  for (size_t i = 0; i < num_chunks; ++i) {
+    auto chunk_size = std::min(edges_to_intersect_per_iteration, num_remaining_edges);
+    num_remaining_edges -= chunk_size;
+    // Perform 'nbr_intersection' in chunks to reduce peak memory.
+    auto [intersection_offsets, intersection_indices] =
+      detail::nbr_intersection(handle,
+                               graph_view,
+                               cugraph::edge_dummy_property_t{}.view(),
+                               edge_first + prev_chunk_size,
+                               edge_first + prev_chunk_size + chunk_size,
+                               std::array<bool, 2>{true, true},
+                               false /*FIXME: pass 'do_expensive_check' as argument*/);
+
+    // Update the number of triangles of each (p, q) edge by looking at its intersection
+    // size
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_counting_iterator<edge_t>(0),
+      thrust::make_counting_iterator<edge_t>(chunk_size),
+      [chunk_start          = prev_chunk_size,
+       num_triangles        = raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+       intersection_offsets = raft::device_span<size_t const>(
+         intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) {
+        num_triangles[chunk_start + i] += (intersection_offsets[i + 1] - intersection_offsets[i]);
+      });
+
+    if constexpr (multi_gpu) {
+      // stores all the pairs (p, r) and (q, r)
+      auto vertex_pair_buffer_tmp = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
+        intersection_indices.size() * 2, handle.get_stream());
+
+      // tabulate with the size of intersection_indices, and call binary search on
+      // intersection_offsets to get (p, r).
+      thrust::tabulate(
+        handle.get_thrust_policy(),
+        get_dataframe_buffer_begin(vertex_pair_buffer_tmp),
+        get_dataframe_buffer_begin(vertex_pair_buffer_tmp) + intersection_indices.size(),
+        extract_p_r_q_r<vertex_t, edge_t, decltype(edge_first)>{
+          prev_chunk_size,
+          0,
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          edge_first});
+      // FIXME: Consolidate both functions
+      thrust::tabulate(
+        handle.get_thrust_policy(),
+        get_dataframe_buffer_begin(vertex_pair_buffer_tmp) + intersection_indices.size(),
+        get_dataframe_buffer_begin(vertex_pair_buffer_tmp) + (2 * intersection_indices.size()),
+        extract_p_r_q_r<vertex_t, edge_t, decltype(edge_first)>{
+          prev_chunk_size,
+          1,
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          edge_first});
+
+      thrust::sort(handle.get_thrust_policy(),
+                   get_dataframe_buffer_begin(vertex_pair_buffer_tmp),
+                   get_dataframe_buffer_end(vertex_pair_buffer_tmp));
+
+      rmm::device_uvector<edge_t> increase_count_tmp(2 * intersection_indices.size(),
+                                                     handle.get_stream());
+      thrust::fill(handle.get_thrust_policy(),
+                   increase_count_tmp.begin(),
+                   increase_count_tmp.end(),
+                   size_t{1});
+
+      auto count_p_r_q_r = thrust::unique_count(handle.get_thrust_policy(),
+                                                get_dataframe_buffer_begin(vertex_pair_buffer_tmp),
+                                                get_dataframe_buffer_end(vertex_pair_buffer_tmp));
+
+      rmm::device_uvector<edge_t> increase_count(count_p_r_q_r, handle.get_stream());
+
+      auto vertex_pair_buffer = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
+        count_p_r_q_r, handle.get_stream());
+      thrust::reduce_by_key(handle.get_thrust_policy(),
+                            get_dataframe_buffer_begin(vertex_pair_buffer_tmp),
+                            get_dataframe_buffer_end(vertex_pair_buffer_tmp),
+                            increase_count_tmp.begin(),
+                            get_dataframe_buffer_begin(vertex_pair_buffer),
+                            increase_count.begin(),
+                            thrust::equal_to<thrust::tuple<vertex_t, vertex_t>>{});
+
+      rmm::device_uvector<vertex_t> pair_srcs(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> pair_dsts(0, handle.get_stream());
+      std::optional<rmm::device_uvector<edge_t>> pair_count{std::nullopt};
+
+      std::optional<rmm::device_uvector<edge_t>> opt_increase_count =
+        std::make_optional(rmm::device_uvector<edge_t>(increase_count.size(), handle.get_stream()));
+
+      raft::copy<edge_t>((*opt_increase_count).begin(),
+                         increase_count.begin(),
+                         increase_count.size(),
+                         handle.get_stream());
+
+      // There are still multiple copies here but is it worth sorting and reducing again?
+      std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore) =
+        shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
+                                                                               edge_t,
+                                                                               weight_t,
+                                                                               int32_t>(
+          handle,
+          std::move(std::get<0>(vertex_pair_buffer)),
+          std::move(std::get<1>(vertex_pair_buffer)),
+          std::nullopt,
+          // FIXME: Add general purpose function for shuffling vertex pairs and arbitrary attributes
+          std::move(opt_increase_count),
+          std::nullopt,
+          graph_view.vertex_partition_range_lasts());
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator<edge_t>(0),
+        thrust::make_counting_iterator<edge_t>(pair_srcs.size()),
+        [num_edges     = edgelist_srcs.size(),
+         num_triangles = num_triangles.data(),
+         pair_srcs     = pair_srcs.data(),
+         pair_dsts     = pair_dsts.data(),
+         pair_count    = (*pair_count).data(),
+         edge_first] __device__(auto idx) {
+          auto src          = pair_srcs[idx];
+          auto dst          = pair_dsts[idx];
+          auto p_r_q_r_pair = thrust::make_tuple(src, dst);
+
+          // Look up (src, dst) in the edge list; its offset is the slot in num_triangles
+          auto itr_p_r_q_r =
+            thrust::lower_bound(thrust::seq, edge_first, edge_first + num_edges, p_r_q_r_pair);
+
+          assert(*itr_p_r_q_r == p_r_q_r_pair);
+          auto idx_p_r_q_r = thrust::distance(edge_first, itr_p_r_q_r);
+          // The same pair can show up more than once, so accumulate its count atomically
+          cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(
+            num_triangles[idx_p_r_q_r]);
+          atomic_counter.fetch_add(pair_count[idx], cuda::std::memory_order_relaxed);
+        });
+
+    } else {
+      // Given intersection offsets and indices that are used to update the number of
+      // triangles of (p, q) edges where `r`s are the intersection indices, update
+      // the number of triangles of the pairs (p, r) and (q, r).
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator<edge_t>(0),
+        thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+        update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
+          edgelist_srcs.size(),
+          0,
+          prev_chunk_size,
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+          edge_first});
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator<edge_t>(0),
+        thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+        update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
+          edgelist_srcs.size(),
+          1,
+          prev_chunk_size,
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+          edge_first});
+    }
+    prev_chunk_size += chunk_size;
+  }
+
+  cugraph::edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, edge_t> counts(
+    handle, graph_view);
+
+  cugraph::edge_bucket_t<vertex_t, void, true, multi_gpu, true> valid_edges(handle);
+  valid_edges.insert(edgelist_srcs.begin(), edgelist_srcs.end(), edgelist_dsts.begin());
+
+  auto cur_graph_view = graph_view;
+
+  cugraph::transform_e(
+    handle,
+    graph_view,
+    valid_edges,
+    cugraph::edge_src_dummy_property_t{}.view(),
+    cugraph::edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [edge_first,
+     edge_last     = edge_first + edgelist_srcs.size(),
+     num_edges     = edgelist_srcs.size(),
+     num_triangles = num_triangles.data()] __device__(auto src,
+                                                      auto dst,
+                                                      thrust::nullopt_t,
+                                                      thrust::nullopt_t,
+                                                      thrust::nullopt_t) {
+      auto pair = thrust::make_tuple(src, dst);
+
+      // Find its position in 'edges'
+      auto itr_pair = thrust::lower_bound(thrust::seq, edge_first, edge_last, pair);
+      auto idx_pair = thrust::distance(edge_first, itr_pair);
+      return num_triangles[idx_pair];
+    },
+    counts.mutable_view(),
+    false);
+
+  return counts;
 }
 
 }  // namespace detail
 
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-rmm::device_uvector<edge_t> edge_triangle_count(
-  raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
-  raft::device_span<vertex_t> edgelist_srcs,
-  raft::device_span<vertex_t> edgelist_dsts)
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, edge_t> edge_triangle_count(
+  raft::handle_t const& handle, graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view)
 {
-  return detail::edge_triangle_count_impl(handle, graph_view, edgelist_srcs, edgelist_dsts);
+  return detail::edge_triangle_count_impl(handle, graph_view);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/community/edge_triangle_count_mg.cu b/cpp/src/community/edge_triangle_count_mg.cu
new file mode 100644
index 00000000000..254a0807e56
--- /dev/null
+++ b/cpp/src/community/edge_triangle_count_mg.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "community/edge_triangle_count_impl.cuh"
+
+namespace cugraph {
+
+// MG instantiation
+template edge_property_t<graph_view_t<int32_t, int32_t, false, true>, int32_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view);
+
+template edge_property_t<graph_view_t<int32_t, int64_t, false, true>, int64_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view);
+
+template edge_property_t<graph_view_t<int64_t, int64_t, false, true>, int64_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/edge_triangle_count_sg.cu b/cpp/src/community/edge_triangle_count_sg.cu
index c4b7e71b967..4ccb968458d 100644
--- a/cpp/src/community/edge_triangle_count_sg.cu
+++ b/cpp/src/community/edge_triangle_count_sg.cu
@@ -18,22 +18,16 @@
 namespace cugraph {
 
 // SG instantiation
-template rmm::device_uvector<int32_t> edge_triangle_count(
+template edge_property_t<graph_view_t<int32_t, int32_t, false, false>, int32_t> edge_triangle_count(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  raft::device_span<int32_t> edgelist_srcs,
-  raft::device_span<int32_t> edgelist_dsts);
+  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view);
 
-template rmm::device_uvector<int64_t> edge_triangle_count(
+template edge_property_t<graph_view_t<int32_t, int64_t, false, false>, int64_t> edge_triangle_count(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
-  raft::device_span<int32_t> edgelist_srcs,
-  raft::device_span<int32_t> edgelist_dsts);
+  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view);
 
-template rmm::device_uvector<int64_t> edge_triangle_count(
+template edge_property_t<graph_view_t<int64_t, int64_t, false, false>, int64_t> edge_triangle_count(
   raft::handle_t const& handle,
-  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  raft::device_span<int64_t> edgelist_srcs,
-  raft::device_span<int64_t> edgelist_dsts);
+  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view);
 
 }  // namespace cugraph
diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh
index 7f96312703d..f830e6a7700 100644
--- a/cpp/src/community/k_truss_impl.cuh
+++ b/cpp/src/community/k_truss_impl.cuh
@@ -27,6 +27,8 @@
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/error.hpp>
 
+#include <raft/util/integer_utils.hpp>
+
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/distance.h>
@@ -39,14 +41,6 @@
 
 namespace cugraph {
 
-// FIXME : This will be deleted once edge_triangle_count becomes public
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-rmm::device_uvector<edge_t> edge_triangle_count(
-  raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
-  raft::device_span<vertex_t> edgelist_srcs,
-  raft::device_span<vertex_t> edgelist_dsts);
-
 template <typename vertex_t, typename edge_t, typename EdgeIterator>
 struct unroll_edge {
   size_t num_valid_edges{};
@@ -442,6 +436,7 @@ struct extract_low_to_high_degree_edges_t {
 
 template <typename vertex_t, typename edge_t, bool generate_p_r>
 struct generate_p_r_or_q_r_from_p_q {
+  size_t chunk_start{};
   raft::device_span<size_t const> intersection_offsets{};
   raft::device_span<vertex_t const> intersection_indices{};
   raft::device_span<vertex_t const> invalid_srcs{};
@@ -454,10 +449,10 @@ struct generate_p_r_or_q_r_from_p_q {
     auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
 
     if constexpr (generate_p_r) {
-      return thrust::make_tuple(invalid_srcs[idx], intersection_indices[i]);
+      return thrust::make_tuple(invalid_srcs[chunk_start + idx], intersection_indices[i]);
 
     } else {
-      return thrust::make_tuple(invalid_dsts[idx], intersection_indices[i]);
+      return thrust::make_tuple(invalid_dsts[chunk_start + idx], intersection_indices[i]);
     }
   }
 };
@@ -491,6 +486,7 @@ k_truss(raft::handle_t const& handle,
   std::optional<rmm::device_uvector<vertex_t>> renumber_map{std::nullopt};
   std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, weight_t>>
     edge_weight{std::nullopt};
+  std::optional<rmm::device_uvector<weight_t>> wgts{std::nullopt};
 
   if (graph_view.count_self_loops(handle) > edge_t{0}) {
     auto [srcs, dsts] = extract_transform_e(handle,
@@ -524,31 +520,30 @@ k_truss(raft::handle_t const& handle,
     modified_graph_view = (*modified_graph).view();
   }
 
-  // FIXME: Investigate k-1 core failure to yield correct results.
   // 3. Find (k-1)-core and exclude edges that do not belong to (k-1)-core
-  /*
   {
     auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
+
     auto vertex_partition_range_lasts =
       renumber_map
         ? std::make_optional<std::vector<vertex_t>>(cur_graph_view.vertex_partition_range_lasts())
         : std::nullopt;
 
-    rmm::device_uvector<edge_t> d_core_numbers(cur_graph_view.local_vertex_partition_range_size(),
-                                               handle.get_stream());
-    raft::device_span<edge_t const> core_number_span{d_core_numbers.data(), d_core_numbers.size()};
+    rmm::device_uvector<edge_t> core_numbers(cur_graph_view.number_of_vertices(),
+                                             handle.get_stream());
+    core_number(
+      handle, cur_graph_view, core_numbers.data(), k_core_degree_type_t::OUT, size_t{2}, size_t{2});
+
+    raft::device_span<edge_t const> core_number_span{core_numbers.data(), core_numbers.size()};
 
     rmm::device_uvector<vertex_t> srcs{0, handle.get_stream()};
     rmm::device_uvector<vertex_t> dsts{0, handle.get_stream()};
-    std::tie(srcs, dsts, std::ignore) =
-      k_core(handle,
-             cur_graph_view,
-             std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-             size_t{k - 1},
-             std::make_optional(k_core_degree_type_t::OUT),
-             // Seems like the below argument is required. passing a std::nullopt
-             // create a compiler error
-             std::make_optional(core_number_span));
+    std::tie(srcs, dsts, wgts) = k_core(handle,
+                                        cur_graph_view,
+                                        edge_weight_view,
+                                        k - 1,
+                                        std::make_optional(k_core_degree_type_t::OUT),
+                                        std::make_optional(core_number_span));
 
     if constexpr (multi_gpu) {
       std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
@@ -561,17 +556,17 @@ k_truss(raft::handle_t const& handle,
 
     std::optional<rmm::device_uvector<vertex_t>> tmp_renumber_map{std::nullopt};
 
-    std::tie(*modified_graph, std::ignore, std::ignore, std::ignore, tmp_renumber_map) =
+    std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) =
       create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
         handle,
         std::nullopt,
         std::move(srcs),
         std::move(dsts),
-        std::nullopt,
+        std::move(wgts),
         std::nullopt,
         std::nullopt,
         cugraph::graph_properties_t{true, graph_view.is_multigraph()},
-        true);
+        false);
 
     modified_graph_view = (*modified_graph).view();
 
@@ -584,7 +579,6 @@ k_truss(raft::handle_t const& handle,
     }
     renumber_map = std::move(tmp_renumber_map);
   }
-  */
 
   // 4. Keep only the edges from a low-degree vertex to a high-degree vertex.
 
@@ -606,7 +600,10 @@ k_truss(raft::handle_t const& handle,
 
     rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
     rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
-    std::optional<rmm::device_uvector<weight_t>> wgts{std::nullopt};
+
+    edge_weight_view =
+      edge_weight ? std::make_optional((*edge_weight).view())
+                  : std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt};
     if (edge_weight_view) {
       std::tie(srcs, dsts, wgts) = extract_transform_e(
         handle,
@@ -666,38 +663,36 @@ k_truss(raft::handle_t const& handle,
     auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
     rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
     rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> num_triangles{std::nullopt};
     std::optional<rmm::device_uvector<weight_t>> edgelist_wgts{std::nullopt};
 
     edge_weight_view =
       edge_weight ? std::make_optional((*edge_weight).view())
                   : std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt};
-    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore, std::ignore) =
+
+    auto prop_num_triangles = edge_triangle_count<vertex_t, edge_t, false>(handle, cur_graph_view);
+
+    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, num_triangles, std::ignore) =
       decompress_to_edgelist(
         handle,
         cur_graph_view,
         edge_weight_view,
-        std::optional<edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        // FIXME: Update 'decompress_to_edgelist' to support int32_t and int64_t values
+        std::make_optional(prop_num_triangles.view()),
         std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
         std::optional<raft::device_span<vertex_t const>>(std::nullopt));
-
-    auto num_triangles = edge_triangle_count<vertex_t, edge_t, false, false>(
-      handle,
-      cur_graph_view,
-      raft::device_span<vertex_t>(edgelist_srcs.data(), edgelist_srcs.size()),
-      raft::device_span<vertex_t>(edgelist_dsts.data(), edgelist_dsts.size()));
-
     auto transposed_edge_first =
       thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin());
 
     auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
 
     auto transposed_edge_triangle_count_pair_first =
-      thrust::make_zip_iterator(transposed_edge_first, num_triangles.begin());
+      thrust::make_zip_iterator(transposed_edge_first, (*num_triangles).begin());
 
     thrust::sort_by_key(handle.get_thrust_policy(),
                         transposed_edge_first,
                         transposed_edge_first + edgelist_srcs.size(),
-                        num_triangles.begin());
+                        (*num_triangles).begin());
 
     cugraph::edge_property_t<decltype(cur_graph_view), bool> edge_mask(handle, cur_graph_view);
     cugraph::fill_edge_property(handle, cur_graph_view, true, edge_mask);
@@ -728,92 +723,115 @@ k_truss(raft::handle_t const& handle,
 
       // nbr_intersection requires the edges to be sort by 'src'
       // sort the invalid edges by src for nbr intersection
-      thrust::sort_by_key(handle.get_thrust_policy(),
-                          edge_first + num_valid_edges,
-                          edge_first + edgelist_srcs.size(),
-                          num_triangles.begin() + num_valid_edges);
-
-      auto [intersection_offsets, intersection_indices] =
-        detail::nbr_intersection(handle,
-                                 cur_graph_view,
-                                 cugraph::edge_dummy_property_t{}.view(),
-                                 edge_first + num_valid_edges,
-                                 edge_first + edgelist_srcs.size(),
-                                 std::array<bool, 2>{true, true},
-                                 do_expensive_check);
-
-      // Update the number of triangles of each (p, q) edges by looking at their intersection
-      // size.
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator<edge_t>(0),
-        thrust::make_counting_iterator<edge_t>(num_invalid_edges),
-        [num_triangles =
-           raft::device_span<edge_t>(num_triangles.data() + num_valid_edges, num_invalid_edges),
-         intersection_offsets = raft::device_span<size_t const>(
-           intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) {
-          num_triangles[i] -= intersection_offsets[i + 1] - intersection_offsets[i];
-        });
-
-      // FIXME: Find a way to not have to maintain a dataframe_buffer
-      auto vertex_pair_buffer_p_r_edge_p_q =
-        allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
-                                                                     handle.get_stream());
-
-      thrust::tabulate(
-        handle.get_thrust_policy(),
-        get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
-        get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q),
-        generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, true>{
-          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
-          raft::device_span<vertex_t const>(intersection_indices.data(),
-                                            intersection_indices.size()),
-          raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
-          raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)});
-
-      auto vertex_pair_buffer_q_r_edge_p_q =
-        allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
-                                                                     handle.get_stream());
-      thrust::tabulate(
-        handle.get_thrust_policy(),
-        get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
-        get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q),
-        generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, false>{
-          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
-          raft::device_span<vertex_t const>(intersection_indices.data(),
-                                            intersection_indices.size()),
-          raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
-          raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)});
-
-      // Unrolling the edges require the edges to be sorted by destination
-      // re-sort the invalid edges by 'dst'
-      thrust::sort_by_key(handle.get_thrust_policy(),
-                          transposed_edge_first + num_valid_edges,
-                          transposed_edge_first + edgelist_srcs.size(),
-                          num_triangles.begin() + num_valid_edges);
-
-      thrust::for_each(handle.get_thrust_policy(),
-                       thrust::make_counting_iterator<edge_t>(0),
-                       thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-                       unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
-                         num_valid_edges,
-                         raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-                         get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
-                         transposed_edge_first,
-                         transposed_edge_first + num_valid_edges,
-                         transposed_edge_first + edgelist_srcs.size()});
-
-      thrust::for_each(handle.get_thrust_policy(),
-                       thrust::make_counting_iterator<edge_t>(0),
-                       thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-                       unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
-                         num_valid_edges,
-                         raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-                         get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
-                         transposed_edge_first,
-                         transposed_edge_first + num_valid_edges,
-                         transposed_edge_first + edgelist_srcs.size()});
-
+      size_t edges_to_intersect_per_iteration =
+        static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 17);
+
+      size_t prev_chunk_size         = 0;
+      size_t chunk_num_invalid_edges = num_invalid_edges;
+      // Only the invalid edges are intersected below, so chunk over them, not over all edges.
+      auto num_chunks =
+        raft::div_rounding_up_safe(chunk_num_invalid_edges, edges_to_intersect_per_iteration);
+
+      for (size_t i = 0; i < num_chunks; ++i) {
+        auto chunk_size = std::min(edges_to_intersect_per_iteration, chunk_num_invalid_edges);
+        thrust::sort_by_key(handle.get_thrust_policy(),
+                            edge_first + num_valid_edges,
+                            edge_first + edgelist_srcs.size(),
+                            (*num_triangles).begin() + num_valid_edges);
+
+        auto [intersection_offsets, intersection_indices] =
+          detail::nbr_intersection(handle,
+                                   cur_graph_view,
+                                   cugraph::edge_dummy_property_t{}.view(),
+                                   edge_first + num_valid_edges + prev_chunk_size,
+                                   edge_first + num_valid_edges + prev_chunk_size + chunk_size,
+                                   std::array<bool, 2>{true, true},
+                                   do_expensive_check);
+
+        // Update the number of triangles of each (p, q) edges by looking at their intersection
+        // size.
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator<edge_t>(0),
+          thrust::make_counting_iterator<edge_t>(chunk_size),
+          [chunk_start   = prev_chunk_size,
+           num_triangles = raft::device_span<edge_t>((*num_triangles).data() + num_valid_edges,
+                                                     num_invalid_edges),
+           intersection_offsets = raft::device_span<size_t const>(
+             intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) {
+            num_triangles[chunk_start + i] -=
+              (intersection_offsets[i + 1] - intersection_offsets[i]);
+          });
+
+        // FIXME: Find a way to not have to maintain a dataframe_buffer
+        auto vertex_pair_buffer_p_r_edge_p_q =
+          allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
+                                                                       handle.get_stream());
+        thrust::tabulate(
+          handle.get_thrust_policy(),
+          get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
+          get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q),
+          generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, true>{
+            prev_chunk_size,
+            raft::device_span<size_t const>(intersection_offsets.data(),
+                                            intersection_offsets.size()),
+            raft::device_span<vertex_t const>(intersection_indices.data(),
+                                              intersection_indices.size()),
+            raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
+            raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges,
+                                        num_invalid_edges)});
+
+        auto vertex_pair_buffer_q_r_edge_p_q =
+          allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
+                                                                       handle.get_stream());
+        thrust::tabulate(
+          handle.get_thrust_policy(),
+          get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
+          get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q),
+          generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, false>{
+            prev_chunk_size,
+            raft::device_span<size_t const>(intersection_offsets.data(),
+                                            intersection_offsets.size()),
+            raft::device_span<vertex_t const>(intersection_indices.data(),
+                                              intersection_indices.size()),
+            raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
+            raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges,
+                                        num_invalid_edges)});
+
+        // Unrolling the edges requires them to be sorted by destination, so
+        // re-sort the invalid edges by 'dst'.
+        thrust::sort_by_key(handle.get_thrust_policy(),
+                            transposed_edge_first + num_valid_edges,
+                            transposed_edge_first + edgelist_srcs.size(),
+                            (*num_triangles).begin() + num_valid_edges);
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator<edge_t>(0),
+          thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+          unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
+            num_valid_edges,
+            raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()),
+            get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
+            transposed_edge_first,
+            transposed_edge_first + num_valid_edges,
+            transposed_edge_first + edgelist_srcs.size()});
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator<edge_t>(0),
+          thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+          unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
+            num_valid_edges,
+            raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()),
+            get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
+            transposed_edge_first,
+            transposed_edge_first + num_valid_edges,
+            transposed_edge_first + edgelist_srcs.size()});
+
+        prev_chunk_size += chunk_size;
+        chunk_num_invalid_edges -= chunk_size;
+      }
       // case 2: unroll (q, r)
       // For each (q, r) edges to unroll, find the incoming edges to 'r' let's say from 'p' and
       // create the pair (p, q)
@@ -824,7 +842,7 @@ k_truss(raft::handle_t const& handle,
         num_valid_edges,
         raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
         raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
-        raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()));
+        raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()));
 
       // case 3: unroll (p, r)
       cugraph::unroll_p_r_or_q_r_edges<vertex_t, edge_t, false, false>(
@@ -834,18 +852,18 @@ k_truss(raft::handle_t const& handle,
         num_valid_edges,
         raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
         raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
-        raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()));
+        raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()));
 
       // Remove edges that have a triangle count of zero. Those should not be accounted
       // for during the unroling phase.
-      auto edges_with_triangle_last =
-        thrust::stable_partition(handle.get_thrust_policy(),
-                                 transposed_edge_triangle_count_pair_first,
-                                 transposed_edge_triangle_count_pair_first + num_triangles.size(),
-                                 [] __device__(auto e) {
-                                   auto num_triangles = thrust::get<1>(e);
-                                   return num_triangles > 0;
-                                 });
+      auto edges_with_triangle_last = thrust::stable_partition(
+        handle.get_thrust_policy(),
+        transposed_edge_triangle_count_pair_first,
+        transposed_edge_triangle_count_pair_first + (*num_triangles).size(),
+        [] __device__(auto e) {
+          auto num_triangles = thrust::get<1>(e);
+          return num_triangles > 0;
+        });
 
       auto num_edges_with_triangles = static_cast<size_t>(
         thrust::distance(transposed_edge_triangle_count_pair_first, edges_with_triangle_last));
@@ -893,7 +911,7 @@ k_truss(raft::handle_t const& handle,
 
       edgelist_srcs.resize(num_edges_with_triangles, handle.get_stream());
       edgelist_dsts.resize(num_edges_with_triangles, handle.get_stream());
-      num_triangles.resize(num_edges_with_triangles, handle.get_stream());
+      (*num_triangles).resize(num_edges_with_triangles, handle.get_stream());
     }
 
     std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore, std::ignore) =
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ced3b7bedb1..d1dd2dec069 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -490,6 +490,11 @@ ConfigureTest(K_TRUSS_TEST community/k_truss_test.cpp)
 # - Triangle Count tests --------------------------------------------------------------------------
 ConfigureTest(TRIANGLE_COUNT_TEST community/triangle_count_test.cpp)
 
+###################################################################################################
+# - Edge Triangle Count tests --------------------------------------------------------------------------
+ConfigureTest(EDGE_TRIANGLE_COUNT_TEST community/edge_triangle_count_test.cpp)
+
+
 ###################################################################################################
 # - K-hop Neighbors tests -------------------------------------------------------------------------
 ConfigureTest(K_HOP_NBRS_TEST traversal/k_hop_nbrs_test.cpp)
@@ -590,6 +595,10 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG LOUVAIN tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_EGONET_TEST community/mg_egonet_test.cu)
 
+    ###############################################################################################
+    # - MG EDGE TRIANGLE COUNT tests --------------------------------------------------------------------------
+    ConfigureTestMG(MG_EDGE_TRIANGLE_COUNT_TEST community/mg_edge_triangle_count_test.cpp)
+
     ###############################################################################################
     # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------
     ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST
diff --git a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
index 7924d449897..798e767085e 100644
--- a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
@@ -152,13 +152,15 @@ class Tests_MGBetweennessCentrality
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
index c3417e96c03..1703f198a4c 100644
--- a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
@@ -142,12 +142,14 @@ class Tests_MGEdgeBetweennessCentrality
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight_view,
-        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_edge_weights_view =
diff --git a/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp b/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp
index ed24bee0923..76c52d52bfd 100644
--- a/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp
@@ -144,13 +144,15 @@ class Tests_MGEigenvectorCentrality
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-2. run SG Eigenvector Centrality
diff --git a/cpp/tests/centrality/mg_katz_centrality_test.cpp b/cpp/tests/centrality/mg_katz_centrality_test.cpp
index abe02b2287b..e38f87749b8 100644
--- a/cpp/tests/centrality/mg_katz_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_katz_centrality_test.cpp
@@ -151,13 +151,15 @@ class Tests_MGKatzCentrality
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 4-2. run SG Katz Centrality
diff --git a/cpp/tests/community/edge_triangle_count_test.cpp b/cpp/tests/community/edge_triangle_count_test.cpp
new file mode 100644
index 00000000000..8cefc2c31f4
--- /dev/null
+++ b/cpp/tests/community/edge_triangle_count_test.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/check_utilities.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <set>
+#include <vector>
+
+struct EdgeTriangleCount_Usecase {
+  bool edge_masking_{false};
+  bool check_correctness_{true};
+};
+
+template <typename input_usecase_t>
+class Tests_EdgeTriangleCount
+  : public ::testing::TestWithParam<std::tuple<EdgeTriangleCount_Usecase, input_usecase_t>> {
+ public:
+  Tests_EdgeTriangleCount() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // FIXME: There is an utility equivalent functor not
+  // supporting host vectors.
+  template <typename type_t>
+  struct host_nearly_equal {
+    const type_t threshold_ratio;
+    const type_t threshold_magnitude;
+
+    bool operator()(type_t lhs, type_t rhs) const
+    {
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+    }
+  };
+
+  // Host reference: for each edge (src, dst), intersect both adjacency lists and credit edge
+  // (src, v) once per common neighbor v, then scale every tally by 3. The binary searches assume
+  // the COO is sorted by (src, dst) -- NOTE(review): confirm graph_to_host_coo guarantees this.
+  template <typename vertex_t, typename edge_t>
+  std::vector<edge_t> edge_triangle_count_reference(std::vector<vertex_t> h_srcs,
+                                                    std::vector<vertex_t> h_dsts)
+  {
+    // Counters use edge_t to match the return type; the constructor zero-initializes them.
+    std::vector<edge_t> edge_triangle_counts(h_srcs.size(), edge_t{0});
+
+    for (size_t i = 0; i < h_srcs.size(); ++i) {  // edge centric implementation
+      // for each edge, find the intersection
+      auto src          = h_srcs[i];
+      auto dst          = h_dsts[i];
+      auto it_src_start = std::lower_bound(h_srcs.begin(), h_srcs.end(), src);
+      auto src_start    = std::distance(h_srcs.begin(), it_src_start);
+
+      auto src_end =
+        src_start + std::distance(it_src_start, std::upper_bound(it_src_start, h_srcs.end(), src));
+
+      auto it_dst_start = std::lower_bound(h_srcs.begin(), h_srcs.end(), dst);
+      auto dst_start    = std::distance(h_srcs.begin(), it_dst_start);
+      auto dst_end =
+        dst_start + std::distance(it_dst_start, std::upper_bound(it_dst_start, h_srcs.end(), dst));
+
+      std::set<vertex_t> nbr_intersection;
+      std::set_intersection(h_dsts.begin() + src_start,
+                            h_dsts.begin() + src_end,
+                            h_dsts.begin() + dst_start,
+                            h_dsts.begin() + dst_end,
+                            std::inserter(nbr_intersection, nbr_intersection.end()));
+      // Find the supporting edge (src, v). NOTE(review): the unused (dst, v) lookup was dropped.
+      for (auto v : nbr_intersection) {
+        auto it_edge  = std::lower_bound(h_dsts.begin() + src_start, h_dsts.begin() + src_end, v);
+        auto idx_edge = std::distance(h_dsts.begin(), it_edge);
+        edge_triangle_counts[idx_edge] += 1;
+      }
+    }
+
+    std::transform(edge_triangle_counts.begin(),
+                   edge_triangle_counts.end(),
+                   edge_triangle_counts.begin(),
+                   [](auto count) { return count * 3; });
+    return edge_triangle_counts;
+  }
+
+  // Runs SG edge_triangle_count and, when enabled, checks it against the host reference.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<EdgeTriangleCount_Usecase const&, input_usecase_t const&> const& param)
+  {
+    constexpr bool renumber                           = false;
+    auto [edge_triangle_count_usecase, input_usecase] = param;
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("SG Construct graph");
+    }
+
+    auto [graph, edge_weight, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, false, renumber, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), bool>> edge_mask{std::nullopt};
+    if (edge_triangle_count_usecase.edge_masking_) {
+      edge_mask =
+        cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
+      graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> d_edge_triangle_counts{std::nullopt};
+
+    // Start the timer before the algorithm so the "EdgeTriangleCount" entry measures the call.
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("EdgeTriangleCount");
+    }
+    auto d_cugraph_results =
+      cugraph::edge_triangle_count<vertex_t, edge_t, false>(handle, graph_view);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    std::tie(edgelist_srcs, edgelist_dsts, std::ignore, d_edge_triangle_counts, std::ignore) =
+      cugraph::decompress_to_edgelist(
+        handle,
+        graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional(d_cugraph_results.view()),
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt});  // FIXME: No longer needed
+
+    if (edge_triangle_count_usecase.check_correctness_) {
+      std::vector<vertex_t> h_srcs(edgelist_srcs.size());
+      std::vector<vertex_t> h_dsts(edgelist_dsts.size());
+      std::tie(h_srcs, h_dsts, std::ignore) = cugraph::test::graph_to_host_coo(
+        handle,
+        graph_view,
+        edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt,
+        std::optional<raft::device_span<vertex_t const>>(std::nullopt));
+
+      auto h_cugraph_edge_triangle_counts = cugraph::test::to_host(handle, *d_edge_triangle_counts);
+
+      auto h_reference_edge_triangle_counts =
+        edge_triangle_count_reference<vertex_t, edge_t>(h_srcs, h_dsts);
+
+      for (size_t i = 0; i < h_srcs.size(); ++i) {
+        ASSERT_EQ(h_cugraph_edge_triangle_counts[i], h_reference_edge_triangle_counts[i])
+          << "Edge triangle count values do not match with the reference values.";
+      }
+    }
+  }
+};
+
+using Tests_EdgeTriangleCount_File = Tests_EdgeTriangleCount<cugraph::test::File_Usecase>;
+using Tests_EdgeTriangleCount_Rmat = Tests_EdgeTriangleCount<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_EdgeTriangleCount_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+TEST_P(Tests_EdgeTriangleCount_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+TEST_P(Tests_EdgeTriangleCount_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+TEST_P(Tests_EdgeTriangleCount_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_EdgeTriangleCount_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(EdgeTriangleCount_Usecase{false, true},
+                      EdgeTriangleCount_Usecase{true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_EdgeTriangleCount_Rmat,
+  // enable correctness checks
+  ::testing::Combine(
+    ::testing::Values(EdgeTriangleCount_Usecase{false, true},
+                      EdgeTriangleCount_Usecase{true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_EdgeTriangleCount_Rmat,
+  // disable correctness checks for large graphs
+  // FIXME: High memory footprint. Perform nbr_intersection in chunks.
+  ::testing::Combine(
+    ::testing::Values(EdgeTriangleCount_Usecase{false, false},
+                      EdgeTriangleCount_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(16, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/mg_ecg_test.cpp b/cpp/tests/community/mg_ecg_test.cpp
index a5e02c4f532..c99f83fa2e8 100644
--- a/cpp/tests/community/mg_ecg_test.cpp
+++ b/cpp/tests/community/mg_ecg_test.cpp
@@ -127,12 +127,14 @@ class Tests_MGEcg : public ::testing::TestWithParam<std::tuple<Ecg_Usecase, inpu
     std::optional<
       cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
       sg_edge_weights{std::nullopt};
-    std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-      *handle_,
-      mg_graph_view,
-      mg_edge_weight_view,
-      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-      false);  // crate a SG graph with MG graph vertex IDs
+    std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+      cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        false);  // create an SG graph with MG graph vertex IDs
 
     auto const comm_rank = handle_->get_comms().get_rank();
     if (comm_rank == 0) {
diff --git a/cpp/tests/community/mg_edge_triangle_count_test.cpp b/cpp/tests/community/mg_edge_triangle_count_test.cpp
new file mode 100644
index 00000000000..89bdf870ccd
--- /dev/null
+++ b/cpp/tests/community/mg_edge_triangle_count_test.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+struct EdgeTriangleCount_Usecase {
+  bool edge_masking_{false};
+  bool check_correctness_{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGEdgeTriangleCount
+  : public ::testing::TestWithParam<std::tuple<EdgeTriangleCount_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGEdgeTriangleCount() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running EdgeTriangleCount on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t>
+  void run_current_test(EdgeTriangleCount_Usecase const& edge_triangle_count_usecase,
+                        input_usecase_t const& input_usecase)
+  {
+    using weight_t = float;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, false, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, false, true, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (edge_triangle_count_usecase.edge_masking_) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG EdgeTriangleCount
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG EdgeTriangleCount");
+    }
+
+    auto d_mg_cugraph_results =
+      cugraph::edge_triangle_count<vertex_t, edge_t, true>(*handle_, mg_graph_view);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. Compare SG & MG results
+
+    if (edge_triangle_count_usecase.check_correctness_) {
+      // 3-1. Convert to SG graph
+
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::optional<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, edge_t>>
+        d_sg_cugraph_results{std::nullopt};
+      std::tie(sg_graph, std::ignore, d_sg_cugraph_results, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          // FIXME: Update 'create_graph_from_edgelist' to support int32_t and int64_t values
+          std::make_optional(d_mg_cugraph_results.view()),
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
+
+      if (handle_->get_comms().get_rank() == int{0}) {
+        // 3-2. Convert the MG triangle counts stored as 'edge_property_t' to device vector
+
+        auto [edgelist_srcs,
+              edgelist_dsts,
+              d_edgelist_weights,
+              d_edge_triangle_counts,
+              d_edgelist_type] =
+          cugraph::decompress_to_edgelist(
+            *handle_,
+            sg_graph.view(),
+            std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+            // FIXME: Update 'decompress_edgelist' to support int32_t and int64_t values
+            std::make_optional((*d_sg_cugraph_results).view()),
+            std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+            std::optional<raft::device_span<vertex_t const>>{
+              std::nullopt});  // FIXME: No longer needed
+
+        // 3-3. Run SG EdgeTriangleCount
+
+        auto ref_d_sg_cugraph_results =
+          cugraph::edge_triangle_count<vertex_t, edge_t, false>(*handle_, sg_graph.view());
+        auto [ref_edgelist_srcs,
+              ref_edgelist_dsts,
+              ref_d_edgelist_weights,
+              ref_d_edge_triangle_counts,
+              ref_d_edgelist_type] =
+          cugraph::decompress_to_edgelist(
+            *handle_,
+            sg_graph.view(),
+            std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+            std::make_optional(ref_d_sg_cugraph_results.view()),
+            std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+            std::optional<raft::device_span<vertex_t const>>{
+              std::nullopt});  // FIXME: No longer needed
+
+        // 3-4. Compare
+
+        auto h_mg_edge_triangle_counts = cugraph::test::to_host(*handle_, *d_edge_triangle_counts);
+        auto h_sg_edge_triangle_counts =
+          cugraph::test::to_host(*handle_, *ref_d_edge_triangle_counts);
+
+        ASSERT_TRUE(std::equal(h_mg_edge_triangle_counts.begin(),
+                               h_mg_edge_triangle_counts.end(),
+                               h_sg_edge_triangle_counts.begin()));
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGEdgeTriangleCount<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGEdgeTriangleCount_File = Tests_MGEdgeTriangleCount<cugraph::test::File_Usecase>;
+using Tests_MGEdgeTriangleCount_Rmat = Tests_MGEdgeTriangleCount<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGEdgeTriangleCount_File, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGEdgeTriangleCount_Rmat, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGEdgeTriangleCount_Rmat, CheckInt32Int64)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGEdgeTriangleCount_Rmat, CheckInt64Int64)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_tests,
+  Tests_MGEdgeTriangleCount_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(EdgeTriangleCount_Usecase{false, true},
+                      EdgeTriangleCount_Usecase{true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_tests,
+  Tests_MGEdgeTriangleCount_Rmat,
+  ::testing::Combine(
+    ::testing::Values(EdgeTriangleCount_Usecase{false, true},
+                      EdgeTriangleCount_Usecase{true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGEdgeTriangleCount_Rmat,
+  ::testing::Combine(
+    ::testing::Values(EdgeTriangleCount_Usecase{false, false},
+                      EdgeTriangleCount_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/mg_egonet_test.cu b/cpp/tests/community/mg_egonet_test.cu
index 66ab1f47312..ac363df3ec5 100644
--- a/cpp/tests/community/mg_egonet_test.cu
+++ b/cpp/tests/community/mg_egonet_test.cu
@@ -199,13 +199,15 @@ class Tests_MGEgonet
                      triplet_first + d_mg_aggregate_edgelist_src.size());
       }
 
-      auto [sg_graph, sg_edge_weights, sg_number_map] =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto d_mg_aggregate_edgelist_offsets =
diff --git a/cpp/tests/community/mg_leiden_test.cpp b/cpp/tests/community/mg_leiden_test.cpp
index f1a2fc83192..65f4827ba06 100644
--- a/cpp/tests/community/mg_leiden_test.cpp
+++ b/cpp/tests/community/mg_leiden_test.cpp
@@ -87,12 +87,14 @@ class Tests_MGLeiden
     std::optional<
       cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
       sg_edge_weights{std::nullopt};
-    std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-      *handle_,
-      mg_graph_view,
-      mg_edge_weight_view,
-      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-      false);  // crate an SG graph with MG graph vertex IDs
+    std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+      cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        false);  // create an SG graph with MG graph vertex IDs
 
     // FIXME: We need to figure out how to test each iteration of
     // SG vs MG Leiden, possibly by passing results of refinement phase
diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp
index 733ee9368ac..106ad2562f7 100644
--- a/cpp/tests/community/mg_louvain_test.cpp
+++ b/cpp/tests/community/mg_louvain_test.cpp
@@ -85,12 +85,14 @@ class Tests_MGLouvain
     std::optional<
       cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
       sg_edge_weights{std::nullopt};
-    std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-      *handle_,
-      mg_graph_view,
-      mg_edge_weight_view,
-      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-      false);  // crate an SG graph with MG graph vertex IDs
+    std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+      cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        false);  // create an SG graph with MG graph vertex IDs
 
     weight_t sg_modularity{-1.0};
 
diff --git a/cpp/tests/community/mg_triangle_count_test.cpp b/cpp/tests/community/mg_triangle_count_test.cpp
index ca3e0b2ac8f..932ff5050f1 100644
--- a/cpp/tests/community/mg_triangle_count_test.cpp
+++ b/cpp/tests/community/mg_triangle_count_test.cpp
@@ -178,13 +178,15 @@ class Tests_MGTriangleCount
                                           d_mg_triangle_counts.size()));
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 4-2. run SG TriangleCount
diff --git a/cpp/tests/community/mg_weighted_matching_test.cpp b/cpp/tests/community/mg_weighted_matching_test.cpp
index 21963922ab1..4f36ee36902 100644
--- a/cpp/tests/community/mg_weighted_matching_test.cpp
+++ b/cpp/tests/community/mg_weighted_matching_test.cpp
@@ -130,12 +130,14 @@ class Tests_MGWeightedMatching
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight_view,
-        std::optional<raft::device_span<vertex_t const>>(std::nullopt),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>(std::nullopt),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/components/mg_weakly_connected_components_test.cpp b/cpp/tests/components/mg_weakly_connected_components_test.cpp
index c510e3139fb..368fea68877 100644
--- a/cpp/tests/components/mg_weakly_connected_components_test.cpp
+++ b/cpp/tests/components/mg_weakly_connected_components_test.cpp
@@ -125,13 +125,15 @@ class Tests_MGWeaklyConnectedComponents
           raft::device_span<vertex_t const>(d_mg_components.data(), d_mg_components.size()));
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-2. run SG weakly connected components
diff --git a/cpp/tests/cores/mg_core_number_test.cpp b/cpp/tests/cores/mg_core_number_test.cpp
index ac99d7d4a93..f8294d81fdf 100644
--- a/cpp/tests/cores/mg_core_number_test.cpp
+++ b/cpp/tests/cores/mg_core_number_test.cpp
@@ -143,13 +143,15 @@ class Tests_MGCoreNumber
           raft::device_span<edge_t const>(d_mg_core_numbers.data(), d_mg_core_numbers.size()));
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-2. run SG CoreNumber
 
diff --git a/cpp/tests/cores/mg_k_core_test.cpp b/cpp/tests/cores/mg_k_core_test.cpp
index 100c7fa3bcf..28bc445bda8 100644
--- a/cpp/tests/cores/mg_k_core_test.cpp
+++ b/cpp/tests/cores/mg_k_core_test.cpp
@@ -160,13 +160,15 @@ class Tests_MGKCore : public ::testing::TestWithParam<std::tuple<KCore_Usecase,
           std::optional<raft::device_span<vertex_t const>>{std::nullopt},
           raft::device_span<edge_t const>(d_mg_core_numbers.data(), d_mg_core_numbers.size()));
 
-      auto [sg_graph, sg_edge_weights, sg_number_map] =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp
index 101a4fe1557..40a439ffc4c 100644
--- a/cpp/tests/link_analysis/mg_hits_test.cpp
+++ b/cpp/tests/link_analysis/mg_hits_test.cpp
@@ -186,13 +186,15 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-3. run SG Hits
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 6be451ac5fd..26136c8c9d2 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -202,13 +202,15 @@ class Tests_MGPageRank
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight_view,
-        std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map).data(),
-                                                              (*d_mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map).data(),
+                                                                (*d_mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 4-2. run SG PageRank
diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu
index ab51d701b57..b9c8f621ab8 100644
--- a/cpp/tests/mtmg/threaded_test_louvain.cu
+++ b/cpp/tests/mtmg/threaded_test_louvain.cu
@@ -384,12 +384,13 @@ class Tests_Multithreaded
             auto thread_handle = instance_manager->get_handle();
 
             if (thread_handle.get_rank() == 0) {
-              std::tie(sg_graph, sg_edge_weights, std::ignore) =
+              std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
                 cugraph::test::mg_graph_to_sg_graph(
                   thread_handle.raft_handle(),
                   graph_view.get(thread_handle),
                   edge_weights ? std::make_optional(edge_weights->get(thread_handle).view())
                                : std::nullopt,
+                  std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
                   std::optional<raft::device_span<vertex_t const>>{std::nullopt},
                   false);  // create an SG graph with MG graph vertex IDs
             } else {
@@ -398,6 +399,7 @@ class Tests_Multithreaded
                 graph_view.get(thread_handle),
                 edge_weights ? std::make_optional(edge_weights->get(thread_handle).view())
                              : std::nullopt,
+                std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
                 std::optional<raft::device_span<vertex_t const>>{std::nullopt},
                 false);  // create an SG graph with MG graph vertex IDs
             }
diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu
index 8ad1a20e585..137f7db8625 100644
--- a/cpp/tests/prims/mg_count_if_e.cu
+++ b/cpp/tests/prims/mg_count_if_e.cu
@@ -149,13 +149,15 @@ class Tests_MGCountIfE
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_count_if_v.cu b/cpp/tests/prims/mg_count_if_v.cu
index eb0e8cf9835..e3f30e37729 100644
--- a/cpp/tests/prims/mg_count_if_v.cu
+++ b/cpp/tests/prims/mg_count_if_v.cu
@@ -123,13 +123,15 @@ class Tests_MGCountIfV
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view         = sg_graph.view();
diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu
index 48b893f6fea..20e87070fa5 100644
--- a/cpp/tests/prims/mg_extract_transform_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_e.cu
@@ -253,13 +253,15 @@ class Tests_MGExtractTransformE
       }
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map_labels).data(),
-                                                              (*d_mg_renumber_map_labels).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map_labels).data(),
+                                                                (*d_mg_renumber_map_labels).size()),
+          false);
       rmm::device_uvector<result_t> sg_vertex_prop(0, handle_->get_stream());
       std::tie(std::ignore, sg_vertex_prop) =
         cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
index 3611a250afd..9e7611190ae 100644
--- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
@@ -283,13 +283,15 @@ class Tests_MGExtractTransformVFrontierOutgoingE
       }
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map_labels).data(),
-                                                              (*d_mg_renumber_map_labels).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*d_mg_renumber_map_labels).data(),
+                                                                (*d_mg_renumber_map_labels).size()),
+          false);
       rmm::device_uvector<result_t> sg_vertex_prop(0, handle_->get_stream());
       std::tie(std::ignore, sg_vertex_prop) =
         cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
index 762da62eeb8..75b711fbd9c 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
@@ -226,13 +226,15 @@ class Tests_MGPerVPairTransformDstNbrIntersection
         *handle_, std::get<1>(mg_result_buffer).data(), std::get<1>(mg_result_buffer).size());
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
index de78b42603d..48bbc6176d8 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
@@ -258,15 +258,17 @@ class Tests_MGPerVPairTransformDstNbrIntersection
                                  weight_t>>
         sg_edge_weight{std::nullopt};
 
-      std::tie(sg_graph, sg_edge_weight, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight
-          ? std::make_optional(mg_edge_weight_view)
-          : std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weight, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight
+            ? std::make_optional(mg_edge_weight_view)
+            : std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
index 97c7333cd2e..b99dbf16107 100644
--- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -320,13 +320,15 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
       }
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         std::optional<rmm::device_uvector<size_t>> mg_aggregate_sample_offsets{std::nullopt};
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
index efcfee9fc66..fd9192dcce5 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
@@ -297,13 +297,15 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       for (size_t i = 0; i < reduction_types.size(); ++i) {
         auto mg_aggregate_results =
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index e3eb56d5a6e..be29c793ad5 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -271,13 +271,15 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       for (size_t i = 0; i < reduction_types.size(); ++i) {
         auto mg_aggregate_in_results =
diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu
index 1449e8f9910..e91db5fa6ad 100644
--- a/cpp/tests/prims/mg_reduce_v.cu
+++ b/cpp/tests/prims/mg_reduce_v.cu
@@ -163,13 +163,15 @@ class Tests_MGReduceV
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
index 71cdf27fda1..4fac6ef3be7 100644
--- a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
+++ b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
@@ -174,13 +174,15 @@ class Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV
           raft::device_span<edge_t const>(mg_result_buffer.data(), mg_result_buffer.size()));
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index a086571d6e0..4785a8bb01b 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -159,13 +159,15 @@ class Tests_MGTransformReduceE
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
index a66c70ff586..9950b5bdbf4 100644
--- a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
@@ -237,13 +237,15 @@ class Tests_MGTransformReduceEBySrcDstKey
                           cugraph::get_dataframe_buffer_begin(mg_aggregate_by_dst_values));
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu
index c26085a55c4..f6f07bc03ab 100644
--- a/cpp/tests/prims/mg_transform_reduce_v.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v.cu
@@ -169,13 +169,15 @@ class Tests_MGTransformReduceV
 
     if (prims_usecase.check_correctness) {
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
index 07a0f7e7aab..335a7ec879c 100644
--- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
@@ -292,13 +292,15 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
       }
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         if constexpr (std::is_same_v<payload_t, void>) {
diff --git a/cpp/tests/structure/mg_coarsen_graph_test.cpp b/cpp/tests/structure/mg_coarsen_graph_test.cpp
index 1da30869545..471773d71bd 100644
--- a/cpp/tests/structure/mg_coarsen_graph_test.cpp
+++ b/cpp/tests/structure/mg_coarsen_graph_test.cpp
@@ -330,23 +330,26 @@ class Tests_MGCoarsenGraph
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
                                  weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight_view,
-        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          false);
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_coarse_graph(*handle_);
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
                                  weight_t>>
         sg_coarse_edge_weights{std::nullopt};
-      std::tie(sg_coarse_graph, sg_coarse_edge_weights, std::ignore) =
+      std::tie(sg_coarse_graph, sg_coarse_edge_weights, std::ignore, std::ignore) =
         cugraph::test::mg_graph_to_sg_graph(
           *handle_,
           mg_coarse_graph_view,
           mg_coarse_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
           std::optional<raft::device_span<vertex_t const>>{std::nullopt},
           false);
 
diff --git a/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp b/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp
index 45fac884f49..61f40049e31 100644
--- a/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp
+++ b/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp
@@ -126,13 +126,15 @@ class Tests_MGCountSelfLoopsAndMultiEdges
       // 3-1. aggregate MG results
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
index 0ee72726294..3d3d881fb23 100644
--- a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
+++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
@@ -204,13 +204,15 @@ class Tests_MGHasEdgeAndComputeMultiplicity
                                         d_mg_edge_multiplicities.size()));
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
diff --git a/cpp/tests/structure/mg_induced_subgraph_test.cu b/cpp/tests/structure/mg_induced_subgraph_test.cu
index 3b32c15bf9f..2ed909b9955 100644
--- a/cpp/tests/structure/mg_induced_subgraph_test.cu
+++ b/cpp/tests/structure/mg_induced_subgraph_test.cu
@@ -214,12 +214,14 @@ class Tests_MGInducedSubgraph
                                                         true,
                                                         handle_->get_stream());
 
-      auto [sg_graph, sg_edge_weights, sg_number_map] = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        mg_edge_weight_view,
-        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-        false);
+      auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          false);
 
       if (my_rank == 0) {
         auto d_sg_subgraph_offsets = cugraph::test::to_device(*handle_, h_sg_subgraph_offsets);
diff --git a/cpp/tests/structure/mg_symmetrize_test.cpp b/cpp/tests/structure/mg_symmetrize_test.cpp
index e607370f62a..7f1e4f04dc7 100644
--- a/cpp/tests/structure/mg_symmetrize_test.cpp
+++ b/cpp/tests/structure/mg_symmetrize_test.cpp
@@ -88,13 +88,15 @@ class Tests_MGSymmetrize
                                weight_t>>
       sg_edge_weights{std::nullopt};
     if (symmetrize_usecase.check_correctness) {
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph.view(),
-        mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph.view(),
+          mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
     }
 
     // 3. run MG symmetrize
diff --git a/cpp/tests/structure/mg_transpose_storage_test.cpp b/cpp/tests/structure/mg_transpose_storage_test.cpp
index c8b4f70f1e2..e870f648039 100644
--- a/cpp/tests/structure/mg_transpose_storage_test.cpp
+++ b/cpp/tests/structure/mg_transpose_storage_test.cpp
@@ -87,13 +87,15 @@ class Tests_MGTransposeStorage
                                weight_t>>
       sg_edge_weights{std::nullopt};
     if (transpose_storage_usecase.check_correctness) {
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph.view(),
-        mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph.view(),
+          mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
     }
 
     // 2. run MG transpose storage
diff --git a/cpp/tests/structure/mg_transpose_test.cpp b/cpp/tests/structure/mg_transpose_test.cpp
index 4428f8430d5..921cef42595 100644
--- a/cpp/tests/structure/mg_transpose_test.cpp
+++ b/cpp/tests/structure/mg_transpose_test.cpp
@@ -87,13 +87,15 @@ class Tests_MGTranspose
                                weight_t>>
       sg_edge_weights{std::nullopt};
     if (transpose_usecase.check_correctness) {
-      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph.view(),
-        mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph.view(),
+          mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
     }
 
     // 3. run MG transpose
diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp
index 431ed75c82d..1b63ad3b085 100644
--- a/cpp/tests/traversal/mg_bfs_test.cpp
+++ b/cpp/tests/traversal/mg_bfs_test.cpp
@@ -183,13 +183,15 @@ class Tests_MGBFS : public ::testing::TestWithParam<std::tuple<BFS_Usecase, inpu
         d_mg_source ? size_t{1} : size_t{0});
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-3. run SG BFS
diff --git a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
index 8484066c6a0..476a6ffab8f 100644
--- a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
+++ b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
@@ -237,13 +237,15 @@ class Tests_MGExtractBFSPaths
         cugraph::test::device_gatherv(*handle_, d_mg_paths.data(), d_mg_paths.size());
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // run SG extract_bfs_paths
diff --git a/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp b/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp
index 07ea107a2ed..64674fb3799 100644
--- a/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp
+++ b/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp
@@ -178,13 +178,15 @@ class Tests_MGKHopNbrs
         *handle_, raft::device_span<vertex_t const>(d_mg_nbrs.data(), d_mg_nbrs.size()));
 
       cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
-        *handle_,
-        mg_graph_view,
-        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
-        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
-                                                              (*mg_renumber_map).size()),
-        false);
+      std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-3. run SG K-hop neighbors
diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp
index 188d0eca115..9ad16d1c947 100644
--- a/cpp/tests/traversal/mg_sssp_test.cpp
+++ b/cpp/tests/traversal/mg_sssp_test.cpp
@@ -176,13 +176,15 @@ class Tests_MGSSSP : public ::testing::TestWithParam<std::tuple<SSSP_Usecase, in
       std::optional<
         cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
         sg_edge_weights{std::nullopt};
-      std::tie(sg_graph, sg_edge_weights, std::ignore) =
-        cugraph::test::mg_graph_to_sg_graph(*handle_,
-                                            mg_graph_view,
-                                            mg_edge_weight_view,
-                                            std::make_optional<raft::device_span<vertex_t const>>(
-                                              (*mg_renumber_map).data(), (*mg_renumber_map).size()),
-                                            false);
+      std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
 
       if (handle_->get_comms().get_rank() == int{0}) {
         // 3-3. run SG SSSP
diff --git a/cpp/tests/utilities/conversion_utilities.hpp b/cpp/tests/utilities/conversion_utilities.hpp
index 9b55f45d5bd..24a8ecbe4fd 100644
--- a/cpp/tests/utilities/conversion_utilities.hpp
+++ b/cpp/tests/utilities/conversion_utilities.hpp
@@ -216,15 +216,20 @@ graph_to_host_csc(
 
 // Only the rank 0 GPU holds the valid data
 template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
-std::tuple<cugraph::graph_t<vertex_t, edge_t, store_transposed, false>,
-           std::optional<cugraph::edge_property_t<
-             cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
-             weight_t>>,
-           std::optional<rmm::device_uvector<vertex_t>>>
+std::tuple<
+  cugraph::graph_t<vertex_t, edge_t, store_transposed, false>,
+  std::optional<
+    cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
+                             weight_t>>,
+  std::optional<
+    cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
+                             edge_t>>,
+  std::optional<rmm::device_uvector<vertex_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<vertex_t, edge_t, store_transposed, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
   std::optional<raft::device_span<vertex_t const>> renumber_map,
   bool renumber);
 
diff --git a/cpp/tests/utilities/conversion_utilities_impl.cuh b/cpp/tests/utilities/conversion_utilities_impl.cuh
index 6eb7357eedd..748a5731b89 100644
--- a/cpp/tests/utilities/conversion_utilities_impl.cuh
+++ b/cpp/tests/utilities/conversion_utilities_impl.cuh
@@ -283,23 +283,26 @@ template <typename vertex_t, typename edge_t, typename weight_t, bool store_tran
 std::tuple<
   cugraph::graph_t<vertex_t, edge_t, store_transposed, false>,
   std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, false>, weight_t>>,
+  std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, false>, edge_t>>,
   std::optional<rmm::device_uvector<vertex_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<vertex_t, edge_t, store_transposed, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
   std::optional<raft::device_span<vertex_t const>> renumber_map,
   bool renumber)
 {
   rmm::device_uvector<vertex_t> d_src(0, handle.get_stream());
   rmm::device_uvector<vertex_t> d_dst(0, handle.get_stream());
   std::optional<rmm::device_uvector<weight_t>> d_wgt{std::nullopt};
+  std::optional<rmm::device_uvector<edge_t>> d_edge_id{std::nullopt};
 
-  std::tie(d_src, d_dst, d_wgt, std::ignore, std::ignore) = cugraph::decompress_to_edgelist(
+  std::tie(d_src, d_dst, d_wgt, d_edge_id, std::ignore) = cugraph::decompress_to_edgelist(
     handle,
     graph_view,
     edge_weight_view,
-    std::optional<edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+    edge_id_view,
     std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
     renumber_map);
 
@@ -310,6 +313,9 @@ mg_graph_to_sg_graph(
   if (d_wgt)
     *d_wgt = cugraph::test::device_gatherv(
       handle, raft::device_span<weight_t const>{d_wgt->data(), d_wgt->size()});
+  if (d_edge_id)
+    *d_edge_id = cugraph::test::device_gatherv(
+      handle, raft::device_span<edge_t const>{d_edge_id->data(), d_edge_id->size()});
 
   rmm::device_uvector<vertex_t> vertices(0, handle.get_stream());
   if (renumber_map) { vertices = cugraph::test::device_gatherv(handle, *renumber_map); }
@@ -317,6 +323,8 @@ mg_graph_to_sg_graph(
   graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(handle);
   std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, false>, weight_t>>
     sg_edge_weights{std::nullopt};
+  std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, false>, edge_t>>
+    sg_edge_ids{std::nullopt};
   std::optional<rmm::device_uvector<vertex_t>> sg_number_map;
   if (handle.get_comms().get_rank() == 0) {
     if (!renumber_map) {
@@ -325,7 +333,7 @@ mg_graph_to_sg_graph(
         handle.get_stream(), vertices.data(), vertices.size(), vertex_t{0});
     }
 
-    std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, sg_number_map) =
+    std::tie(sg_graph, sg_edge_weights, sg_edge_ids, std::ignore, sg_number_map) =
       cugraph::create_graph_from_edgelist<vertex_t,
                                           edge_t,
                                           weight_t,
@@ -338,7 +346,7 @@ mg_graph_to_sg_graph(
         std::move(d_src),
         std::move(d_dst),
         std::move(d_wgt),
-        std::nullopt,
+        std::move(d_edge_id),
         std::nullopt,
         cugraph::graph_properties_t{graph_view.is_symmetric(), graph_view.is_multigraph()},
         renumber);
@@ -351,9 +359,16 @@ mg_graph_to_sg_graph(
       (*d_wgt).resize(0, handle.get_stream());
       (*d_wgt).shrink_to_fit(handle.get_stream());
     }
+    if (d_edge_id) {
+      (*d_edge_id).resize(0, handle.get_stream());
+      (*d_edge_id).shrink_to_fit(handle.get_stream());
+    }
   }
 
-  return std::make_tuple(std::move(sg_graph), std::move(sg_edge_weights), std::move(sg_number_map));
+  return std::make_tuple(std::move(sg_graph),
+                         std::move(sg_edge_weights),
+                         std::move(sg_edge_ids),
+                         std::move(sg_number_map));
 }
 
 template <typename vertex_t, typename value_t>
diff --git a/cpp/tests/utilities/conversion_utilities_mg.cu b/cpp/tests/utilities/conversion_utilities_mg.cu
index d657f868497..cb4703ec89b 100644
--- a/cpp/tests/utilities/conversion_utilities_mg.cu
+++ b/cpp/tests/utilities/conversion_utilities_mg.cu
@@ -381,132 +381,156 @@ graph_to_host_csc(
 template std::tuple<
   cugraph::graph_t<int32_t, int32_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, true, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, true, false>, int32_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int64_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, true, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, true, false>, int64_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int64_t, int64_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, true, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, true, false>, int64_t>>,
   std::optional<rmm::device_uvector<int64_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int64_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int32_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, true, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, true, false>, int32_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int64_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, true, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, true, false>, int64_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int64_t, int64_t, true, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, true, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, true, false>, int64_t>>,
   std::optional<rmm::device_uvector<int64_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int64_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int32_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, false, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, false, false>, int32_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int64_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, false, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, false, false>, int64_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int64_t, int64_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, false, false>, float>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, false, false>, int64_t>>,
   std::optional<rmm::device_uvector<int64_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int64_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int32_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, false, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int32_t, false, false>, int32_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int32_t, int64_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, false, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int32_t, int64_t, false, false>, int64_t>>,
   std::optional<rmm::device_uvector<int32_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int32_t const>> renumber_map,
   bool renumber);
 
 template std::tuple<
   cugraph::graph_t<int64_t, int64_t, false, false>,
   std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, false, false>, double>>,
+  std::optional<cugraph::edge_property_t<graph_view_t<int64_t, int64_t, false, false>, int64_t>>,
   std::optional<rmm::device_uvector<int64_t>>>
 mg_graph_to_sg_graph(
   raft::handle_t const& handle,
   cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<cugraph::edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<cugraph::edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
   std::optional<raft::device_span<int64_t const>> renumber_map,
   bool renumber);
 

From 31565696d420ae661aa75db84113ad65104c8da9 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 28 May 2024 08:58:51 -0500
Subject: [PATCH 15/23] Enable expression-based Dask Dataframe support (#4325)

**[WIP]** I'm using this PR to debug/add support for `DASK_DATAFRAME__QUERY_PLANNING=True`.

**NOTES**:
- Depends on https://github.com/dask/dask-expr/pull/1041 [Merged]
- Depends on https://github.com/dask/dask-expr/pull/1044

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/4325
---
 .../bulk_sampling/cugraph_bulk_sampling.py    |  2 +-
 ci/test_python.sh                             |  4 ----
 ci/test_wheel.sh                              |  4 ----
 python/cugraph/cugraph/dask/__init__.py       |  7 ++++++-
 .../cugraph/dask/common/input_utils.py        |  6 +++---
 .../cugraph/cugraph/dask/common/part_utils.py |  4 ++--
 .../cugraph/structure/convert_matrix.py       |  4 ++--
 .../simpleDistributedGraph.py                 | 19 ++++++++++---------
 .../data_store/test_property_graph_mg.py      |  4 ++--
 .../tests/internals/test_symmetrize_mg.py     | 13 ++++++++-----
 .../cugraph/tests/structure/test_graph_mg.py  |  6 +++---
 .../cugraph/tests/utils/test_dataset.py       |  2 +-
 12 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
index 95e1afcb28b..578e2520765 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
@@ -344,7 +344,7 @@ def generate_rmat_dataset(
     del label_df
     gc.collect()
 
-    dask_label_df = dask_cudf.from_dask_dataframe(dask_label_df)
+    dask_label_df = dask_label_df.to_backend("cudf")
 
     node_offsets = {"paper": 0}
     edge_offsets = {("paper", "cites", "paper"): 0}
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 5ea893eca60..9537f66e825 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -3,10 +3,6 @@
 
 set -euo pipefail
 
-# TODO: Enable dask query planning (by default) once some bugs are fixed.
-# xref: https://github.com/rapidsai/cudf/issues/15027
-export DASK_DATAFRAME__QUERY_PLANNING=False
-
 # Support invoking test_python.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
 
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
index cda40d92c74..158704e08d1 100755
--- a/ci/test_wheel.sh
+++ b/ci/test_wheel.sh
@@ -3,10 +3,6 @@
 
 set -eoxu pipefail
 
-# TODO: Enable dask query planning (by default) once some bugs are fixed.
-# xref: https://github.com/rapidsai/cudf/issues/15027
-export DASK_DATAFRAME__QUERY_PLANNING=False
-
 package_name=$1
 package_dir=$2
 
diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py
index a6958aaaf49..a76f1460575 100644
--- a/python/cugraph/cugraph/dask/__init__.py
+++ b/python/cugraph/cugraph/dask/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,6 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from dask import config
+
 from .link_analysis.pagerank import pagerank
 from .link_analysis.hits import hits
 from .traversal.bfs import bfs
@@ -34,3 +36,6 @@
 from .link_prediction.sorensen import sorensen
 from .link_prediction.overlap import overlap
 from .community.leiden import leiden
+
+# Avoid "p2p" shuffling in dask for now
+config.set({"dataframe.shuffle.method": "tasks"})
diff --git a/python/cugraph/cugraph/dask/common/input_utils.py b/python/cugraph/cugraph/dask/common/input_utils.py
index dcbd811562b..db70b7b089f 100644
--- a/python/cugraph/cugraph/dask/common/input_utils.py
+++ b/python/cugraph/cugraph/dask/common/input_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@
 
 from collections.abc import Sequence
 from collections import OrderedDict
-from dask_cudf.core import DataFrame as dcDataFrame
-from dask_cudf.core import Series as daskSeries
+from dask_cudf import DataFrame as dcDataFrame
+from dask_cudf import Series as daskSeries
 
 import cugraph.dask.comms.comms as Comms
 
diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py
index d362502f239..19c429bb7be 100644
--- a/python/cugraph/cugraph/dask/common/part_utils.py
+++ b/python/cugraph/cugraph/dask/common/part_utils.py
@@ -18,8 +18,8 @@
 import collections
 import dask_cudf
 from dask.array.core import Array as daskArray
-from dask_cudf.core import DataFrame as daskDataFrame
-from dask_cudf.core import Series as daskSeries
+from dask_cudf import DataFrame as daskDataFrame
+from dask_cudf import Series as daskSeries
 from functools import reduce
 import cugraph.dask.comms.comms as Comms
 from dask.delayed import delayed
diff --git a/python/cugraph/cugraph/structure/convert_matrix.py b/python/cugraph/cugraph/structure/convert_matrix.py
index b9b9554b870..024b9ddfba2 100644
--- a/python/cugraph/cugraph/structure/convert_matrix.py
+++ b/python/cugraph/cugraph/structure/convert_matrix.py
@@ -40,7 +40,7 @@ def from_edgelist(
 
     Parameters
     ----------
-    df : cudf.DataFrame, pandas.DataFrame, dask_cudf.core.DataFrame
+    df : cudf.DataFrame, pandas.DataFrame, dask_cudf.DataFrame
         This DataFrame contains columns storing edge source vertices,
         destination (or target following NetworkX's terminology) vertices, and
         (optional) weights.
@@ -95,7 +95,7 @@ def from_edgelist(
             renumber=renumber,
         )
 
-    elif df_type is dask_cudf.core.DataFrame:
+    elif df_type is dask_cudf.DataFrame:
         if create_using is None:
             G = Graph()
         elif isinstance(create_using, Graph):
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index 0ef5eaf1b9e..3fa92bb5e67 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -285,19 +285,20 @@ def __from_edgelist(
                 symmetrize=not self.properties.directed,
             )
 
+        # Create a dask_cudf dataframe from the cudf series
+        # or dataframe objects obtained from symmetrization
         if isinstance(source_col, dask_cudf.Series):
-            # Create a dask_cudf dataframe from the cudf series obtained
-            # from symmetrization
-            input_ddf = source_col.to_frame()
-            input_ddf = input_ddf.rename(columns={source_col.name: source})
-            input_ddf[destination] = dest_col
+            frames = [
+                source_col.to_frame(name=source),
+                dest_col.to_frame(name=destination),
+            ]
         else:
-            # Multi column dask_cudf dataframe
-            input_ddf = dask_cudf.concat([source_col, dest_col], axis=1)
+            frames = [source_col, dest_col]
 
         if value_col is not None:
-            for vc in value_col_names:
-                input_ddf[vc] = value_col[vc]
+            frames.append(value_col[value_col_names])
+
+        input_ddf = dask_cudf.concat(frames, axis=1)
 
         self.input_df = input_ddf
 
diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
index db4ab0a2ac1..42cb0f232bf 100644
--- a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
+++ b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
@@ -159,8 +159,8 @@ def df_type_id(dataframe_type):
         return s + "cudf.DataFrame"
     if dataframe_type == pd.DataFrame:
         return s + "pandas.DataFrame"
-    if dataframe_type == dask_cudf.core.DataFrame:
-        return s + "dask_cudf.core.DataFrame"
+    if dataframe_type == dask_cudf.DataFrame:
+        return s + "dask_cudf.DataFrame"
     return s + "?"
 
 
diff --git a/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py b/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py
index 913443fe400..9091ab7df57 100644
--- a/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py
+++ b/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py
@@ -232,14 +232,17 @@ def test_mg_symmetrize(dask_client, read_datasets):
 
     # create a dask DataFrame from the dask Series
     if isinstance(sym_src, dask_cudf.Series):
-        ddf2 = sym_src.to_frame()
-        ddf2 = ddf2.rename(columns={sym_src.name: "src"})
-        ddf2["dst"] = sym_dst
+        frames = [
+            sym_src.to_frame(name="src"),
+            sym_dst.to_frame(name="dst"),
+        ]
     else:
-        ddf2 = dask_cudf.concat([sym_src, sym_dst], axis=1)
+        frames = [sym_src, sym_dst]
 
     if val_col_name is not None:
-        ddf2["weight"] = sym_val
+        frames.append(sym_val.to_frame(name="weight"))
+
+    ddf2 = dask_cudf.concat(frames, axis=1)
 
     compare(ddf, ddf2, src_col_name, dst_col_name, val_col_name)
 
diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
index f23d4ec026d..cba61731e9a 100644
--- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py
+++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
@@ -99,13 +99,13 @@ def test_nodes_functionality(dask_client, input_combo):
     expected_nodes = (
         dask_cudf.concat([ddf["src"], ddf["dst"]])
         .drop_duplicates()
-        .to_frame()
-        .sort_values(0)
+        .to_frame(name="0")
+        .sort_values("0")
     )
 
     expected_nodes = expected_nodes.compute().reset_index(drop=True)
 
-    result_nodes["expected_nodes"] = expected_nodes[0]
+    result_nodes["expected_nodes"] = expected_nodes["0"]
 
     compare = result_nodes.query("result_nodes != expected_nodes")
 
diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index fae89e02002..a52b99dabfe 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -198,7 +198,7 @@ def test_reader_dask(dask_client, dataset):
     E = dataset.get_dask_edgelist(download=True)
 
     assert E is not None
-    assert isinstance(E, dask_cudf.core.DataFrame)
+    assert isinstance(E, dask_cudf.DataFrame)
     dataset.unload()
 
 

From 562b5a5b9f3db29184390c319468ccb488d21056 Mon Sep 17 00:00:00 2001
From: Tingyu Wang <tingyuw@nvidia.com>
Date: Tue, 28 May 2024 16:48:30 -0400
Subject: [PATCH 16/23] Pin torch version in `cugraph-dgl` wheel test (#4447)

To fix the [CI nightly issue](https://github.com/rapidsai/cugraph/actions/runs/9188624604/job/25298484338#step:8:837) in cugraph-dgl wheel test for CUDA 11.

Authors:
  - Tingyu Wang (https://github.com/tingyu66)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Brad Rees (https://github.com/BradReesWork)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cugraph/pull/4447
---
 ci/test_wheel_cugraph-dgl.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/ci/test_wheel_cugraph-dgl.sh b/ci/test_wheel_cugraph-dgl.sh
index 827ad487115..564b46cb07e 100755
--- a/ci/test_wheel_cugraph-dgl.sh
+++ b/ci/test_wheel_cugraph-dgl.sh
@@ -32,8 +32,18 @@ fi
 PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}"
 DGL_URL="https://data.dgl.ai/wheels/cu${PYTORCH_CUDA_VER}/repo.html"
 
+# Starting from 2.2, PyTorch wheels depend on nvidia-nccl-cuxx>=2.19 wheel and
+# dynamically link to NCCL. RAPIDS CUDA 11 CI images have an older NCCL version that
+# might shadow the newer NCCL required by PyTorch during import (when importing
+# `cupy` before `torch`).
+if [[ "${NCCL_VERSION}" < "2.19" ]]; then
+  PYTORCH_VER="2.1.0"
+else
+  PYTORCH_VER="2.3.0"
+fi
+
 rapids-logger "Installing PyTorch and DGL"
-rapids-retry python -m pip install torch --index-url ${PYTORCH_URL}
+rapids-retry python -m pip install "torch==${PYTORCH_VER}" --index-url ${PYTORCH_URL}
 rapids-retry python -m pip install dgl==2.0.0 --find-links ${DGL_URL}
 
 python -m pytest python/cugraph-dgl/tests

From 169d1625fd93d99c481051f78047464e43fdee02 Mon Sep 17 00:00:00 2001
From: Don Acosta <97529984+acostadon@users.noreply.github.com>
Date: Wed, 29 May 2024 08:43:47 -0400
Subject: [PATCH 17/23] adding notebook to demo nx_cugraph (#4366)

This notebook will be used to demonstrate how to use nx-cugraph and show the speed-up.

Authors:
  - Don Acosta (https://github.com/acostadon)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/cugraph/pull/4366
---
 docs/cugraph/source/nx_cugraph/nx_cugraph.md  | 159 +++---------
 .../nx_cugraph_codeless_switching.ipynb       | 244 ++++++++++++++++++
 2 files changed, 275 insertions(+), 128 deletions(-)
 create mode 100644 notebooks/cugraph_benchmarks/nx_cugraph_codeless_switching.ipynb

diff --git a/docs/cugraph/source/nx_cugraph/nx_cugraph.md b/docs/cugraph/source/nx_cugraph/nx_cugraph.md
index 92fbf90a43b..ff2fc3d1da8 100644
--- a/docs/cugraph/source/nx_cugraph/nx_cugraph.md
+++ b/docs/cugraph/source/nx_cugraph/nx_cugraph.md
@@ -24,142 +24,45 @@ Each chart has three measurements.
 ![Single Source Shortest Path](../images/sssp.png)
 ![Weakly Connected Components](../images/wcc.png)
 
+### Command line example
+Open bc_demo.ipy and paste the code below.
 
-The following algorithms are supported and automatically dispatched to nx-cuGraph for acceleration.
+```
+import pandas as pd
+import networkx as nx
+
+url = "https://data.rapids.ai/cugraph/datasets/cit-Patents.csv"
+df = pd.read_csv(url, sep=" ", names=["src", "dst"], dtype="int32")
+G = nx.from_pandas_edgelist(df, source="src", target="dst")
 
-#### Algorithms
+%time result = nx.betweenness_centrality(G, k=10)
+```
+Run the command:
 ```
-bipartite
- ├─ basic
- │   └─ is_bipartite
- └─ generators
-     └─ complete_bipartite_graph
-centrality
- ├─ betweenness
- │   ├─ betweenness_centrality
- │   └─ edge_betweenness_centrality
- ├─ degree_alg
- │   ├─ degree_centrality
- │   ├─ in_degree_centrality
- │   └─ out_degree_centrality
- ├─ eigenvector
- │   └─ eigenvector_centrality
- └─ katz
-     └─ katz_centrality
-cluster
- ├─ average_clustering
- ├─ clustering
- ├─ transitivity
- └─ triangles
-community
- └─ louvain
-     └─ louvain_communities
-components
- ├─ connected
- │   ├─ connected_components
- │   ├─ is_connected
- │   ├─ node_connected_component
- │   └─ number_connected_components
- └─ weakly_connected
-     ├─ is_weakly_connected
-     ├─ number_weakly_connected_components
-     └─ weakly_connected_components
-core
- ├─ core_number
- └─ k_truss
-dag
- ├─ ancestors
- └─ descendants
-isolate
- ├─ is_isolate
- ├─ isolates
- └─ number_of_isolates
-link_analysis
- ├─ hits_alg
- │   └─ hits
- └─ pagerank_alg
-     └─ pagerank
-operators
- └─ unary
-     ├─ complement
-     └─ reverse
-reciprocity
- ├─ overall_reciprocity
- └─ reciprocity
-shortest_paths
- └─ unweighted
-     ├─ single_source_shortest_path_length
-     └─ single_target_shortest_path_length
-traversal
- └─ breadth_first_search
-     ├─ bfs_edges
-     ├─ bfs_layers
-     ├─ bfs_predecessors
-     ├─ bfs_successors
-     ├─ bfs_tree
-     ├─ descendants_at_distance
-     └─ generic_bfs_edges
-tree
- └─ recognition
-     ├─ is_arborescence
-     ├─ is_branching
-     ├─ is_forest
-     └─ is_tree
+user@machine:/# ipython bc_demo.ipy
 ```
 
-#### Generators
+You will observe a run time of approximately 7 minutes...more or less depending on your cpu.
+
+Run the command again, this time specifying cugraph as the NetworkX backend of choice.
+```
+user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy
 ```
-classic
- ├─ barbell_graph
- ├─ circular_ladder_graph
- ├─ complete_graph
- ├─ complete_multipartite_graph
- ├─ cycle_graph
- ├─ empty_graph
- ├─ ladder_graph
- ├─ lollipop_graph
- ├─ null_graph
- ├─ path_graph
- ├─ star_graph
- ├─ tadpole_graph
- ├─ trivial_graph
- ├─ turan_graph
- └─ wheel_graph
-community
- └─ caveman_graph
-small
- ├─ bull_graph
- ├─ chvatal_graph
- ├─ cubical_graph
- ├─ desargues_graph
- ├─ diamond_graph
- ├─ dodecahedral_graph
- ├─ frucht_graph
- ├─ heawood_graph
- ├─ house_graph
- ├─ house_x_graph
- ├─ icosahedral_graph
- ├─ krackhardt_kite_graph
- ├─ moebius_kantor_graph
- ├─ octahedral_graph
- ├─ pappus_graph
- ├─ petersen_graph
- ├─ sedgewick_maze_graph
- ├─ tetrahedral_graph
- ├─ truncated_cube_graph
- ├─ truncated_tetrahedron_graph
- └─ tutte_graph
-social
- ├─ davis_southern_women_graph
- ├─ florentine_families_graph
- ├─ karate_club_graph
- └─ les_miserables_graph
+This run will be much faster, typically around 20 seconds depending on your GPU.
+```
+user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy
+```
+There is also an option to add caching. This will dramatically help performance when running multiple algorithms on the same graph.
+```
+NETWORKX_BACKEND_PRIORITY=cugraph CACHE_CONVERTED_GRAPH=True ipython bc_demo.ipy
 ```
 
-#### Other
+When running Python interactively, the cugraph backend can be specified as an argument in the algorithm call.
 
+For example:
 ```
-convert_matrix
- ├─ from_pandas_edgelist
- └─ from_scipy_sparse_array
+nx.betweenness_centrality(cit_patents_graph, k=k, backend="cugraph")
 ```
+
+
+The latest list of algorithms that can be dispatched to nx-cuGraph for acceleration is found [here](https://github.com/rapidsai/cugraph/blob/main/python/nx-cugraph/README.md#algorithms).
diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_codeless_switching.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_codeless_switching.ipynb
new file mode 100644
index 00000000000..e05544448b1
--- /dev/null
+++ b/notebooks/cugraph_benchmarks/nx_cugraph_codeless_switching.ipynb
@@ -0,0 +1,244 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Benchmarking Performance of NetworkX with Rapids GPU-based nx_cugraph backend vs on cpu\n",
+    "# Skip notebook test\n",
+    "This notebook demonstrates and compares the performance of nx_cugraph as a dispatcher for NetworkX algorithms. \n",
+    "\n",
+    "We do this by executing Betweenness Centrality, Breadth First Search and Louvain Community Detection, collecting run times with and without nx_cugraph backend and graph caching enabled. nx_cugraph is a registered NetworkX backend. Using it is a zero code change solution.\n",
+    "\n",
+    "In the notebook switching to the nx-cugraph backend is done via variables set using the [NetworkX config package](https://networkx.org/documentation/stable/reference/backends.html#networkx.utils.configs.NetworkXConfig) **which requires networkX 3.3 or later !!**\n",
+    "\n",
+    "\n",
+    "They can be set at the command line as well.\n",
+    "\n",
+    "### See this example from GTC Spring 2024\n",
+    "\n",
+    "\n",
+    "\n",
+    "Here is a sample minimal script to demonstrate No-code-change GPU acceleration using nx-cugraph.\n",
+    "\n",
+    "----\n",
+    "bc_demo.ipy:\n",
+    "\n",
+    "```\n",
+    "import pandas as pd\n",
+    "import networkx as nx\n",
+    "\n",
+    "url = \"https://data.rapids.ai/cugraph/datasets/cit-Patents.csv\"\n",
+    "df = pd.read_csv(url, sep=\" \", names=[\"src\", \"dst\"], dtype=\"int32\")\n",
+    "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")\n",
+    "\n",
+    "%time result = nx.betweenness_centrality(G, k=10)\n",
+    "```\n",
+    "----\n",
+    "Running it first without and then with the nx-cugraph backend looks like this:\n",
+    "```\n",
+    "user@machine:/# ipython bc_demo.ipy\n",
+    "CPU times: user 7min 38s, sys: 5.6 s, total: 7min 44s\n",
+    "Wall time: 7min 44s\n",
+    "\n",
+    "user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy\n",
+    "CPU times: user 18.4 s, sys: 1.44 s, total: 19.9 s\n",
+    "Wall time: 20 s\n",
+    "```\n",
+    "----\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First import the needed packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import networkx as nx\n",
+    "import time\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This installs the NetworkX cuGraph dispatcher if not already present."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try: \n",
+    "    import nx_cugraph\n",
+    "except ModuleNotFoundError:\n",
+    "    os.system('conda install -c rapidsai -c conda-forge -c nvidia nx-cugraph')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is boiler plate NetworkX code to run:\n",
+    "* betweenness Centrality\n",
+    "* Breadth First Search\n",
+    "* Louvain community detection\n",
+    "\n",
+    "and report times. It is completely unaware of cugraph or GPU-based tools.\n",
+    "[NetworkX configurations](https://networkx.org/documentation/stable/reference/utils.html#backends) can determine how they are run."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_algos(G):\n",
+    "    runtime = time.time()\n",
+    "    result = nx.betweenness_centrality(G, k=10)\n",
+    "    print (\"Betweenness Centrality time: \" + str(round(time.time() - runtime))+ \" seconds\")\n",
+    "    runtime = time.time()\n",
+    "    result = nx.bfs_tree(G,source=1)\n",
+    "    print (\"Breadth First Search time:  \" + str(round(time.time() - runtime))+ \" seconds\")\n",
+    "    runtime = time.time()\n",
+    "    result = nx.community.louvain_communities(G,threshold=1e-04)\n",
+    "    print (\"Louvain time: \" + str(round(time.time() - runtime))+ \" seconds\")\n",
+    "    return"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Downloads a patent citation dataset containing 3774768 nodes and 16518948 edges and loads it into a NetworkX graph."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filepath = \"./data/cit-Patents.csv\"\n",
+    "\n",
+    "if os.path.exists(filepath):\n",
+    "    print(\"File found\")\n",
+    "    url = filepath\n",
+    "else:\n",
+    "    url = \"https://data.rapids.ai/cugraph/datasets/cit-Patents.csv\"\n",
+    "df = pd.read_csv(url, sep=\" \", names=[\"src\", \"dst\"], dtype=\"int32\")\n",
+    "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The NetworkX dispatcher can be set with an environment variable or in code using the NetworkX config package, which is new in [NetworkX 3.3 config](https://networkx.org/documentation/stable/reference/backends.html#networkx.utils.configs.NetworkXConfig).\n",
+    "\n",
+    "These convenience settings allow turning off caching and cugraph dispatching if you want to see how long CPU-only takes.\n",
+    "This example, run on an AMD Ryzen Threadripper PRO 3975WX 32-Cores CPU, completed in slightly over 40 minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "use_cugraph = True\n",
+    "cache_graph = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if use_cugraph:\n",
+    "    nx.config[\"backend_priority\"]=['cugraph']\n",
+    "else:\n",
+    "    # Use this setting to turn off the cugraph dispatcher and run in legacy CPU mode.\n",
+    "    nx.config[\"backend_priority\"]=[]\n",
+    "if cache_graph:\n",
+    "    nx.config[\"cache_converted_graphs\"]= True\n",
+    "else:\n",
+    "    # Use this setting to turn off graph caching, which will convert the NetworkX graph to a GPU-resident graph each time an algorithm is run.\n",
+    "    nx.config[\"cache_converted_graphs\"]= False\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run the algorithms on GPU. \n",
+    "\n",
+    "**Note the messages NetworkX generates to remind us that the cached graph shouldn't be modified.**\n",
+    "\n",
+    "```\n",
+    "For the cache to be consistent (i.e., correct), the input graph must not have been manually mutated since the cached graph was created.\n",
+    "\n",
+    "Using cached graph for 'cugraph' backend in call to bfs_edges.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "run_algos(G)\n",
+    "print (\"Total Algorithm run time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "___\n",
+    "Copyright (c) 2024, NVIDIA CORPORATION.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
+    "\n",
+    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
+    "___"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 4c797bfa251d36f57870cc9ca8636d3098be964c Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 29 May 2024 11:07:57 -0700
Subject: [PATCH 18/23] Fix building cugraph with CCCL main (#4404)

Similar to https://github.com/rapidsai/cudf/pull/15552, we are testing [building RAPIDS with CCCL's main branch](https://github.com/NVIDIA/cccl/pull/1667) to get ahead of any breaking changes.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)
  - Ralph Liu (https://github.com/nv-rliu)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Ray Bell (https://github.com/raybellwaves)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cugraph/pull/4404
---
 .devcontainer/Dockerfile                      |   5 +
 .../cuda11.8-conda/devcontainer.json          |   2 +-
 .devcontainer/cuda11.8-pip/devcontainer.json  |   8 +-
 .../cuda12.2-conda/devcontainer.json          |   2 +-
 .devcontainer/cuda12.2-pip/devcontainer.json  |   8 +-
 .github/workflows/pr.yaml                     |   2 +-
 cpp/CMakeLists.txt                            |   4 +-
 .../cugraph/utilities/device_functors.cuh     |   9 +-
 cpp/include/cugraph/utilities/mask_utils.cuh  |   5 +-
 cpp/src/community/detail/common_methods.cuh   |   3 +-
 cpp/src/community/legacy/louvain.cuh          |  15 ++-
 .../weakly_connected_components_impl.cuh      |  15 ++-
 cpp/src/detail/utility_wrappers.cu            |   4 +-
 cpp/src/prims/kv_store.cuh                    |   1 +
 ...m_reduce_dst_key_aggregated_outgoing_e.cuh |   2 +-
 cpp/src/structure/graph_view_impl.cuh         |  36 +++---
 cpp/tests/CMakeLists.txt                      |  24 +++-
 cpp/tests/prims/mg_extract_transform_e.cu     | 109 +++++-------------
 .../sampling/sampling_post_processing_test.cu |  38 +++---
 19 files changed, 138 insertions(+), 154 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 3d0ac075be3..190003dd7af 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -7,6 +7,11 @@ FROM ${BASE} as pip-base
 
 ENV DEFAULT_VIRTUAL_ENV=rapids
 
+RUN apt update -y \
+ && DEBIAN_FRONTEND=noninteractive apt install -y \
+    libblas-dev liblapack-dev \
+ && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
+
 FROM ${BASE} as conda-base
 
 ENV DEFAULT_CONDA_ENV=rapids
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 7c9cd0258a4..d878f2d6584 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index a4dc168505b..a0edcb27df8 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,19 +5,16 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.15.0"
-    },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "11.8",
       "installcuBLAS": true,
@@ -28,7 +25,6 @@
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
-    "ghcr.io/rapidsai/devcontainers/features/ucx",
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index eae4967f3b2..8a095d9b934 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 393a5c63d23..10436f8b28d 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -5,19 +5,16 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ucx1.15.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.15.0"
-    },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "12.2",
       "installcuBLAS": true,
@@ -28,7 +25,6 @@
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
-    "ghcr.io/rapidsai/devcontainers/features/ucx",
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index c04e0e879d2..5733646a8b9 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -196,5 +196,5 @@ jobs:
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
       build_command: |
         sccache -z;
-        build-all --verbose -j$(nproc --ignore=1);
+        build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON;
         sccache -s;
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2527599fece..7dca3d983a5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -92,14 +92,14 @@ set(CUGRAPH_CXX_FLAGS "")
 set(CUGRAPH_CUDA_FLAGS "")
 
 if(CMAKE_COMPILER_IS_GNUCXX)
-    list(APPEND CUGRAPH_CXX_FLAGS -Werror -Wno-error=deprecated-declarations)
+    list(APPEND CUGRAPH_CXX_FLAGS -Werror -Wno-error=deprecated-declarations -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
 
 message("-- Building for GPU_ARCHS = ${CMAKE_CUDA_ARCHITECTURES}")
 
 list(APPEND CUGRAPH_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
-list(APPEND CUGRAPH_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas=--disable-warnings)
+list(APPEND CUGRAPH_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS -Xptxas=--disable-warnings)
 list(APPEND CUGRAPH_CUDA_FLAGS -Xcompiler=-Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable)
 list(APPEND CUGRAPH_CUDA_FLAGS -Xfatbin=-compress-all)
 
diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index 3af8ed1dd19..20cf98f7e6d 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -78,13 +78,14 @@ struct indirection_t {
 
 template <typename index_t, typename Iterator>
 struct indirection_if_idx_valid_t {
+  using value_type = typename thrust::iterator_traits<Iterator>::value_type;
   Iterator first{};
   index_t invalid_idx{};
-  typename thrust::iterator_traits<Iterator>::value_type invalid_value{};
+  value_type invalid_value{};
 
-  __device__ typename thrust::iterator_traits<Iterator>::value_type operator()(index_t i) const
+  __device__ value_type operator()(index_t i) const
   {
-    return (i != invalid_idx) ? *(first + i) : invalid_value;
+    return (i != invalid_idx) ? static_cast<value_type>(*(first + i)) : invalid_value;
   }
 };
 
diff --git a/cpp/include/cugraph/utilities/mask_utils.cuh b/cpp/include/cugraph/utilities/mask_utils.cuh
index 7b69ea3fe3a..1d86eef0ed1 100644
--- a/cpp/include/cugraph/utilities/mask_utils.cuh
+++ b/cpp/include/cugraph/utilities/mask_utils.cuh
@@ -20,6 +20,7 @@
 
 #include <raft/core/handle.hpp>
 
+#include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -160,13 +161,13 @@ size_t count_set_bits(raft::handle_t const& handle, MaskIterator mask_first, siz
     handle.get_thrust_policy(),
     thrust::make_counting_iterator(size_t{0}),
     thrust::make_counting_iterator(packed_bool_size(num_bits)),
-    [mask_first, num_bits] __device__(size_t i) {
+    cuda::proclaim_return_type<size_t>([mask_first, num_bits] __device__(size_t i) -> size_t {
       auto word = *(mask_first + i);
       if ((i + 1) * packed_bools_per_word() > num_bits) {
         word &= packed_bool_partial_mask(num_bits % packed_bools_per_word());
       }
       return static_cast<size_t>(__popc(word));
-    },
+    }),
     size_t{0},
     thrust::plus<size_t>{});
 }
diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh
index fe0a415db30..dcad4e92b95 100644
--- a/cpp/src/community/detail/common_methods.cuh
+++ b/cpp/src/community/detail/common_methods.cuh
@@ -29,6 +29,7 @@
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
 
+#include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
@@ -178,7 +179,7 @@ weight_t compute_modularity(
     handle.get_thrust_policy(),
     cluster_weights.begin(),
     cluster_weights.end(),
-    [] __device__(weight_t p) { return p * p; },
+    cuda::proclaim_return_type<weight_t>([] __device__(weight_t p) -> weight_t { return p * p; }),
     weight_t{0},
     thrust::plus<weight_t>());
 
diff --git a/cpp/src/community/legacy/louvain.cuh b/cpp/src/community/legacy/louvain.cuh
index 6cf5bbdc3c6..53d0b231c03 100644
--- a/cpp/src/community/legacy/louvain.cuh
+++ b/cpp/src/community/legacy/louvain.cuh
@@ -22,6 +22,7 @@
 
 #include <cugraph/dendrogram.hpp>
 #include <cugraph/legacy/graph.hpp>
+
 #ifdef TIMING
 #include <cugraph/utilities/high_res_timer.hpp>
 #endif
@@ -29,6 +30,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -141,12 +143,13 @@ class Louvain {
       handle_.get_thrust_policy(),
       thrust::make_counting_iterator(0),
       thrust::make_counting_iterator(graph.number_of_vertices),
-      [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__(
-        vertex_t community) {
-        return ((d_inc[community] / total_edge_weight) - resolution *
-                                                           (d_deg[community] * d_deg[community]) /
-                                                           (total_edge_weight * total_edge_weight));
-      },
+      cuda::proclaim_return_type<weight_t>(
+        [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__(
+          vertex_t community) -> weight_t {
+          return ((d_inc[community] / total_edge_weight) -
+                  resolution * (d_deg[community] * d_deg[community]) /
+                    (total_edge_weight * total_edge_weight));
+        }),
       weight_t{0.0},
       thrust::plus<weight_t>());
 
diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh
index d4d6d842951..f63f28210d8 100644
--- a/cpp/src/components/weakly_connected_components_impl.cuh
+++ b/cpp/src/components/weakly_connected_components_impl.cuh
@@ -34,6 +34,7 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -400,9 +401,10 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
         handle.get_thrust_policy(),
         new_root_candidates.begin(),
         new_root_candidates.begin() + (new_root_candidates.size() > 0 ? 1 : 0),
-        [vertex_partition, degrees = degrees.data()] __device__(auto v) {
-          return degrees[vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)];
-        },
+        cuda::proclaim_return_type<edge_t>(
+          [vertex_partition, degrees = degrees.data()] __device__(auto v) -> edge_t {
+            return degrees[vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)];
+          }),
         edge_t{0},
         thrust::plus<edge_t>{});
 
@@ -642,9 +644,10 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
         handle.get_thrust_policy(),
         thrust::get<0>(vertex_frontier.bucket(bucket_idx_cur).begin().get_iterator_tuple()),
         thrust::get<0>(vertex_frontier.bucket(bucket_idx_cur).end().get_iterator_tuple()),
-        [vertex_partition, degrees = degrees.data()] __device__(auto v) {
-          return degrees[vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)];
-        },
+        cuda::proclaim_return_type<edge_t>(
+          [vertex_partition, degrees = degrees.data()] __device__(auto v) -> edge_t {
+            return degrees[vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)];
+          }),
         edge_t{0},
         thrust::plus<edge_t>());
 
diff --git a/cpp/src/detail/utility_wrappers.cu b/cpp/src/detail/utility_wrappers.cu
index 9100ecbd5e1..6d6158a16e7 100644
--- a/cpp/src/detail/utility_wrappers.cu
+++ b/cpp/src/detail/utility_wrappers.cu
@@ -21,6 +21,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/functional>
 #include <thrust/count.h>
 #include <thrust/distance.h>
 #include <thrust/functional.h>
@@ -139,7 +140,8 @@ vertex_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view,
     rmm::exec_policy(stream_view),
     edge_first,
     edge_first + num_edges,
-    [] __device__(auto e) { return std::max(thrust::get<0>(e), thrust::get<1>(e)); },
+    cuda::proclaim_return_type<vertex_t>(
+      [] __device__(auto e) -> vertex_t { return std::max(thrust::get<0>(e), thrust::get<1>(e)); }),
     vertex_t{0},
     thrust::maximum<vertex_t>());
 }
diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh
index 5001a20bb83..de233fd583b 100644
--- a/cpp/src/prims/kv_store.cuh
+++ b/cpp/src/prims/kv_store.cuh
@@ -17,6 +17,7 @@
 
 #include "prims/detail/optional_dataframe_buffer.hpp"
 
+#include <cugraph/graph.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
 #include <cugraph/utilities/device_functors.cuh>
 
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index 006d7760666..7be30b0a5f0 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -754,7 +754,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
             std::make_unique<kv_store_t<vertex_t, edge_src_value_t, true>>(
               std::move(majors),
               std::move(edge_major_values),
-              invalid_vertex_id<vertex_t>::value,
+              edge_src_value_t{},
               true,
               handle.get_stream());
         }
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 29dca6ef409..7097349dce5 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -353,7 +353,7 @@ edge_t count_edge_partition_multi_edges(
         execution_policy,
         thrust::make_counting_iterator(edge_partition.major_range_first()) + (*segment_offsets)[2],
         thrust::make_counting_iterator(edge_partition.major_range_first()) + (*segment_offsets)[3],
-        [edge_partition] __device__(auto major) {
+        cuda::proclaim_return_type<edge_t>([edge_partition] __device__(auto major) -> edge_t {
           auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
           vertex_t const* indices{nullptr};
           [[maybe_unused]] edge_t edge_offset{};
@@ -365,7 +365,7 @@ edge_t count_edge_partition_multi_edges(
             if (indices[i - 1] == indices[i]) { ++count; }
           }
           return count;
-        },
+        }),
         edge_t{0},
         thrust::plus<edge_t>{});
     }
@@ -374,19 +374,21 @@ edge_t count_edge_partition_multi_edges(
         execution_policy,
         thrust::make_counting_iterator(vertex_t{0}),
         thrust::make_counting_iterator(*(edge_partition.dcs_nzd_vertex_count())),
-        [edge_partition, major_start_offset = (*segment_offsets)[3]] __device__(auto idx) {
-          auto major_idx =
-            major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
-          vertex_t const* indices{nullptr};
-          [[maybe_unused]] edge_t edge_offset{};
-          edge_t local_degree{};
-          thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
-          edge_t count{0};
-          for (edge_t i = 1; i < local_degree; ++i) {  // assumes neighbors are sorted
-            if (indices[i - 1] == indices[i]) { ++count; }
-          }
-          return count;
-        },
+        cuda::proclaim_return_type<edge_t>(
+          [edge_partition,
+           major_start_offset = (*segment_offsets)[3]] __device__(auto idx) -> edge_t {
+            auto major_idx =
+              major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
+            vertex_t const* indices{nullptr};
+            [[maybe_unused]] edge_t edge_offset{};
+            edge_t local_degree{};
+            thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
+            edge_t count{0};
+            for (edge_t i = 1; i < local_degree; ++i) {  // assumes neighbors are sorted
+              if (indices[i - 1] == indices[i]) { ++count; }
+            }
+            return count;
+          }),
         edge_t{0},
         thrust::plus<edge_t>{});
     }
@@ -398,7 +400,7 @@ edge_t count_edge_partition_multi_edges(
       thrust::make_counting_iterator(edge_partition.major_range_first()),
       thrust::make_counting_iterator(edge_partition.major_range_first()) +
         edge_partition.major_range_size(),
-      [edge_partition] __device__(auto major) {
+      cuda::proclaim_return_type<edge_t>([edge_partition] __device__(auto major) -> edge_t {
         auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
         vertex_t const* indices{nullptr};
         [[maybe_unused]] edge_t edge_offset{};
@@ -409,7 +411,7 @@ edge_t count_edge_partition_multi_edges(
           if (indices[i - 1] == indices[i]) { ++count; }
         }
         return count;
-      },
+      }),
       edge_t{0},
       thrust::plus<edge_t>{});
   }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d1dd2dec069..2152de28ff9 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -169,7 +169,11 @@ function(ConfigureTest CMAKE_TEST_NAME)
     )
     set_target_properties(
         ${CMAKE_TEST_NAME}
-            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib")
+            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
+                       CXX_STANDARD                        17
+                       CXX_STANDARD_REQUIRED               ON
+                       CUDA_STANDARD                       17
+                       CUDA_STANDARD_REQUIRED              ON)
 
     rapids_test_add(
         NAME ${CMAKE_TEST_NAME}
@@ -195,7 +199,11 @@ function(ConfigureTestMG CMAKE_TEST_NAME)
     )
     set_target_properties(
         ${CMAKE_TEST_NAME}
-            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib")
+            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
+                       CXX_STANDARD                        17
+                       CXX_STANDARD_REQUIRED               ON
+                       CUDA_STANDARD                       17
+                       CUDA_STANDARD_REQUIRED              ON)
 
     rapids_test_add(
         NAME ${CMAKE_TEST_NAME}
@@ -241,7 +249,11 @@ function(ConfigureCTest CMAKE_TEST_NAME)
     )
     set_target_properties(
         ${CMAKE_TEST_NAME}
-            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib")
+            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
+                       CXX_STANDARD                        17
+                       CXX_STANDARD_REQUIRED               ON
+                       CUDA_STANDARD                       17
+                       CUDA_STANDARD_REQUIRED              ON)
 
     rapids_test_add(
         NAME ${CMAKE_TEST_NAME}
@@ -269,7 +281,11 @@ function(ConfigureCTestMG CMAKE_TEST_NAME)
     )
     set_target_properties(
         ${CMAKE_TEST_NAME}
-            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib")
+            PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
+                       CXX_STANDARD                        17
+                       CXX_STANDARD_REQUIRED               ON
+                       CUDA_STANDARD                       17
+                       CUDA_STANDARD_REQUIRED              ON)
 
     rapids_test_add(
         NAME ${CMAKE_TEST_NAME}
diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu
index 20e87070fa5..d7aa953ef7c 100644
--- a/cpp/tests/prims/mg_extract_transform_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_e.cu
@@ -59,55 +59,27 @@
 #include <sstream>
 #include <type_traits>
 
-template <typename key_t, typename vertex_t, typename property_t, typename output_payload_t>
+template <typename vertex_t, typename property_t, typename output_payload_t>
 struct e_op_t {
-  static_assert(std::is_same_v<key_t, vertex_t> ||
-                std::is_same_v<key_t, thrust::tuple<vertex_t, int32_t>>);
   static_assert(std::is_same_v<output_payload_t, int32_t> ||
                 std::is_same_v<output_payload_t, thrust::tuple<float, int32_t>>);
 
-  using return_type = thrust::optional<typename std::conditional_t<
-    std::is_same_v<key_t, vertex_t>,
-    std::conditional_t<std::is_arithmetic_v<output_payload_t>,
-                       thrust::tuple<vertex_t, vertex_t, int32_t>,
-                       thrust::tuple<vertex_t, vertex_t, float, int32_t>>,
-    std::conditional_t<std::is_arithmetic_v<output_payload_t>,
-                       thrust::tuple<vertex_t, int32_t, vertex_t, int32_t>,
-                       thrust::tuple<vertex_t, int32_t, vertex_t, float, int32_t>>>>;
-
-  __device__ return_type operator()(key_t optionally_tagged_src,
-                                    vertex_t dst,
-                                    property_t src_val,
-                                    property_t dst_val,
-                                    thrust::nullopt_t) const
+  using return_type =
+    thrust::optional<std::conditional_t<std::is_arithmetic_v<output_payload_t>,
+                                        thrust::tuple<vertex_t, vertex_t, int32_t>,
+                                        thrust::tuple<vertex_t, vertex_t, float, int32_t>>>;
+
+  __device__ return_type operator()(
+    vertex_t src, vertex_t dst, property_t src_val, property_t dst_val, thrust::nullopt_t) const
   {
     auto output_payload = static_cast<output_payload_t>(1);
     if (src_val < dst_val) {
-      if constexpr (std::is_same_v<key_t, vertex_t>) {
-        if constexpr (std::is_arithmetic_v<output_payload_t>) {
-          return thrust::make_tuple(optionally_tagged_src, dst, output_payload);
-        } else {
-          static_assert(thrust::tuple_size<output_payload_t>::value == size_t{2});
-          return thrust::make_tuple(optionally_tagged_src,
-                                    dst,
-                                    thrust::get<0>(output_payload),
-                                    thrust::get<1>(output_payload));
-        }
+      if constexpr (std::is_arithmetic_v<output_payload_t>) {
+        return thrust::make_tuple(src, dst, output_payload);
       } else {
-        static_assert(thrust::tuple_size<key_t>::value == size_t{2});
-        if constexpr (std::is_arithmetic_v<output_payload_t>) {
-          return thrust::make_tuple(thrust::get<0>(optionally_tagged_src),
-                                    thrust::get<1>(optionally_tagged_src),
-                                    dst,
-                                    output_payload);
-        } else {
-          static_assert(thrust::tuple_size<output_payload_t>::value == size_t{2});
-          return thrust::make_tuple(thrust::get<0>(optionally_tagged_src),
-                                    thrust::get<1>(optionally_tagged_src),
-                                    dst,
-                                    thrust::get<0>(output_payload),
-                                    thrust::get<1>(output_payload));
-        }
+        static_assert(thrust::tuple_size<output_payload_t>::value == size_t{2});
+        return thrust::make_tuple(
+          src, dst, thrust::get<0>(output_payload), thrust::get<1>(output_payload));
       }
     } else {
       return thrust::nullopt;
@@ -134,19 +106,11 @@ class Tests_MGExtractTransformE
   virtual void TearDown() {}
 
   // Compare the results of extract_transform_e primitive
-  template <typename vertex_t,
-            typename edge_t,
-            typename weight_t,
-            typename tag_t,
-            typename output_payload_t>
+  template <typename vertex_t, typename edge_t, typename weight_t, typename output_payload_t>
   void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
   {
     using result_t = int32_t;
 
-    using key_t =
-      std::conditional_t<std::is_same_v<tag_t, void>, vertex_t, thrust::tuple<vertex_t, tag_t>>;
-
-    static_assert(std::is_same_v<tag_t, void> || std::is_arithmetic_v<tag_t>);
     static_assert(std::is_same_v<output_payload_t, void> ||
                   cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<output_payload_t>::value);
     if constexpr (cugraph::is_thrust_tuple<output_payload_t>::value) {
@@ -212,7 +176,7 @@ class Tests_MGExtractTransformE
                                    mg_src_prop.view(),
                                    mg_dst_prop.view(),
                                    cugraph::edge_dummy_property_t{}.view(),
-                                   e_op_t<key_t, vertex_t, result_t, output_payload_t>{});
+                                   e_op_t<vertex_t, result_t, output_payload_t>{});
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -225,7 +189,7 @@ class Tests_MGExtractTransformE
 
     if (prims_usecase.check_correctness) {
       auto mg_aggregate_extract_transform_output_buffer = cugraph::allocate_dataframe_buffer<
-        typename e_op_t<key_t, vertex_t, result_t, output_payload_t>::return_type::value_type>(
+        typename e_op_t<vertex_t, result_t, output_payload_t>::return_type::value_type>(
         size_t{0}, handle_->get_stream());
       std::get<0>(mg_aggregate_extract_transform_output_buffer) =
         cugraph::test::device_gatherv(*handle_,
@@ -239,18 +203,12 @@ class Tests_MGExtractTransformE
         cugraph::test::device_gatherv(*handle_,
                                       std::get<2>(mg_extract_transform_output_buffer).data(),
                                       std::get<2>(mg_extract_transform_output_buffer).size());
-      if constexpr (!std::is_same_v<key_t, vertex_t> || !std::is_arithmetic_v<output_payload_t>) {
+      if constexpr (!std::is_arithmetic_v<output_payload_t>) {
         std::get<3>(mg_aggregate_extract_transform_output_buffer) =
           cugraph::test::device_gatherv(*handle_,
                                         std::get<3>(mg_extract_transform_output_buffer).data(),
                                         std::get<3>(mg_extract_transform_output_buffer).size());
       }
-      if constexpr (!std::is_same_v<key_t, vertex_t> && !std::is_arithmetic_v<output_payload_t>) {
-        std::get<4>(mg_aggregate_extract_transform_output_buffer) =
-          cugraph::test::device_gatherv(*handle_,
-                                        std::get<4>(mg_extract_transform_output_buffer).data(),
-                                        std::get<4>(mg_extract_transform_output_buffer).size());
-      }
 
       cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
       std::tie(sg_graph, std::ignore, std::ignore, std::ignore) =
@@ -292,7 +250,7 @@ class Tests_MGExtractTransformE
                                        sg_src_prop.view(),
                                        sg_dst_prop.view(),
                                        cugraph::edge_dummy_property_t{}.view(),
-                                       e_op_t<key_t, vertex_t, result_t, output_payload_t>{});
+                                       e_op_t<vertex_t, result_t, output_payload_t>{});
 
         thrust::sort(handle_->get_thrust_policy(),
                      cugraph::get_dataframe_buffer_begin(sg_extract_transform_output_buffer),
@@ -321,13 +279,13 @@ using Tests_MGExtractTransformE_Rmat = Tests_MGExtractTransformE<cugraph::test::
 TEST_P(Tests_MGExtractTransformE_File, CheckInt32Int32FloatVoidInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, void, int32_t>(std::get<0>(param), std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, int32_t>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatVoidInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, void, int32_t>(
+  run_current_test<int32_t, int32_t, float, int32_t>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -335,14 +293,14 @@ TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatVoidInt32)
 TEST_P(Tests_MGExtractTransformE_File, CheckInt32Int32FloatVoidTupleFloatInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, void, thrust::tuple<float, int32_t>>(
-    std::get<0>(param), std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<float, int32_t>>(std::get<0>(param),
+                                                                           std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatVoidTupleFloatInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, void, thrust::tuple<float, int32_t>>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<float, int32_t>>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -350,14 +308,13 @@ TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatVoidTupleFloatInt32)
 TEST_P(Tests_MGExtractTransformE_File, CheckInt32Int32FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int32_t, int32_t>(std::get<0>(param),
-                                                              std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, int32_t>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int32_t, int32_t>(
+  run_current_test<int32_t, int32_t, float, int32_t>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -365,14 +322,14 @@ TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatInt32Int32)
 TEST_P(Tests_MGExtractTransformE_File, CheckInt32Int32FloatInt32TupleFloatInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int32_t, thrust::tuple<float, int32_t>>(
-    std::get<0>(param), std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<float, int32_t>>(std::get<0>(param),
+                                                                           std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatInt32TupleFloatInt32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int32_t, thrust::tuple<float, int32_t>>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<float, int32_t>>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -380,14 +337,13 @@ TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int32FloatInt32TupleFloatInt32)
 TEST_P(Tests_MGExtractTransformE_File, CheckInt32Int64FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int64_t, float, int32_t, int32_t>(std::get<0>(param),
-                                                              std::get<1>(param));
+  run_current_test<int32_t, int64_t, float, int32_t>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int64FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int64_t, float, int32_t, int32_t>(
+  run_current_test<int32_t, int64_t, float, int32_t>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -395,14 +351,13 @@ TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt32Int64FloatInt32Int32)
 TEST_P(Tests_MGExtractTransformE_File, CheckInt64Int64FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int64_t, int64_t, float, int32_t, int32_t>(std::get<0>(param),
-                                                              std::get<1>(param));
+  run_current_test<int64_t, int64_t, float, int32_t>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGExtractTransformE_Rmat, CheckInt64Int64FloatInt32Int32)
 {
   auto param = GetParam();
-  run_current_test<int64_t, int64_t, float, int32_t, int32_t>(
+  run_current_test<int64_t, int64_t, float, int32_t>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu
index c87cc5b960b..3bca382a2eb 100644
--- a/cpp/tests/sampling/sampling_post_processing_test.cu
+++ b/cpp/tests/sampling/sampling_post_processing_test.cu
@@ -398,15 +398,16 @@ bool check_renumber_map_invariants(
       handle.get_thrust_policy(),
       unique_majors.begin(),
       unique_majors.end(),
-      [sorted_org_vertices =
-         raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
-       matching_renumbered_vertices = raft::device_span<vertex_t const>(
-         matching_renumbered_vertices.data(),
-         matching_renumbered_vertices.size())] __device__(vertex_t major) {
-        auto it = thrust::lower_bound(
-          thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
-        return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      },
+      cuda::proclaim_return_type<vertex_t>(
+        [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                 sorted_org_vertices.size()),
+         matching_renumbered_vertices = raft::device_span<vertex_t const>(
+           matching_renumbered_vertices.data(),
+           matching_renumbered_vertices.size())] __device__(vertex_t major) -> vertex_t {
+          auto it = thrust::lower_bound(
+            thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
+          return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
+        }),
       std::numeric_limits<vertex_t>::lowest(),
       thrust::maximum<vertex_t>{});
 
@@ -414,15 +415,16 @@ bool check_renumber_map_invariants(
       handle.get_thrust_policy(),
       unique_minors.begin(),
       unique_minors.end(),
-      [sorted_org_vertices =
-         raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
-       matching_renumbered_vertices = raft::device_span<vertex_t const>(
-         matching_renumbered_vertices.data(),
-         matching_renumbered_vertices.size())] __device__(vertex_t minor) {
-        auto it = thrust::lower_bound(
-          thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor);
-        return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      },
+      cuda::proclaim_return_type<vertex_t>(
+        [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                 sorted_org_vertices.size()),
+         matching_renumbered_vertices = raft::device_span<vertex_t const>(
+           matching_renumbered_vertices.data(),
+           matching_renumbered_vertices.size())] __device__(vertex_t minor) -> vertex_t {
+          auto it = thrust::lower_bound(
+            thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor);
+          return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
+        }),
       std::numeric_limits<vertex_t>::max(),
       thrust::minimum<vertex_t>{});
 

From 04e80008180656da050e37a3a4b04c47ab015de9 Mon Sep 17 00:00:00 2001
From: Don Acosta <97529984+acostadon@users.noreply.github.com>
Date: Wed, 29 May 2024 15:16:07 -0400
Subject: [PATCH 19/23] Fixed links and added c++ docs per issue 4431 (#4435)

added content to document c++ algorithms and fixed links that were pointing to the previously removed content
resolves #4431
Resolves #4116

Authors:
  - Don Acosta (https://github.com/acostadon)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Brad Rees (https://github.com/BradReesWork)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4435
---
 .../source/graph_support/algorithms.md        | 10 +--
 .../cpp_algorithms/centrality_cpp.md          | 81 +++++++++++++++++++
 .../algorithms/cpp_algorithms/linear_cpp.md   | 37 +++++++++
 .../cpp_algorithms/traversal_cpp.md           | 56 +++++++++++++
 4 files changed, 179 insertions(+), 5 deletions(-)
 create mode 100644 docs/cugraph/source/graph_support/algorithms/cpp_algorithms/centrality_cpp.md
 create mode 100644 docs/cugraph/source/graph_support/algorithms/cpp_algorithms/linear_cpp.md
 create mode 100644 docs/cugraph/source/graph_support/algorithms/cpp_algorithms/traversal_cpp.md

diff --git a/docs/cugraph/source/graph_support/algorithms.md b/docs/cugraph/source/graph_support/algorithms.md
index 8a5158f2f56..2aac61325e0 100644
--- a/docs/cugraph/source/graph_support/algorithms.md
+++ b/docs/cugraph/source/graph_support/algorithms.md
@@ -50,10 +50,10 @@ Note: Multi-GPU, or MG, includes support for Multi-Node Multi-GPU (also called M
 | Layout            |                                    |                     |                                                                 |
 |                   | [Force Atlas 2](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/layout/Force-Atlas2.ipynb)              | Single-GPU          |        |
 | Linear Assignment |                                    |                     |                                                                 |
-|                   | [Hungarian]()                      | Single-GPU          | [README](cpp/src/linear_assignment/README-hungarian.md)         |
+|                   | [Hungarian](https://docs.rapids.ai/api/cugraph/nightly/api_docs/cugraph/linear_assignment/#hungarian)                      | Single-GPU          | [README](./algorithms/cpp_algorithms/linear_cpp.html)         |
 | Link Analysis     |                                    |                     |                                                                 |
-|                   | [Pagerank](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/Pagerank.ipynb)                | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Pagerank)                |
-|                   | [Personal Pagerank]()                  | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank)   |
+|                   | [Pagerank](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/Pagerank.ipynb)                | __Multi-GPU__ | [C++ README](./algorithms/cpp_algorithms/centrality_cpp.html#Pagerank)                |
+|                   | [Personal Pagerank](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/Pagerank.ipynb)                  | __Multi-GPU__ | [C++ README](./algorithms/cpp_algorithms/centrality_cpp.html#Personalized-Pagerank)   |
 |                   | [HITS](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/HITS.ipynb)        | __Multi-GPU__ |                |
 | [Link Prediction](algorithms/Similarity.html)   |                                    |                     |                                                                 |
 |                   | [Jaccard Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb)                 | __Multi-GPU__      | Directed graph only                         |
@@ -68,8 +68,8 @@ Note: Multi-GPU, or MG, includes support for Multi-Node Multi-GPU (also called M
 |                   | Node2Vec                           | __Multi-GPU__       |                                             |
 |                   | Neighborhood sampling      | __Multi-GPU__ |                                                                 |
 | Traversal         |                                    |                     |                                                                 |
-|                   | Breadth First Search (BFS)         | __Multi-GPU__ | with cutoff support [C++ README](cpp/src/traversal/README.md#BFS) |
-|                   | Single Source Shortest Path (SSSP) | __Multi-GPU__ | [C++ README](cpp/src/traversal/README.md#SSSP)                     |
+|                   | Breadth First Search (BFS)         | __Multi-GPU__ | [C++ README](algorithms/cpp_algorithms/traversal_cpp.html#BFS) |
+|                   | Single Source Shortest Path (SSSP) | __Multi-GPU__ | [C++ README](algorithms/cpp_algorithms/traversal_cpp.html#SSSP)                     |
 |                   | _ASSP / APSP_                    | ---                 |                                                                 |
 | Tree              |                                    |                     |                                                                 |
 |                   | Minimum Spanning Tree              | Single-GPU          |                                                                 |
diff --git a/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/centrality_cpp.md b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/centrality_cpp.md
new file mode 100644
index 00000000000..b3f7ac17d1a
--- /dev/null
+++ b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/centrality_cpp.md
@@ -0,0 +1,81 @@
+# Centrality algorithms
+cuGraph Pagerank is implemented using our graph primitive library
+
+## Pagerank
+
+The unit test code is the best place to search for examples on calling pagerank.
+
+ * [SG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/link_analysis/pagerank_test.cpp)
+ * [MG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/link_analysis/mg_pagerank_test.cpp)
+
+## Simple pagerank
+
+The example assumes that you create an SG or MG graph somehow.  The caller must create the pageranks vector in device memory and pass in the raw pointer to that vector into the pagerank function.
+
+```cpp
+#include <cugraph/algorithms.hpp>
+...
+using vertex_t = int32_t;       // or int64_t, whichever is appropriate
+using weight_t = float;         // or double, whichever is appropriate
+using result_t = weight_t;      // could specify float or double also
+raft::handle_t handle;          // Must be configured if MG
+auto graph_view = graph.view(); // assumes you have created a graph somehow
+
+result_t constexpr alpha{0.85};
+result_t constexpr epsilon{1e-6};
+
+rmm::device_uvector<result_t> pageranks_v(graph_view.number_of_vertices(), handle.get_stream());
+
+// pagerank optionally supports three additional parameters:
+//     max_iterations     - maximum number of iterations, if pagerank doesn't converge by
+//                          then we abort
+//     has_initial_guess  - if true, values in the pagerank array when the call is initiated
+//                          will be used as the initial pagerank values.  These values will
+//                          be normalized before use.  If false (the default), the values
+//                          in the pagerank array will be set to 1/num_vertices before
+//                          starting the computation.
+//     do_expensive_check - perform extensive validation of the input data before
+//                          executing algorithm.  Off by default.  Note: turning this on
+//                          is expensive
+cugraph::pagerank(handle, graph_view, nullptr, nullptr, nullptr, vertex_t{0},
+                                pageranks_v.data(), alpha, epsilon);
+```
+
+## Personalized Pagerank
+
+The example assumes that you create an SG or MG graph somehow.  The caller must create the pageranks vector in device memory and pass the raw pointer to that vector into the pagerank function.  Additionally, the caller must create personalization_vertices and personalization_values vectors in device memory, populate them and pass in the raw pointers to those vectors.
+
+```cpp
+#include <cugraph/algorithms.hpp>
+...
+using vertex_t = int32_t;                    // or int64_t, whichever is appropriate
+using weight_t = float;                      // or double, whichever is appropriate
+using result_t = weight_t;                   // could specify float or double also
+raft::handle_t handle;                       // Must be configured if MG
+auto graph_view = graph.view();              // assumes you have created a graph somehow
+vertex_t number_of_personalization_vertices; // Provided by caller
+
+result_t constexpr alpha{0.85};
+result_t constexpr epsilon{1e-6};
+
+rmm::device_uvector<result_t> pageranks_v(graph_view.number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> personalization_vertices(number_of_personalization_vertices, handle.get_stream());
+rmm::device_uvector<result_t> personalization_values(number_of_personalization_vertices, handle.get_stream());
+
+//  Populate personalization_vertices, personalization_values with user provided data
+
+// pagerank optionally supports three additional parameters:
+//     max_iterations     - maximum number of iterations, if pagerank doesn't converge by
+//                          then we abort
+//     has_initial_guess  - if true, values in the pagerank array when the call is initiated
+//                          will be used as the initial pagerank values.  These values will
+//                          be normalized before use.  If false (the default), the values
+//                          in the pagerank array will be set to 1/num_vertices before
+//                          starting the computation.
+//     do_expensive_check - perform extensive validation of the input data before
+//                          executing algorithm.  Off by default.  Note: turning this on
+//                          is expensive
+cugraph::pagerank(handle, graph_view, nullptr, personalization_vertices.data(),
+                                personalization_values.data(), number_of_personalization_vertices,
+                                pageranks_v.data(), alpha, epsilon);
+```
diff --git a/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/linear_cpp.md b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/linear_cpp.md
new file mode 100644
index 00000000000..8af4a5042f6
--- /dev/null
+++ b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/linear_cpp.md
@@ -0,0 +1,37 @@
+# LAP
+
+Implementation of ***O(n^3) Alternating Tree Variant*** of Hungarian Algorithm on NVIDIA CUDA-enabled GPU.
+
+This implementation solves a batch of ***k*** **Linear Assignment Problems (LAP)**, each with an ***nxn*** matrix of single-precision floating-point cost values. At optimality, the algorithm produces an assignment with ***minimum*** cost.
+
+The API can be used to query optimal primal and dual costs, optimal assignment vector, and optimal row/column dual vectors for each subproblem in the batch.
+
+cuGraph exposes the Hungarian algorithm; the actual implementation is contained in the RAFT library, which contains some common tools and kernels shared between cuGraph and cuML.
+
+The following parameters can be used to tune the performance of the algorithm:
+
+1. epsilon: (in raft/lap/lap_kernels.cuh) This parameter controls the tolerance on the floating point precision. Setting this too small will result in increased solution time because the algorithm will search for precise solutions. Setting it too high may cause some inaccuracies.
+
+2. BLOCKDIMX, BLOCKDIMY: (in raft/lap/lap_functions.cuh) These parameters control threads_per_block to be used along the given dimension. Set these according to the device specifications and occupancy calculation.
+
+***This library is licensed under Apache License 2.0. Please cite our paper, if this library helps you in your research.***
+
+- Harvard citation style
+
+  Date, K. and Nagi, R., 2016. GPU-accelerated Hungarian algorithms for the Linear Assignment Problem. Parallel Computing, 57, pp.52-72.
+
+- BibTeX Citation block to be used in LaTeX bibliography file:
+
+```
+@article{date2016gpu,
+  title={GPU-accelerated Hungarian algorithms for the Linear Assignment Problem},
+  author={Date, Ketan and Nagi, Rakesh},
+  journal={Parallel Computing},
+  volume={57},
+  pages={52--72},
+  year={2016},
+  publisher={Elsevier}
+}
+```
+
+The paper is available online on [ScienceDirect](https://www.sciencedirect.com/science/article/abs/pii/S016781911630045X).
diff --git a/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/traversal_cpp.md b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/traversal_cpp.md
new file mode 100644
index 00000000000..6480d885a38
--- /dev/null
+++ b/docs/cugraph/source/graph_support/algorithms/cpp_algorithms/traversal_cpp.md
@@ -0,0 +1,56 @@
+# Traversal
+cuGraph traversal algorithms are described on this page.
+
+## SSSP
+
+The unit test code is the best place to search for examples on calling SSSP.
+
+ * [SG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/traversal/sssp_test.cpp)
+ * [MG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/traversal/mg_sssp_test.cpp)
+
+## Simple SSSP
+
+The example assumes that you create an SG or MG graph somehow.  The caller must create the distances and predecessors vectors in device memory and pass in the raw pointers to those vectors into the SSSP function.
+
+```cpp
+#include <cugraph/algorithms.hpp>
+...
+using vertex_t = int32_t;       // or int64_t, whichever is appropriate
+using weight_t = float;         // or double, whichever is appropriate
+using result_t = weight_t;      // could specify float or double also
+raft::handle_t handle;          // Must be configured if MG
+auto graph_view = graph.view(); // assumes you have created a graph somehow
+vertex_t source;                // Initialized by user
+
+rmm::device_uvector<weight_t> distances_v(graph_view.number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.number_of_vertices(), handle.get_stream());
+
+cugraph::sssp(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, std::numeric_limits<weight_t>::max(), false);
+```
+
+## BFS
+
+The unit test code is the best place to search for examples on calling BFS.
+
+ * [SG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/traversal/bfs_test.cpp)
+ * [MG Implementation](https://github.com/rapidsai/cugraph/blob/main/cpp/tests/traversal/mg_bfs_test.cpp)
+
+## Simple BFS
+
+The example assumes that you create an SG or MG graph somehow.  The caller must create the distances and predecessors vectors in device memory and pass in the raw pointers to those vectors into the BFS function.
+
+```cpp
+#include <cugraph/algorithms.hpp>
+...
+using vertex_t = int32_t;       // or int64_t, whichever is appropriate
+using weight_t = float;         // or double, whichever is appropriate
+using result_t = weight_t;      // could specify float or double also
+raft::handle_t handle;          // Must be configured if MG
+auto graph_view = graph.view(); // assumes you have created a graph somehow
+vertex_t source;                // Initialized by user
+
+rmm::device_uvector<weight_t> distances_v(graph_view.number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.number_of_vertices(), handle.get_stream());
+
+cugraph::bfs(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, false, std::numeric_limits<vertex_t>::max(), false);
+```

From 507f73209a743e55e20c28238b5552f8ca297898 Mon Sep 17 00:00:00 2001
From: Don Acosta <97529984+acostadon@users.noreply.github.com>
Date: Wed, 29 May 2024 19:16:55 -0400
Subject: [PATCH 20/23] first copy of general cugraph tutorial. (#4396)

Adding a tutorial to get started with cugraph.

There are more to follow but this is identified as an important one.

closes #4385

Authors:
  - Don Acosta (https://github.com/acostadon)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/cugraph/pull/4396
---
 .../cugraph/source/tutorials/basic_cugraph.md | 38 +++++++++++++++++++
 .../cugraph/source/tutorials/how_to_guides.md |  2 +-
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 docs/cugraph/source/tutorials/basic_cugraph.md

diff --git a/docs/cugraph/source/tutorials/basic_cugraph.md b/docs/cugraph/source/tutorials/basic_cugraph.md
new file mode 100644
index 00000000000..78325472489
--- /dev/null
+++ b/docs/cugraph/source/tutorials/basic_cugraph.md
@@ -0,0 +1,38 @@
+# Getting started with cuGraph
+
+## Required hardware/software
+
+cuGraph is part of [RAPIDS](https://docs.rapids.ai/user-guide) and has the following system requirements:
+ * NVIDIA GPU, Volta architecture or later, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+
+ * CUDA 11.2, 11.4, 11.5, 11.8, 12.0 or 12.2
+ * Python version 3.9, 3.10, or 3.11
+ * NetworkX version 3.3 or newer in order to use [NetworkX Configs](https://networkx.org/documentation/stable/reference/backends.html#module-networkx.utils.configs). **This is required for use of nx-cugraph, [see below](#cugraph-using-networkx-code).**
+
+## Installation
+The latest RAPIDS System Requirements documentation is located [here](https://docs.rapids.ai/install#system-req).
+
+This includes several ways to set up cuGraph
+* From Unix
+    * [Conda](https://docs.rapids.ai/install#wsl-conda)
+    * [Docker](https://docs.rapids.ai/install#wsl-docker)
+    * [pip](https://docs.rapids.ai/install#wsl-pip)
+
+* On Windows, you must install [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) and then choose one of the following:
+    * [Conda](https://docs.rapids.ai/install#wsl-conda)
+    * [Docker](https://docs.rapids.ai/install#wsl-docker)
+    * [pip](https://docs.rapids.ai/install#wsl-pip)
+
+* Build From Source
+
+To build from source, check each RAPIDS GitHub README for set up and build instructions. Further links are provided in the [selector tool](https://docs.rapids.ai/install#selector). If additional help is needed reach out on our [Slack Channel](https://rapids-goai.slack.com/archives/C5E06F4DC).
+
+## CuGraph Using NetworkX Code
+While the steps above are required to use the full suite of cuGraph graph analytics, cuGraph is now supported as a NetworkX backend using [nx-cugraph](https://docs.rapids.ai/api/cugraph/nightly/nx_cugraph/nx_cugraph/).
+nx-cugraph offers those with existing NetworkX code a **zero code change** option with a growing list of supported algorithms.
+
+
+## Cugraph API Example
+Coming soon!
+
+
+Until then, [the cuGraph notebook repository](https://github.com/rapidsai/cugraph/blob/main/notebooks/README.md) has many examples of loading graph data and running algorithms in Jupyter notebooks. The [cuGraph test code](https://github.com/rapidsai/cugraph/tree/main/python/cugraph/cugraph/tests) gives examples of Python scripts setting up and calling cuGraph algorithms. A simple example of [testing the degree centrality algorithm](https://github.com/rapidsai/cugraph/blob/main/python/cugraph/cugraph/tests/centrality/test_degree_centrality.py) is a good place to start. Some of these examples show [multi-GPU tests/examples with larger data sets](https://github.com/rapidsai/cugraph/blob/main/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py) as well.
diff --git a/docs/cugraph/source/tutorials/how_to_guides.md b/docs/cugraph/source/tutorials/how_to_guides.md
index 80be5b4ab5b..998957afea1 100644
--- a/docs/cugraph/source/tutorials/how_to_guides.md
+++ b/docs/cugraph/source/tutorials/how_to_guides.md
@@ -1,5 +1,5 @@
 # How To Guides
-- Basic use of cuGraph, on the page
+- [Basic use of cuGraph](./basic_cugraph.md)
 - Property graph with analytic flow
 - GNN – model building
 - cuGraph Service – client/server setup and use (ucx)

From 563c06e9dbdb7af999079e3019cede701cb9ac8d Mon Sep 17 00:00:00 2001
From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
Date: Wed, 29 May 2024 21:30:00 -0500
Subject: [PATCH 21/23] Adds benchmark for `nx_cugraph.ego_graph` (#4451)

Adds benchmark for `nx_cugraph.ego_graph`

_Note: this code is not part of any installed package and does not affect the release.  It is being added so it can be included in the `branch-24.06` sources for reference._

Authors:
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Erik Welch (https://github.com/eriknw)

URL: https://github.com/rapidsai/cugraph/pull/4451
---
 .../nx-cugraph/pytest-based/bench_algos.py      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
index 3b085a9bfdb..d40b5130827 100644
--- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py
+++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -848,6 +848,23 @@ def bench_weakly_connected_components(benchmark, graph_obj, backend_wrapper):
     assert type(result) is list
 
 
+def bench_ego_graph(benchmark, graph_obj, backend_wrapper):
+    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
+    node = get_highest_degree_node(graph_obj)
+    result = benchmark.pedantic(
+        target=backend_wrapper(nx.ego_graph),
+        args=(G,),
+        kwargs=dict(
+            n=node,
+            radius=100,
+        ),
+        rounds=rounds,
+        iterations=iterations,
+        warmup_rounds=warmup_rounds,
+    )
+    assert isinstance(result, (nx.Graph, nxcg.Graph))
+
+
 @pytest.mark.skip(reason="benchmark not implemented")
 def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper):
     pass

From 797a036030feb2a70c234f365a03fc084a216d74 Mon Sep 17 00:00:00 2001
From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com>
Date: Thu, 30 May 2024 00:44:27 -0400
Subject: [PATCH 22/23] Distributed Sampling in cuGraph-PyG (#4384)

Distributed sampling in cuGraph-PyG.  Also renames the existing API to clarify that it is dask based.
Adds a dependency on `tensordict` for `cuGraph-PyG` which supports the new `TensorDictFeatureStore`.
Also no longer installs `torch-cluster` and `torch-spline-conv` in CI for testing since that results in an `ImportError` and neither of those packages is needed.

Requires PyG 2.5.  Should be merged after #4335

Merge after #4355

Closes #4248
Closes #4249
Closes #3383
Closes #3942
Closes #3836
Closes #4202
Closes #4051
Closes #4326
Closes #4252
Partially addresses #3805

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Tingyu Wang (https://github.com/tingyu66)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Tingyu Wang (https://github.com/tingyu66)
  - Brad Rees (https://github.com/BradReesWork)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cugraph/pull/4384
---
 ci/run_cugraph_pyg_pytests.sh                 |   5 +-
 ci/test.sh                                    |   2 +-
 ci/test_python.sh                             |   7 +-
 ci/test_wheel_cugraph-pyg.sh                  |   8 +-
 conda/recipes/cugraph-pyg/meta.yaml           |   1 +
 dependencies.yaml                             |   1 +
 .../api_docs/cugraph-pyg/cugraph_pyg.rst      |  33 +-
 .../conda/cugraph_pyg_dev_cuda-118.yaml       |   1 +
 .../cugraph-pyg/cugraph_pyg/data/__init__.py  |  13 +-
 .../{cugraph_store.py => dask_graph_store.py} |   4 +-
 .../cugraph_pyg/data/feature_store.py         | 129 +++++++
 .../cugraph_pyg/data/graph_store.py           | 322 +++++++++++++++++
 .../cugraph_pyg/examples/README.md            |  11 -
 .../examples/cugraph_dist_sampling_mg.py      |   2 +-
 .../examples/cugraph_dist_sampling_sg.py      |   2 +
 .../cugraph_pyg/examples/gcn_dist_sg.py       | 178 ++++++++++
 .../cugraph_pyg/examples/gcn_dist_snmg.py     | 328 ++++++++++++++++++
 .../cugraph_pyg/examples/graph_sage_mg.py     |  17 +-
 .../cugraph_pyg/examples/graph_sage_sg.py     |   8 +-
 .../cugraph_pyg/loader/__init__.py            |  18 +-
 ...aph_node_loader.py => dask_node_loader.py} |  20 +-
 .../cugraph_pyg/loader/neighbor_loader.py     | 232 +++++++++++++
 .../cugraph_pyg/loader/node_loader.py         | 148 ++++++++
 .../cugraph_pyg/sampler/__init__.py           |   4 +-
 .../cugraph_pyg/sampler/sampler.py            | 323 +++++++++++++++++
 .../{cugraph_sampler.py => sampler_utils.py}  |  56 +--
 .../test_dask_graph_store.py}                 |  58 ++--
 .../test_dask_graph_store_mg.py}              |  61 ++--
 .../tests/data/test_feature_store.py          |  44 +++
 .../tests/data/test_graph_store.py            |  45 +++
 .../tests/data/test_graph_store_mg.py         |  45 +++
 .../test_dask_neighbor_loader.py}             |  39 ++-
 .../test_dask_neighbor_loader_mg.py}          |  16 +-
 .../tests/loader/test_neighbor_loader.py      |  54 +++
 .../tests/loader/test_neighbor_loader_mg.py   | 111 ++++++
 .../cugraph_pyg/tests/nn/test_gat_conv.py     |   1 +
 .../cugraph_pyg/tests/nn/test_gatv2_conv.py   |   1 +
 .../tests/nn/test_hetero_gat_conv.py          |   1 +
 .../cugraph_pyg/tests/nn/test_rgcn_conv.py    |   1 +
 .../cugraph_pyg/tests/nn/test_sage_conv.py    |   1 +
 .../tests/nn/test_transformer_conv.py         |   1 +
 .../test_sampler_utils.py}                    |  19 +-
 .../test_sampler_utils_mg.py}                 |  37 +-
 python/cugraph-pyg/pytest.ini                 |   2 +
 python/cugraph/cugraph/gnn/__init__.py        |   1 +
 .../cugraph/gnn/data_loading/__init__.py      |   1 +
 .../cugraph/gnn/data_loading/dist_sampler.py  | 282 +++++++++++++--
 47 files changed, 2465 insertions(+), 229 deletions(-)
 rename python/cugraph-pyg/cugraph_pyg/data/{cugraph_store.py => dask_graph_store.py} (99%)
 create mode 100644 python/cugraph-pyg/cugraph_pyg/data/feature_store.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/data/graph_store.py
 delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/README.md
 create mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py
 rename python/cugraph-pyg/cugraph_pyg/loader/{cugraph_node_loader.py => dask_node_loader.py} (97%)
 create mode 100644 python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/loader/node_loader.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/sampler/sampler.py
 rename python/cugraph-pyg/cugraph_pyg/sampler/{cugraph_sampler.py => sampler_utils.py} (89%)
 rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_store.py => data/test_dask_graph_store.py} (92%)
 rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_store.py => data/test_dask_graph_store_mg.py} (90%)
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py
 rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_loader.py => loader/test_dask_neighbor_loader.py} (95%)
 rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_loader.py => loader/test_dask_neighbor_loader_mg.py} (85%)
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py
 rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_sampler.py => sampler/test_sampler_utils.py} (93%)
 rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_sampler.py => sampler/test_sampler_utils_mg.py} (86%)

diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh
index 88642e6ceb6..fb27f16d79e 100755
--- a/ci/run_cugraph_pyg_pytests.sh
+++ b/ci/run_cugraph_pyg_pytests.sh
@@ -6,7 +6,10 @@ set -euo pipefail
 # Support invoking run_cugraph_pyg_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg
 
-pytest --cache-clear --ignore=tests/mg "$@" .
+pytest --cache-clear --benchmark-disable "$@" .
+
+# Used to skip certain examples in CI due to memory limitations
+export CI_RUN=1
 
 # Test examples
 for e in "$(pwd)"/examples/*.py; do
diff --git a/ci/test.sh b/ci/test.sh
index f20fc40f85a..884ed7ac881 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -103,7 +103,7 @@ if hasArg "--run-python-tests"; then
     conda list
     cd ${CUGRAPH_ROOT}/python/cugraph-pyg/cugraph_pyg
     # rmat is not tested because of MG testing
-    pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --ignore=tests/mg --ignore=tests/int --ignore=tests/generators --benchmark-disable
+    pytest -sv -m sg --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable
     echo "Ran Python pytest for cugraph_pyg : return code was: $?, test script exit code is now: $EXITCODE"
 
     echo "Python pytest for cugraph-service (single-GPU only)..."
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 9537f66e825..c215e25c526 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -215,13 +215,14 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
 
     # Install pyg dependencies (which requires pip)
 
-    pip install ogb
+    pip install \
+      ogb \
+      tensordict
+
     pip install \
         pyg_lib \
         torch_scatter \
         torch_sparse \
-        torch_cluster \
-        torch_spline_conv \
       -f ${PYG_URL}
 
     rapids-print-env
diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh
index f45112dd80b..1004063cc38 100755
--- a/ci/test_wheel_cugraph-pyg.sh
+++ b/ci/test_wheel_cugraph-pyg.sh
@@ -24,6 +24,9 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]
 # RAPIDS_DATASET_ROOT_DIR is used by test scripts
 export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 
+# Used to skip certain examples in CI due to memory limitations
+export CI_RUN=1
+
 if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then
   PYTORCH_URL="https://download.pytorch.org/whl/cu118"
   PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
@@ -39,15 +42,14 @@ rapids-retry python -m pip install \
   pyg_lib \
   torch_scatter \
   torch_sparse \
-  torch_cluster \
-  torch_spline_conv \
+  tensordict \
   -f ${PYG_URL}
 
 rapids-logger "pytest cugraph-pyg (single GPU)"
 pushd python/cugraph-pyg/cugraph_pyg
 python -m pytest \
   --cache-clear \
-  --ignore=tests/mg \
+  --benchmark-disable \
   tests
 # Test examples
 for e in "$(pwd)"/examples/*.py; do
diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml
index c02e8391eb2..64091ff4782 100644
--- a/conda/recipes/cugraph-pyg/meta.yaml
+++ b/conda/recipes/cugraph-pyg/meta.yaml
@@ -34,6 +34,7 @@ requirements:
     - cupy >=12.0.0
     - cugraph ={{ version }}
     - pylibcugraphops ={{ minor_version }}
+    - tensordict >=0.1.2
     - pyg >=2.5,<2.6
 
 tests:
diff --git a/dependencies.yaml b/dependencies.yaml
index c0699fdb1c5..3c2622fde9f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -565,6 +565,7 @@ dependencies:
           - cugraph==24.6.*
           - pytorch>=2.0
           - pytorch-cuda==11.8
+          - tensordict>=0.1.2
           - pyg>=2.5,<2.6
 
   depends_on_rmm:
diff --git a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
index a150d4db9fe..5475fd6c581 100644
--- a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
+++ b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
@@ -6,8 +6,37 @@ cugraph-pyg
 
 .. currentmodule:: cugraph_pyg
 
+Graph Storage
+-------------
 .. autosummary::
    :toctree: ../api/cugraph-pyg/
 
-..   cugraph_pyg.data.cugraph_store.EXPERIMENTAL__CuGraphStore
-..   cugraph_pyg.sampler.cugraph_sampler.EXPERIMENTAL__CuGraphSampler
+   cugraph_pyg.data.dask_graph_store.DaskGraphStore
+   cugraph_pyg.data.graph_store.GraphStore
+
+Feature Storage
+---------------
+.. autosummary::
+   :toctree: ../api/cugraph-pyg/
+
+   cugraph_pyg.data.feature_store.TensorDictFeatureStore
+
+Data Loaders
+------------
+.. autosummary::
+   :toctree: ../api/cugraph-pyg/
+
+   cugraph_pyg.loader.dask_node_loader.DaskNeighborLoader
+   cugraph_pyg.loader.dask_node_loader.BulkSampleLoader
+   cugraph_pyg.loader.node_loader.NodeLoader
+   cugraph_pyg.loader.neighbor_loader.NeighborLoader
+
+Samplers
+--------
+.. autosummary::
+   :toctree: ../api/cugraph-pyg/
+
+   cugraph_pyg.sampler.sampler.BaseSampler
+   cugraph_pyg.sampler.sampler.SampleReader
+   cugraph_pyg.sampler.sampler.HomogeneousSampleReader
+   cugraph_pyg.sampler.sampler.SampleIterator
diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
index ebef0094cfa..922d92f069a 100644
--- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
+++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
@@ -21,4 +21,5 @@ dependencies:
 - pytorch-cuda==11.8
 - pytorch>=2.0
 - scipy
+- tensordict>=0.1.2
 name: cugraph_pyg_dev_cuda-118
diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
index 66a9843c047..4c6f267410d 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,4 +11,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph_pyg.data.cugraph_store import CuGraphStore
+import warnings
+
+from cugraph_pyg.data.dask_graph_store import DaskGraphStore
+from cugraph_pyg.data.graph_store import GraphStore
+from cugraph_pyg.data.feature_store import TensorDictFeatureStore
+
+
+def CuGraphStore(*args, **kwargs):
+    warnings.warn("CuGraphStore has been renamed to DaskGraphStore", FutureWarning)
+    return DaskGraphStore(*args, **kwargs)
diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py
similarity index 99%
rename from python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
rename to python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py
index 354eea8ee6b..ef22982c4da 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py
@@ -199,7 +199,7 @@ def cast(cls, *args, **kwargs):
         return cls(*args, **kwargs)
 
 
-class CuGraphStore:
+class DaskGraphStore:
     """
     Duck-typed version of PyG's GraphStore and FeatureStore.
     """
@@ -221,7 +221,7 @@ def __init__(
         order: str = "CSR",
     ):
         """
-        Constructs a new CuGraphStore from the provided
+        Constructs a new DaskGraphStore from the provided
         arguments.
 
         Parameters
diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
new file mode 100644
index 00000000000..42dda42a9e1
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from typing import Optional, Tuple, List
+
+from cugraph.utilities.utils import import_optional, MissingModule
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+tensordict = import_optional("tensordict")
+
+
+class TensorDictFeatureStore(
+    object
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.data.FeatureStore
+):
+    """
+    A basic implementation of the PyG FeatureStore interface that stores
+    feature data in a single TensorDict.  This type of feature store is
+    not distributed, so each node will have to load the entire graph's
+    features into memory.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.__features = {}
+
+    def _put_tensor(
+        self,
+        tensor: "torch_geometric.typing.FeatureTensorType",
+        attr: "torch_geometric.data.feature_store.TensorAttr",
+    ) -> bool:
+        if attr.group_name in self.__features:
+            td = self.__features[attr.group_name]
+            batch_size = td.batch_size[0]
+
+            if attr.is_set("index"):
+                if attr.attr_name in td.keys():
+                    if attr.index.shape[0] != batch_size:
+                        raise ValueError(
+                            "Leading size of index tensor "
+                            "does not match existing tensors for group name "
+                            f"{attr.group_name}; Expected {batch_size}, "
+                            f"got {attr.index.shape[0]}"
+                        )
+                    td[attr.attr_name][attr.index] = tensor
+                    return True
+                else:
+                    warnings.warn(
+                        "Ignoring index parameter "
+                        f"(attribute does not exist for group {attr.group_name})"
+                    )
+
+            if tensor.shape[0] != batch_size:
+                raise ValueError(
+                    "Leading size of input tensor does not match "
+                    f"existing tensors for group name {attr.group_name};"
+                    f" Expected {batch_size}, got {tensor.shape[0]}"
+                )
+        else:
+            batch_size = tensor.shape[0]
+            self.__features[attr.group_name] = tensordict.TensorDict(
+                {}, batch_size=batch_size
+            )
+
+        self.__features[attr.group_name][attr.attr_name] = tensor
+        return True
+
+    def _get_tensor(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> Optional["torch_geometric.typing.FeatureTensorType"]:
+        if attr.group_name not in self.__features:
+            return None
+
+        if attr.attr_name not in self.__features[attr.group_name].keys():
+            return None
+
+        tensor = self.__features[attr.group_name][attr.attr_name]
+        return (
+            tensor
+            if (attr.index is None or (not attr.is_set("index")))
+            else tensor[attr.index]
+        )
+
+    def _remove_tensor(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> bool:
+        if attr.group_name not in self.__features:
+            return False
+
+        if attr.attr_name not in self.__features[attr.group_name].keys():
+            return False
+
+        del self.__features[attr.group_name][attr.attr_name]
+        return True
+
+    def _get_tensor_size(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> Tuple:
+        return self._get_tensor(attr).size()
+
+    def get_all_tensor_attrs(
+        self,
+    ) -> List["torch_geometric.data.feature_store.TensorAttr"]:
+        attrs = []
+        for group_name, td in self.__features.items():
+            for attr_name in td.keys():
+                attrs.append(
+                    torch_geometric.data.feature_store.TensorAttr(
+                        group_name,
+                        attr_name,
+                    )
+                )
+
+        return attrs
diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py
new file mode 100644
index 00000000000..01af7fd6ed0
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cupy
+import cudf
+import pandas
+
+import pylibcugraph
+
+from cugraph.utilities.utils import import_optional, MissingModule
+from cugraph.gnn.comms import cugraph_comms_get_raft_handle
+
+from typing import Union, Optional, List, Dict
+
+
+# Have to use import_optional even though these are required
+# dependencies in order to build properly.
+torch_geometric = import_optional("torch_geometric")
+torch = import_optional("torch")
+tensordict = import_optional("tensordict")
+
+TensorType = Union["torch.Tensor", cupy.ndarray, np.ndarray, cudf.Series, pandas.Series]
+
+
+class GraphStore(
+    object
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.data.GraphStore
+):
+    """
+    This object uses lazy graph creation.  Users can repeatedly call
+    put_edge_index, and the tensors won't be converted into a cuGraph
+    graph until one is needed (i.e. when creating a loader).
+    """
+
+    def __init__(self, is_multi_gpu: bool = False):
+        self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,))
+        self.__sizes = {}
+        self.__graph = None
+        self.__vertex_offsets = None
+        self.__handle = None
+        self.__is_multi_gpu = is_multi_gpu
+
+        super().__init__()
+
+    def _put_edge_index(
+        self,
+        edge_index: "torch_geometric.typing.EdgeTensorType",
+        edge_attr: "torch_geometric.data.EdgeAttr",
+    ) -> bool:
+        """Stores a COO edge index, stacked so its leading dimension is 2."""
+        if edge_attr.layout != torch_geometric.data.graph_store.EdgeLayout.COO:
+            raise ValueError("Only COO format supported")
+
+        # Device inputs stay on the GPU and host inputs stay on the CPU.
+        if isinstance(edge_index, cupy.ndarray):
+            edge_index = torch.as_tensor(edge_index, device="cuda")
+        elif isinstance(edge_index, cudf.Series):
+            edge_index = torch.as_tensor(edge_index.values, device="cuda")
+        elif isinstance(edge_index, np.ndarray):
+            edge_index = torch.as_tensor(edge_index, device="cpu")
+        elif isinstance(edge_index, pandas.Series):
+            edge_index = torch.as_tensor(edge_index.values, device="cpu")
+        self.__edge_indices[edge_attr.edge_type] = torch.stack(
+            [edge_index[0], edge_index[1]]
+        )
+        self.__sizes[edge_attr.edge_type] = edge_attr.size
+        # invalidate the graph
+        self.__graph = None
+        self.__vertex_offsets = None
+        return True
+
+    def _get_edge_index(
+        self, edge_attr: "torch_geometric.data.EdgeAttr"
+    ) -> Optional["torch_geometric.typing.EdgeTensorType"]:
+        ei = torch_geometric.EdgeIndex(self.__edge_indices[edge_attr.edge_type])
+
+        if edge_attr.layout == "csr":
+            return ei.sort_by("row").values.get_csr()
+        elif edge_attr.layout == "csc":
+            return ei.sort_by("col").values.get_csc()
+
+        return ei
+
+    def _remove_edge_index(self, edge_attr: "torch_geometric.data.EdgeAttr") -> bool:
+        """Deletes the edge index stored for edge_attr.edge_type; returns True."""
+        del self.__edge_indices[edge_attr.edge_type]
+        # invalidate the graph and the cached vertex offsets built from the edges
+        self.__graph = self.__vertex_offsets = None
+        return True
+
+    def get_all_edge_attrs(self) -> List["torch_geometric.data.EdgeAttr"]:
+        attrs = []
+        for et in self.__edge_indices.keys(leaves_only=True, include_nested=True):
+            attrs.append(
+                torch_geometric.data.EdgeAttr(
+                    edge_type=et, layout="coo", is_sorted=False, size=self.__sizes[et]
+                )
+            )
+
+        return attrs
+
+    @property
+    def is_multi_gpu(self):
+        return self.__is_multi_gpu
+
+    @property
+    def _resource_handle(self):
+        if self.__handle is None:
+            if self.is_multi_gpu:
+                self.__handle = pylibcugraph.ResourceHandle(
+                    cugraph_comms_get_raft_handle().getHandle()
+                )
+            else:
+                self.__handle = pylibcugraph.ResourceHandle()
+        return self.__handle
+
+    @property
+    def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]:
+        graph_properties = pylibcugraph.GraphProperties(
+            is_multigraph=True, is_symmetric=False
+        )
+
+        if self.__graph is None:
+            edgelist_dict = self.__get_edgelist()
+
+            if self.is_multi_gpu:
+                rank = torch.distributed.get_rank()
+                world_size = torch.distributed.get_world_size()
+
+                vertices_array = cupy.arange(
+                    sum(self._num_vertices().values()), dtype="int64"
+                )
+                vertices_array = cupy.array_split(vertices_array, world_size)[rank]
+
+                self.__graph = pylibcugraph.MGGraph(
+                    self._resource_handle,
+                    graph_properties,
+                    [cupy.asarray(edgelist_dict["src"]).astype("int64")],
+                    [cupy.asarray(edgelist_dict["dst"]).astype("int64")],
+                    vertices_array=[vertices_array],
+                    edge_id_array=[cupy.asarray(edgelist_dict["eid"])],
+                    edge_type_array=[cupy.asarray(edgelist_dict["etp"])],
+                )
+            else:
+                self.__graph = pylibcugraph.SGGraph(
+                    self._resource_handle,
+                    graph_properties,
+                    cupy.asarray(edgelist_dict["src"]).astype("int64"),
+                    cupy.asarray(edgelist_dict["dst"]).astype("int64"),
+                    vertices_array=cupy.arange(
+                        sum(self._num_vertices().values()), dtype="int64"
+                    ),
+                    edge_id_array=cupy.asarray(edgelist_dict["eid"]),
+                    edge_type_array=cupy.asarray(edgelist_dict["etp"]),
+                )
+
+        return self.__graph
+
+    def _num_vertices(self) -> Dict[str, int]:
+        """
+        Returns the number of vertices for each vertex type.  An explicit
+        EdgeAttr size is merged with max(); otherwise the count is inferred
+        from the largest vertex id, for types that have no count yet.  In
+        multi-GPU mode each count is MAX-reduced across all ranks.
+        """
+        num_vertices = {}
+        for edge_attr in self.get_all_edge_attrs():
+            et = edge_attr.edge_type
+            if edge_attr.size is not None:
+                sized = ((et[0], edge_attr.size[0]), (et[2], edge_attr.size[1]))
+                for vtype, n in sized:
+                    num_vertices[vtype] = max(num_vertices.get(vtype, n), n)
+            else:
+                # Row 0 of the edge index holds et[0] ids and row 1 holds et[2]
+                # ids, so each inferred count is stored under that same type.
+                for row, vtype in enumerate((et[0], et[2])):
+                    if vtype not in num_vertices:
+                        num_vertices[vtype] = int(
+                            self.__edge_indices[et][row].max() + 1
+                        )
+
+        if self.is_multi_gpu:
+            # Take the largest count reported by any rank.
+            vtypes = num_vertices.keys()
+            for vtype in vtypes:
+                sz = torch.tensor(num_vertices[vtype], device="cuda")
+                torch.distributed.all_reduce(sz, op=torch.distributed.ReduceOp.MAX)
+                num_vertices[vtype] = int(sz)
+        return num_vertices
+
+    @property
+    def _vertex_offsets(self) -> Dict[str, int]:
+        if self.__vertex_offsets is None:
+            num_vertices = self._num_vertices()
+            ordered_keys = sorted(list(num_vertices.keys()))
+            self.__vertex_offsets = {}
+            offset = 0
+            for vtype in ordered_keys:
+                self.__vertex_offsets[vtype] = offset
+                offset += num_vertices[vtype]
+
+        return dict(self.__vertex_offsets)
+
+    @property
+    def is_homogeneous(self) -> bool:
+        return len(self._vertex_offsets) == 1
+
+    def __get_edgelist(self):
+        """
+        Returns
+        -------
+        Dict[str, torch.Tensor] with the following keys:
+            src: source vertices (int64)
+                Note that src is the 2nd element of the PyG edge index.
+            dst: destination vertices (int64)
+                Note that dst is the 1st element of the PyG edge index.
+            eid: edge ids for each edge (int64)
+                Note that these start from 0 for each edge type.
+            etp: edge types for each edge (int32)
+                Note that these are in lexicographic order.
+        """
+        sorted_keys = sorted(
+            list(self.__edge_indices.keys(leaves_only=True, include_nested=True))
+        )
+
+        # note that this still follows the PyG convention of (dst, rel, src)
+        # i.e. (author, writes, paper): [[0,1,2],[2,0,1]] is referring to a
+        # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1),
+        # and (paper 1) -> (author 2)
+        edge_index = torch.concat(
+            [
+                torch.stack(
+                    [
+                        self.__edge_indices[dst_type, rel_type, src_type][0]
+                        + self._vertex_offsets[dst_type],
+                        self.__edge_indices[dst_type, rel_type, src_type][1]
+                        + self._vertex_offsets[src_type],
+                    ]
+                )
+                for (dst_type, rel_type, src_type) in sorted_keys
+            ],
+            axis=1,
+        ).cuda()
+
+        edge_type_array = torch.arange(
+            len(sorted_keys), dtype=torch.int32, device="cuda"
+        ).repeat_interleave(
+            torch.tensor(
+                [self.__edge_indices[et].shape[1] for et in sorted_keys],
+                device="cuda",
+                dtype=torch.int32,
+            )
+        )
+
+        if self.is_multi_gpu:
+            rank = torch.distributed.get_rank()
+            world_size = torch.distributed.get_world_size()
+
+            num_edges_t = torch.tensor(
+                [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda"
+            )
+            num_edges_all_t = torch.empty(
+                world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda"
+            )
+            torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t)
+
+            if rank > 0:
+                start_offsets = num_edges_all_t[:rank].T.sum(axis=1)
+                edge_id_array = torch.concat(
+                    [
+                        torch.arange(
+                            start_offsets[i],
+                            start_offsets[i] + num_edges_all_t[rank][i],
+                            dtype=torch.int64,
+                            device="cuda",
+                        )
+                        for i in range(len(sorted_keys))
+                    ]
+                )
+            else:
+                edge_id_array = torch.concat(
+                    [
+                        torch.arange(
+                            self.__edge_indices[et].shape[1],
+                            dtype=torch.int64,
+                            device="cuda",
+                        )
+                        for et in sorted_keys
+                    ]
+                )
+
+        else:
+            # single GPU
+            edge_id_array = torch.concat(
+                [
+                    torch.arange(
+                        self.__edge_indices[et].shape[1],
+                        dtype=torch.int64,
+                        device="cuda",
+                    )
+                    for et in sorted_keys
+                ]
+            )
+
+        return {
+            "dst": edge_index[0],
+            "src": edge_index[1],
+            "etp": edge_type_array,
+            "eid": edge_id_array,
+        }
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/README.md b/python/cugraph-pyg/cugraph_pyg/examples/README.md
deleted file mode 100644
index 572111ac26a..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-This directory contains examples for running cugraph-pyg training.
-
-For single-GPU (SG) scripts, no special configuration is required.
-
-For multi-GPU (MG) scripts, dask must be started first in a separate process.
-To do this, the `start_dask.sh` script has been provided.  This scripts starts
-a dask scheduler and dask workers.  To select the GPUs and amount of memory
-allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE`
-arguments in that script can be modified.
-To connect to dask, the scheduler JSON file must be provided.  This can be done
-using the `--dask_scheduler_file` argument in the mg python script being run.
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py
index 29a6cc2b464..31cbaf69ca5 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py
@@ -95,7 +95,7 @@ def main():
     with tempfile.TemporaryDirectory() as directory:
         tmp.spawn(
             sample,
-            args=(world_size, uid, el, "."),
+            args=(world_size, uid, el, directory),
             nprocs=world_size,
         )
 
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py
index 8366ff44233..de45acc7456 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py
@@ -55,6 +55,8 @@ def sample(edgelist, directory):
         G,
         sample_writer,
         fanout=[5, 5],
+        compression="CSR",
+        retain_original_seeds=True,
     )
 
     sampler.sample_from_nodes(seeds, batch_size=16, random_state=62)
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
new file mode 100644
index 00000000000..71b0e4bb2fb
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import argparse
+import tempfile
+import os
+
+from typing import Optional
+
+import torch
+import cupy
+
+import rmm
+from rmm.allocators.cupy import rmm_cupy_allocator
+from rmm.allocators.torch import rmm_torch_allocator
+
+# Must change allocators immediately upon import
+# or else other imports will cause memory to be
+# allocated and prevent changing the allocator
+rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True)
+cupy.cuda.set_allocator(rmm_cupy_allocator)
+torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+import torch.nn.functional as F  # noqa: E402
+import torch_geometric  # noqa: E402
+import cugraph_pyg  # noqa: E402
+from cugraph_pyg.loader import NeighborLoader  # noqa: E402
+
+# Enable cudf spilling to save gpu memory
+from cugraph.testing.mg_utils import enable_spilling  # noqa: E402
+
+enable_spilling()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--hidden_channels", type=int, default=256)
+parser.add_argument("--num_layers", type=int, default=2)
+parser.add_argument("--lr", type=float, default=0.001)
+parser.add_argument("--epochs", type=int, default=4)
+parser.add_argument("--batch_size", type=int, default=1024)
+parser.add_argument("--fan_out", type=int, default=30)
+parser.add_argument("--tempdir_root", type=str, default=None)
+parser.add_argument("--dataset_root", type=str, default="dataset")
+parser.add_argument("--dataset", type=str, default="ogbn-products")
+
+args = parser.parse_args()
+
+wall_clock_start = time.perf_counter()
+device = torch.device("cuda")
+
+from ogb.nodeproppred import PygNodePropPredDataset  # noqa: E402
+
+dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
+split_idx = dataset.get_idx_split()
+data = dataset[0]
+
+graph_store = cugraph_pyg.data.GraphStore()
+graph_store[
+    ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
+] = data.edge_index
+
+feature_store = cugraph_pyg.data.TensorDictFeatureStore()
+feature_store["node", "x"] = data.x
+feature_store["node", "y"] = data.y
+
+with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir:
+    train_dir = os.path.join(samples_dir, "train")
+    os.mkdir(train_dir)
+    train_loader = NeighborLoader(
+        data=(feature_store, graph_store),
+        num_neighbors=[args.fan_out] * args.num_layers,
+        input_nodes=split_idx["train"],
+        replace=False,
+        batch_size=args.batch_size,
+        directory=train_dir,
+    )
+
+    val_dir = os.path.join(samples_dir, "val")
+    os.mkdir(val_dir)
+    val_loader = NeighborLoader(
+        data=(feature_store, graph_store),
+        num_neighbors=[args.fan_out] * args.num_layers,
+        input_nodes=split_idx["valid"],
+        replace=False,
+        batch_size=args.batch_size,
+        directory=val_dir,
+    )
+
+    test_dir = os.path.join(samples_dir, "test")
+    os.mkdir(test_dir)
+    test_loader = NeighborLoader(
+        data=(feature_store, graph_store),
+        num_neighbors=[args.fan_out] * args.num_layers,
+        input_nodes=split_idx["test"],
+        replace=False,
+        batch_size=args.batch_size,
+        directory=test_dir,
+    )
+
+    model = torch_geometric.nn.models.GCN(
+        dataset.num_features,
+        args.hidden_channels,
+        args.num_layers,
+        dataset.num_classes,
+    ).to(device)
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005)
+
+    warmup_steps = 20
+
+    def train(epoch: int):
+        """Run one training epoch and report the post-warmup time per iteration."""
+        model.train()
+        start_avg_time = None  # stays None if the epoch ends during warmup
+        for i, batch in enumerate(train_loader):
+            if i == warmup_steps:
+                torch.cuda.synchronize()
+                start_avg_time = time.perf_counter()
+            batch = batch.to(device)
+            optimizer.zero_grad()
+            batch_size = batch.batch_size
+            out = model(batch.x, batch.edge_index)[:batch_size]
+            y = batch.y[:batch_size].view(-1).to(torch.long)
+
+            loss = F.cross_entropy(out, y)
+            loss.backward()
+            optimizer.step()
+            if i % 10 == 0:
+                print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}")
+        torch.cuda.synchronize()
+        if start_avg_time is not None:
+            # Batches warmup_steps..i were timed, i.e. (i + 1 - warmup_steps) of them.
+            avg_time = (time.perf_counter() - start_avg_time) / (i + 1 - warmup_steps)
+            print(f"Average Training Iteration Time (s/iter): {avg_time:.6f}")
+
+    @torch.no_grad()
+    def test(loader: NeighborLoader, val_steps: Optional[int] = None):
+        """Return accuracy over `loader`, using at most `val_steps` batches if set."""
+        model.eval()
+        total_correct = total_examples = 0
+        for i, batch in enumerate(loader):
+            if val_steps is not None and i >= val_steps:
+                break
+            batch = batch.to(device)
+            batch_size = batch.batch_size
+            out = model(batch.x, batch.edge_index)[:batch_size]
+            pred = out.argmax(dim=-1)
+            y = batch.y[:batch_size].view(-1).to(torch.long)
+
+            total_correct += int((pred == y).sum())
+            total_examples += y.size(0)
+
+        return total_correct / total_examples
+
+    torch.cuda.synchronize()
+    prep_time = round(time.perf_counter() - wall_clock_start, 2)
+    print("Total time before training begins (prep_time)=", prep_time, "seconds")
+    print("Beginning training...")
+    for epoch in range(1, 1 + args.epochs):
+        train(epoch)
+        val_acc = test(val_loader, val_steps=100)
+        print(f"Val Acc: ~{val_acc:.4f}")
+
+    test_acc = test(test_loader)
+    print(f"Test Acc: {test_acc:.4f}")
+    total_time = round(time.perf_counter() - wall_clock_start, 2)
+    print("Total Program Runtime (total_time) =", total_time, "seconds")
+    print("total_time - prep_time =", total_time - prep_time, "seconds")
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py
new file mode 100644
index 00000000000..b1bb0240e71
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Single-node, multi-GPU example.
+
+import argparse
+import os
+import tempfile
+import time
+import warnings
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.nn.functional as F
+from ogb.nodeproppred import PygNodePropPredDataset
+from torch.nn.parallel import DistributedDataParallel
+
+import torch_geometric
+
+from cugraph.gnn import (
+    cugraph_comms_init,
+    cugraph_comms_shutdown,
+    cugraph_comms_create_unique_id,
+)
+
+# Allow computation on objects that are larger than GPU memory
+# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
+os.environ["CUDF_SPILL"] = "1"
+
+# Ensures that a CUDA context is not created on import of rapids.
+# Allows pytorch to create the context instead
+os.environ["RAPIDS_NO_INITIALIZE"] = "1"
+
+
+def init_pytorch_worker(rank, world_size, cugraph_id):
+    """Set up RMM, CuPy, torch.distributed (NCCL) and cuGraph comms for `rank`."""
+    import rmm
+    rmm.reinitialize(
+        devices=rank,
+        managed_memory=True,
+        pool_allocator=True,
+    )
+
+    import cupy
+
+    cupy.cuda.Device(rank).use()
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
+    from cugraph.testing.mg_utils import enable_spilling
+
+    enable_spilling()
+
+    torch.cuda.set_device(rank)
+
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
+
+
+def run_train(
+    rank,
+    data,
+    world_size,
+    cugraph_id,
+    model,
+    epochs,
+    batch_size,
+    fan_out,
+    split_idx,
+    num_classes,
+    wall_clock_start,
+    tempdir=None,
+    num_layers=3,
+):
+    """Per-rank worker: build the loaders, train, validate each epoch, then test."""
+    init_pytorch_worker(
+        rank,
+        world_size,
+        cugraph_id,
+    )
+
+    model = model.to(rank)
+    model = DistributedDataParallel(model, device_ids=[rank])
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
+
+    kwargs = dict(
+        num_neighbors=[fan_out] * num_layers,
+        batch_size=batch_size,
+    )
+    # Set Up Neighbor Loading
+    from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
+    from cugraph_pyg.loader import NeighborLoader
+
+    graph_store = GraphStore(is_multi_gpu=True)
+    ixr = torch.tensor_split(data.edge_index, world_size, dim=1)[rank]
+    graph_store[
+        ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
+    ] = ixr
+
+    feature_store = TensorDictFeatureStore()
+    feature_store["node", "x"] = data.x
+    feature_store["node", "y"] = data.y
+
+    dist.barrier()
+
+    ix_train = torch.tensor_split(split_idx["train"], world_size)[rank].cuda()
+    train_path = os.path.join(tempdir, f"train_{rank}")
+    os.mkdir(train_path)
+    train_loader = NeighborLoader(
+        (feature_store, graph_store),
+        input_nodes=ix_train,
+        directory=train_path,
+        shuffle=True,
+        drop_last=True,
+        **kwargs,
+    )
+
+    ix_test = torch.tensor_split(split_idx["test"], world_size)[rank].cuda()
+    test_path = os.path.join(tempdir, f"test_{rank}")
+    os.mkdir(test_path)
+    test_loader = NeighborLoader(
+        (feature_store, graph_store),
+        input_nodes=ix_test,
+        directory=test_path,
+        shuffle=True,
+        drop_last=True,
+        local_seeds_per_call=80000,
+        **kwargs,
+    )
+
+    ix_valid = torch.tensor_split(split_idx["valid"], world_size)[rank].cuda()
+    valid_path = os.path.join(tempdir, f"valid_{rank}")
+    os.mkdir(valid_path)
+    valid_loader = NeighborLoader(
+        (feature_store, graph_store),
+        input_nodes=ix_valid,
+        directory=valid_path,
+        shuffle=True,
+        drop_last=True,
+        **kwargs,
+    )
+
+    dist.barrier()
+
+    eval_steps = 1000
+    warmup_steps = 20
+    dist.barrier()
+    torch.cuda.synchronize()
+
+    if rank == 0:
+        prep_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total time before training begins (prep_time) =", prep_time, "seconds")
+        print("Beginning training...")
+    for epoch in range(epochs):
+        start = None  # set once warmup_steps batches have run in this epoch
+        for i, batch in enumerate(train_loader):
+            if i == warmup_steps:
+                torch.cuda.synchronize()
+                start = time.time()
+
+            batch = batch.to(rank)
+            batch_size = batch.batch_size
+
+            batch.y = batch.y.to(torch.long)
+            optimizer.zero_grad()
+            out = model(batch.x, batch.edge_index)
+            loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size])
+            loss.backward()
+            optimizer.step()
+            if rank == 0 and i % 10 == 0:
+                print(
+                    "Epoch: "
+                    + str(epoch)
+                    + ", Iteration: "
+                    + str(i)
+                    + ", Loss: "
+                    + str(loss)
+                )
+
+        if rank == 0 and start is not None:
+            print(
+                "Average Training Iteration Time:",
+                (time.time() - start) / (i + 1.0 - warmup_steps),
+                "s/iter",
+            )
+
+        with torch.no_grad():
+            total_correct = total_examples = 0
+            for i, batch in enumerate(valid_loader):
+                if i >= eval_steps:
+                    break
+
+                batch = batch.to(rank)
+                batch_size = batch.batch_size
+
+                batch.y = batch.y.to(torch.long)
+                out = model(batch.x, batch.edge_index)[:batch_size]
+
+                pred = out.argmax(dim=-1)
+                y = batch.y[:batch_size].view(-1).to(torch.long)
+
+                total_correct += int((pred == y).sum())
+                total_examples += y.size(0)
+
+            acc_val = total_correct / max(total_examples, 1)  # 0.0 if no batches
+            if rank == 0:
+                print(
+                    f"Validation Accuracy: {acc_val * 100.0:.4f}%",
+                )
+
+        torch.cuda.synchronize()
+
+    with torch.no_grad():
+        total_correct = total_examples = 0
+        for i, batch in enumerate(test_loader):
+            batch = batch.to(rank)
+            batch_size = batch.batch_size
+
+            batch.y = batch.y.to(torch.long)
+            out = model(batch.x, batch.edge_index)[:batch_size]
+
+            pred = out.argmax(dim=-1)
+            y = batch.y[:batch_size].view(-1).to(torch.long)
+
+            total_correct += int((pred == y).sum())
+            total_examples += y.size(0)
+
+        acc_test = total_correct / max(total_examples, 1)  # 0.0 if no batches
+        if rank == 0:
+            print(
+                f"Test Accuracy: {acc_test * 100.0:.4f}%",
+            )
+
+    if rank == 0:
+        total_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total Program Runtime (total_time) =", total_time, "seconds")
+        print("total_time - prep_time =", total_time - prep_time, "seconds")
+
+    cugraph_comms_shutdown()
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1":
+        warnings.warn("Skipping SMNG example in CI due to memory limit")
+    else:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--hidden_channels", type=int, default=256)
+        parser.add_argument("--num_layers", type=int, default=2)
+        parser.add_argument("--lr", type=float, default=0.001)
+        parser.add_argument("--epochs", type=int, default=4)
+        parser.add_argument("--batch_size", type=int, default=1024)
+        parser.add_argument("--fan_out", type=int, default=30)
+        parser.add_argument("--tempdir_root", type=str, default=None)
+        parser.add_argument("--dataset_root", type=str, default="dataset")
+        parser.add_argument("--dataset", type=str, default="ogbn-products")
+
+        parser.add_argument(
+            "--n_devices",
+            type=int,
+            default=-1,
+            help="1-8 to use that many GPUs. Defaults to all available GPUs",
+        )
+
+        args = parser.parse_args()
+        wall_clock_start = time.perf_counter()
+
+        from rmm.allocators.torch import rmm_torch_allocator
+
+        torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+        dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
+        split_idx = dataset.get_idx_split()
+        data = dataset[0]
+        data.y = data.y.reshape(-1)
+
+        model = torch_geometric.nn.models.GCN(
+            dataset.num_features,
+            args.hidden_channels,
+            args.num_layers,
+            dataset.num_classes,
+        )
+
+        print("Data =", data)
+        if args.n_devices == -1:
+            world_size = torch.cuda.device_count()
+        else:
+            world_size = args.n_devices
+        print("Using", world_size, "GPUs...")
+
+        # Create the uid needed for cuGraph comms
+        cugraph_id = cugraph_comms_create_unique_id()
+
+        with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir:
+            mp.spawn(
+                run_train,
+                args=(
+                    data,
+                    world_size,
+                    cugraph_id,
+                    model,
+                    args.epochs,
+                    args.batch_size,
+                    args.fan_out,
+                    split_idx,
+                    dataset.num_classes,
+                    wall_clock_start,
+                    tempdir,
+                    args.num_layers,
+                ),
+                nprocs=world_size,
+                join=True,
+            )
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
index 80d683e6c79..145675c8a06 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
@@ -11,6 +11,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# For this script, dask must be started first in a separate process.
+# To do this, the `start_dask.sh` script has been provided.  This script starts
+# a dask scheduler and dask workers.  To select the GPUs and amount of memory
+# allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE`
+# arguments in that script can be modified.
+# To connect to dask, the scheduler JSON file must be provided.  This can be done
+# using the `--dask_scheduler_file` argument in the mg python script being run.
 
 from ogb.nodeproppred import NodePropPredDataset
 
@@ -159,8 +166,8 @@ def train(
     td.barrier()
 
     import cugraph
-    from cugraph_pyg.data import CuGraphStore
-    from cugraph_pyg.loader import CuGraphNeighborLoader
+    from cugraph_pyg.data import DaskGraphStore
+    from cugraph_pyg.loader import DaskNeighborLoader
 
     if rank == 0:
         print("Rank 0 downloading dataset")
@@ -212,7 +219,7 @@ def train(
         # Rank 0 will initialize the distributed cugraph graph.
         cugraph_store_create_start = time.perf_counter_ns()
         print("G:", G[("paper", "cites", "paper")].shape)
-        cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True)
+        cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True)
         cugraph_store_create_end = time.perf_counter_ns()
         print(
             "cuGraph Store created on rank 0 in "
@@ -237,7 +244,7 @@ def train(
 
             # Will automatically use the stored distributed cugraph graph on rank 0.
             cugraph_store_create_start = time.perf_counter_ns()
-            cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True)
+            cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True)
             cugraph_store_create_end = time.perf_counter_ns()
             print(
                 f"Rank {rank} created cugraph store in "
@@ -269,7 +276,7 @@ def train(
         model.train()
 
         start_time_loader = time.perf_counter_ns()
-        cugraph_bulk_loader = CuGraphNeighborLoader(
+        cugraph_bulk_loader = DaskNeighborLoader(
             cugraph_store,
             train_nodes,
             batch_size=250,
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
index 58a403084df..e0169ee2c25 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
@@ -74,8 +74,8 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
     init_pytorch_worker(device)
 
     import cugraph
-    from cugraph_pyg.data import CuGraphStore
-    from cugraph_pyg.loader import CuGraphNeighborLoader
+    from cugraph_pyg.data import DaskGraphStore
+    from cugraph_pyg.loader import DaskNeighborLoader
 
     from ogb.nodeproppred import NodePropPredDataset
 
@@ -106,7 +106,7 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
 
     fs.add_data(train_mask, "paper", "train")
 
-    cugraph_store = CuGraphStore(fs, G, N)
+    cugraph_store = DaskGraphStore(fs, G, N)
 
     model = (
         CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3)
@@ -120,7 +120,7 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
         start_time_train = time.perf_counter_ns()
         model.train()
 
-        cugraph_bulk_loader = CuGraphNeighborLoader(
+        cugraph_bulk_loader = DaskNeighborLoader(
             cugraph_store, train_nodes, batch_size=500, num_neighbors=[10, 25]
         )
 
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py
index 2c3d7eff89e..cad66aaa183 100644
--- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,6 +11,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph_pyg.loader.cugraph_node_loader import CuGraphNeighborLoader
+import warnings
 
-from cugraph_pyg.loader.cugraph_node_loader import BulkSampleLoader
+from cugraph_pyg.loader.node_loader import NodeLoader
+from cugraph_pyg.loader.neighbor_loader import NeighborLoader
+
+from cugraph_pyg.loader.dask_node_loader import DaskNeighborLoader
+
+from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader
+
+
+def CuGraphNeighborLoader(*args, **kwargs):
+    """Deprecated alias: warn, then construct a DaskNeighborLoader."""
+    msg = "CuGraphNeighborLoader has been renamed to DaskNeighborLoader"
+    warnings.warn(msg, FutureWarning, stacklevel=2)  # blame the caller's line
+    return DaskNeighborLoader(*args, **kwargs)
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py
similarity index 97%
rename from python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
rename to python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py
index 55c9e9b3329..aaf82dd46bb 100644
--- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py
@@ -23,8 +23,8 @@
 from cugraph.gnn import BulkSampler
 from cugraph.utilities.utils import import_optional, MissingModule
 
-from cugraph_pyg.data import CuGraphStore
-from cugraph_pyg.sampler.cugraph_sampler import (
+from cugraph_pyg.data import DaskGraphStore
+from cugraph_pyg.sampler.sampler_utils import (
     _sampler_output_from_sampling_results_heterogeneous,
     _sampler_output_from_sampling_results_homogeneous_csr,
     _sampler_output_from_sampling_results_homogeneous_coo,
@@ -47,8 +47,8 @@ class BulkSampleLoader:
 
     def __init__(
         self,
-        feature_store: CuGraphStore,
-        graph_store: CuGraphStore,
+        feature_store: DaskGraphStore,
+        graph_store: DaskGraphStore,
         input_nodes: InputNodes = None,
         batch_size: int = 0,
         *,
@@ -72,10 +72,10 @@ def __init__(
 
         Parameters
         ----------
-        feature_store: CuGraphStore
+        feature_store: DaskGraphStore
             The feature store containing features for the graph.
 
-        graph_store: CuGraphStore
+        graph_store: DaskGraphStore
             The graph store containing the graph structure.
 
         input_nodes: InputNodes
@@ -487,10 +487,10 @@ def __iter__(self):
         return self
 
 
-class CuGraphNeighborLoader:
+class DaskNeighborLoader:
     def __init__(
         self,
-        data: Union[CuGraphStore, Tuple[CuGraphStore, CuGraphStore]],
+        data: Union[DaskGraphStore, Tuple[DaskGraphStore, DaskGraphStore]],
         input_nodes: Union[InputNodes, int] = None,
         batch_size: int = None,
         **kwargs,
@@ -498,8 +498,8 @@ def __init__(
         """
         Parameters
         ----------
-        data: CuGraphStore or (CuGraphStore, CuGraphStore)
-            The CuGraphStore or stores where the graph/feature data is held.
+        data: DaskGraphStore or (DaskGraphStore, DaskGraphStore)
+            The DaskGraphStore or stores where the graph/feature data is held.
 
         batch_size: int (required)
             The number of input nodes in each batch.
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py
new file mode 100644
index 00000000000..3d29ee3aca3
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+import tempfile
+
+from typing import Union, Tuple, Optional, Callable, List, Dict
+
+import cugraph_pyg
+from cugraph_pyg.loader import NodeLoader
+from cugraph_pyg.sampler import BaseSampler
+
+from cugraph.gnn import UniformNeighborSampler, DistSampleWriter
+from cugraph.utilities.utils import import_optional
+
+torch_geometric = import_optional("torch_geometric")
+
+
+class NeighborLoader(NodeLoader):
+    """
+    Node loader that implements the neighbor sampling
+    algorithm used in GraphSAGE.
+
+    Duck-typed version of torch_geometric.loader.NeighborLoader
+    """
+
+    def __init__(
+        self,
+        data: Union[
+            "torch_geometric.data.Data",
+            "torch_geometric.data.HeteroData",
+            Tuple[
+                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
+            ],
+        ],
+        num_neighbors: Union[
+            List[int], Dict["torch_geometric.typing.EdgeType", List[int]]
+        ],
+        input_nodes: "torch_geometric.typing.InputNodes" = None,
+        input_time: "torch_geometric.typing.OptTensor" = None,
+        replace: bool = False,
+        subgraph_type: Union[
+            "torch_geometric.typing.SubgraphType", str
+        ] = "directional",
+        disjoint: bool = False,
+        temporal_strategy: str = "uniform",
+        time_attr: Optional[str] = None,
+        weight_attr: Optional[str] = None,
+        transform: Optional[Callable] = None,
+        transform_sampler_output: Optional[Callable] = None,
+        is_sorted: bool = False,
+        filter_per_worker: Optional[bool] = None,
+        neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None,
+        directed: bool = True,  # Deprecated.
+        batch_size: int = 16,
+        directory: str = None,
+        batches_per_partition=256,
+        format: str = "parquet",
+        compression: Optional[str] = None,
+        local_seeds_per_call: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
+            See torch_geometric.loader.NeighborLoader.
+        num_neighbors: List[int] or Dict[EdgeType, List[int]]
+            Fanout values.
+            See torch_geometric.loader.NeighborLoader.
+        input_nodes: InputNodes
+            Input nodes for sampling.
+            See torch_geometric.loader.NeighborLoader.
+        input_time: OptTensor (optional)
+            See torch_geometric.loader.NeighborLoader.
+        replace: bool (optional, default=False)
+            Whether to sample with replacement.
+            See torch_geometric.loader.NeighborLoader.
+        subgraph_type: Union[SubgraphType, str] (optional, default='directional')
+            The type of subgraph to return.
+            Currently only 'directional' is supported.
+            See torch_geometric.loader.NeighborLoader.
+        disjoint: bool (optional, default=False)
+            Whether to perform disjoint sampling.
+            Currently unsupported.
+            See torch_geometric.loader.NeighborLoader.
+        temporal_strategy: str (optional, default='uniform')
+            Currently only 'uniform' is supported.
+            See torch_geometric.loader.NeighborLoader.
+        time_attr: str (optional, default=None)
+            Used for temporal sampling.
+            See torch_geometric.loader.NeighborLoader.
+        weight_attr: str (optional, default=None)
+            Used for biased sampling.
+            See torch_geometric.loader.NeighborLoader.
+        transform: Callable (optional, default=None)
+            See torch_geometric.loader.NeighborLoader.
+        transform_sampler_output: Callable (optional, default=None)
+            See torch_geometric.loader.NeighborLoader.
+        is_sorted: bool (optional, default=False)
+            Ignored by cuGraph.
+            See torch_geometric.loader.NeighborLoader.
+        filter_per_worker: bool (optional, default=False)
+            Currently ignored by cuGraph, but this may
+            change once in-memory sampling is implemented.
+            See torch_geometric.loader.NeighborLoader.
+        neighbor_sampler: torch_geometric.sampler.NeighborSampler
+            (optional, default=None)
+            Not supported by cuGraph.
+            See torch_geometric.loader.NeighborLoader.
+        directed: bool (optional, default=True)
+            Deprecated.
+            See torch_geometric.loader.NeighborLoader.
+        batch_size: int (optional, default=16)
+            The number of input nodes per output minibatch.
+            See torch.utils.dataloader.
+        directory: str (optional, default=None)
+            The directory where samples will be temporarily stored.
+            It is recommended that this be set by the user; usually
+            setting it to a tempfile.TemporaryDirectory with a context
+            manager is a good option, but depending on the filesystem,
+            you may want to choose an alternative location with fast I/O
+            instead.
+            If not set, this will create a TemporaryDirectory that will
+            persist until this object is garbage collected.
+            See cugraph.gnn.DistSampleWriter.
+        batches_per_partition: int (optional, default=256)
+            The number of batches per partition if writing samples to
+            disk.  Manually tuning this parameter is not recommended
+            but reducing it may help conserve GPU memory.
+            See cugraph.gnn.DistSampleWriter.
+        format: str (optional, default='parquet')
+            If writing samples to disk, they will be written in this
+            file format.
+            See cugraph.gnn.DistSampleWriter.
+        compression: str (optional, default=None)
+            The compression type for samples; must be 'CSR' or 'COO'.
+            If not provided, 'CSR' is used.
+        local_seeds_per_call: int (optional, default=None)
+            The number of seeds to process within a single sampling call.
+            Manually tuning this parameter is not recommended but reducing
+            it may conserve GPU memory.  The total number of seeds processed
+            per sampling call is equal to the sum of this parameter across
+            all workers.  If not provided, it will be automatically
+            calculated.
+            See cugraph.gnn.DistSampler.
+        **kwargs
+            Other keyword arguments passed to the superclass.
+        """
+
+        subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type)
+
+        if not directed:
+            subgraph_type = torch_geometric.sampler.base.SubgraphType.induced
+            warnings.warn(
+                "The 'directed' argument is deprecated. "
+                "Use subgraph_type='induced' instead."
+            )
+        if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional:
+            raise ValueError("Only directional subgraphs are currently supported")
+        if disjoint:
+            raise ValueError("Disjoint sampling is currently unsupported")
+        if temporal_strategy != "uniform":
+            warnings.warn("Only the uniform temporal strategy is currently supported")
+        if neighbor_sampler is not None:
+            raise ValueError("Passing a neighbor sampler is currently unsupported")
+        if time_attr is not None:
+            raise ValueError("Temporal sampling is currently unsupported")
+        if weight_attr is not None:
+            raise ValueError("Biased sampling is currently unsupported")
+        if is_sorted:
+            warnings.warn("The 'is_sorted' argument is ignored by cuGraph.")
+        if not isinstance(data, (list, tuple)) or not isinstance(
+            data[1], cugraph_pyg.data.GraphStore
+        ):
+            # Will eventually automatically convert these objects to cuGraph objects.
+            raise NotImplementedError("Currently can't accept non-cugraph graphs")
+
+        if directory is None:
+            warnings.warn("Setting a directory to store samples is recommended.")
+            self._tempdir = tempfile.TemporaryDirectory()
+            directory = self._tempdir.name
+
+        if compression is None:
+            compression = "CSR"
+        elif compression not in ["CSR", "COO"]:
+            raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')")
+
+        writer = DistSampleWriter(
+            directory=directory,
+            batches_per_partition=batches_per_partition,
+            format=format,
+        )
+
+        feature_store, graph_store = data
+        sampler = BaseSampler(
+            UniformNeighborSampler(
+                graph_store._graph,
+                writer,
+                retain_original_seeds=True,
+                fanout=num_neighbors,
+                prior_sources_behavior="exclude",
+                deduplicate_sources=True,
+                compression=compression,
+                compress_per_hop=False,
+                with_replacement=replace,
+                local_seeds_per_call=local_seeds_per_call,
+            ),
+            (feature_store, graph_store),
+            batch_size=batch_size,
+        )
+        # TODO add heterogeneous support and pass graph_store._vertex_offsets
+
+        super().__init__(
+            (feature_store, graph_store),
+            sampler,
+            input_nodes=input_nodes,
+            input_time=input_time,
+            transform=transform,
+            transform_sampler_output=transform_sampler_output,
+            filter_per_worker=filter_per_worker,
+            batch_size=batch_size,
+            **kwargs,
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py
new file mode 100644
index 00000000000..56b58352a7c
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+import cugraph_pyg
+from typing import Union, Tuple, Callable, Optional
+
+from cugraph.utilities.utils import import_optional
+
+torch_geometric = import_optional("torch_geometric")
+torch = import_optional("torch")
+
+
+class NodeLoader:
+    """
+    Duck-typed version of torch_geometric.loader.NodeLoader
+    """
+
+    def __init__(
+        self,
+        data: Union[
+            "torch_geometric.data.Data",
+            "torch_geometric.data.HeteroData",
+            Tuple[
+                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
+            ],
+        ],
+        node_sampler: "cugraph_pyg.sampler.BaseSampler",
+        input_nodes: "torch_geometric.typing.InputNodes" = None,
+        input_time: "torch_geometric.typing.OptTensor" = None,
+        transform: Optional[Callable] = None,
+        transform_sampler_output: Optional[Callable] = None,
+        filter_per_worker: Optional[bool] = None,
+        custom_cls: Optional["torch_geometric.data.HeteroData"] = None,
+        input_id: "torch_geometric.typing.OptTensor" = None,
+        batch_size: int = 1,
+        shuffle: bool = False,
+        drop_last: bool = False,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+            data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
+                See torch_geometric.loader.NodeLoader.
+            node_sampler: BaseSampler
+                See torch_geometric.loader.NodeLoader.
+            input_nodes: InputNodes
+                See torch_geometric.loader.NodeLoader.
+            input_time: OptTensor
+                See torch_geometric.loader.NodeLoader.
+            transform: Callable (optional, default=None)
+                This argument currently has no effect.
+            transform_sampler_output: Callable (optional, default=None)
+                This argument currently has no effect.
+            filter_per_worker: bool (optional, default=False)
+                This argument currently has no effect.
+            custom_cls: HeteroData
+                This argument currently has no effect.  This loader will
+                always return a Data or HeteroData object.
+            input_id: OptTensor
+                See torch_geometric.loader.NodeLoader.
+
+        """
+        if not isinstance(data, (list, tuple)) or not isinstance(
+            data[1], cugraph_pyg.data.GraphStore
+        ):
+            # Will eventually automatically convert these objects to cuGraph objects.
+            raise NotImplementedError("Currently can't accept non-cugraph graphs")
+
+        if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler):
+            raise NotImplementedError("Must provide a cuGraph sampler")
+
+        if input_time is not None:
+            raise ValueError("Temporal sampling is currently unsupported")
+
+        if filter_per_worker:
+            warnings.warn("filter_per_worker is currently ignored")
+
+        if custom_cls is not None:
+            warnings.warn("custom_cls is currently ignored")
+
+        if transform is not None:
+            warnings.warn("transform is currently ignored.")
+
+        if transform_sampler_output is not None:
+            warnings.warn("transform_sampler_output is currently ignored.")
+
+        (
+            input_type,
+            input_nodes,
+            input_id,
+        ) = torch_geometric.loader.utils.get_input_nodes(
+            data,
+            input_nodes,
+            input_id,
+        )
+
+        self.__input_data = torch_geometric.loader.node_loader.NodeSamplerInput(
+            input_id=input_id,
+            node=input_nodes,
+            time=None,
+            input_type=input_type,
+        )
+
+        self.__data = data
+
+        self.__node_sampler = node_sampler
+
+        self.__batch_size = batch_size
+        self.__shuffle = shuffle
+        self.__drop_last = drop_last
+
+    def __iter__(self):
+        """Return a SampleIterator over samples drawn from the input nodes."""
+        if self.__shuffle:
+            perm = torch.randperm(self.__input_data.node.numel())
+        else:
+            perm = torch.arange(self.__input_data.node.numel())
+
+        if self.__drop_last:
+            d = perm.numel() % self.__batch_size
+            perm = perm[: perm.numel() - d]  # perm[:-d] is empty when d == 0
+
+        input_data = torch_geometric.loader.node_loader.NodeSamplerInput(
+            input_id=None
+            if self.__input_data.input_id is None
+            else self.__input_data.input_id[perm],
+            node=self.__input_data.node[perm],
+            time=None
+            if self.__input_data.time is None
+            else self.__input_data.time[perm],
+            input_type=self.__input_data.input_type,
+        )
+        return cugraph_pyg.sampler.SampleIterator(
+            self.__data, self.__node_sampler.sample_from_nodes(input_data)
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
index 2ec68a8b4ac..34fe9c4463e 100644
--- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,3 +10,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py
new file mode 100644
index 00000000000..101f7b042be
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py
@@ -0,0 +1,323 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Iterator, Union, Dict, Tuple
+
+from cugraph.utilities.utils import import_optional
+from cugraph.gnn import DistSampler, DistSampleReader
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+
+class SampleIterator:
+    def __init__(
+        self,
+        data: Tuple[
+            "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
+        ],
+        output_iter: Iterator[
+            Union[
+                "torch_geometric.sampler.HeteroSamplerOutput",
+                "torch_geometric.sampler.SamplerOutput",
+            ]
+        ],
+    ):
+        self.__feature_store, self.__graph_store = data
+        self.__output_iter = output_iter
+
+    def __next__(self):
+        """Build the data object for the next sampler output and return it."""
+        next_sample = next(self.__output_iter)
+        if isinstance(next_sample, torch_geometric.sampler.SamplerOutput):
+            sz = next_sample.edge.numel()
+            if sz == next_sample.col.numel():
+                col = next_sample.col
+            else:
+                # col does not hold one entry per edge; expand it to indices.
+                col = torch_geometric.edge_index.ptr2index(next_sample.col, sz)
+
+            data = torch_geometric.loader.utils.filter_custom_store(
+                self.__feature_store,
+                self.__graph_store,
+                next_sample.node,
+                next_sample.row,
+                col,
+                next_sample.edge,
+                None,
+            )
+
+            if "n_id" not in data:
+                data.n_id = next_sample.node
+            if next_sample.edge is not None and "e_id" not in data:
+                edge = next_sample.edge.to(torch.long)
+                data.e_id = edge
+
+            data.batch = next_sample.batch
+            data.num_sampled_nodes = next_sample.num_sampled_nodes
+            data.num_sampled_edges = next_sample.num_sampled_edges
+
+            data.input_id = data.batch
+            data.seed_time = None
+            data.batch_size = data.input_id.size(0)
+
+        elif isinstance(next_sample, torch_geometric.sampler.HeteroSamplerOutput):
+            col = {}
+            for edge_type, col_idx in next_sample.col.items():
+                sz = next_sample.edge[edge_type].numel()
+                if sz == col_idx.numel():
+                    col[edge_type] = col_idx
+                else:
+                    col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz)
+
+            data = torch_geometric.loader.utils.filter_custom_hetero_store(
+                self.__feature_store,
+                self.__graph_store,
+                next_sample.node,
+                next_sample.row,
+                col,
+                next_sample.edge,
+                None,
+            )
+
+            for key, node in next_sample.node.items():
+                if "n_id" not in data[key]:
+                    data[key].n_id = node
+
+            for key, edge in (next_sample.edge or {}).items():
+                if edge is not None and "e_id" not in data[key]:
+                    edge = edge.to(torch.long)
+                    data[key].e_id = edge
+
+            data.set_value_dict("batch", next_sample.batch)
+            data.set_value_dict("num_sampled_nodes", next_sample.num_sampled_nodes)
+            data.set_value_dict("num_sampled_edges", next_sample.num_sampled_edges)
+
+            # TODO figure out how to set input_id for heterogeneous output
+        else:
+            raise ValueError("Invalid output type")
+
+        return data
+
+    def __iter__(self):
+        return self
+
+
+class SampleReader:
+    def __init__(self, base_reader: DistSampleReader):
+        self.__base_reader = base_reader
+        self.__num_samples_remaining = 0
+        self.__index = 0
+
+    def __next__(self):
+        if self.__num_samples_remaining == 0:
+            # raw_sample_data is already a dict of tensors
+            self.__raw_sample_data, start_inclusive, end_inclusive = next(
+                self.__base_reader
+            )
+
+            self.__raw_sample_data["label_hop_offsets"] -= self.__raw_sample_data[
+                "label_hop_offsets"
+            ][0].clone()
+            self.__raw_sample_data["renumber_map_offsets"] -= self.__raw_sample_data[
+                "renumber_map_offsets"
+            ][0].clone()
+            if "major_offsets" in self.__raw_sample_data:
+                self.__raw_sample_data["major_offsets"] -= self.__raw_sample_data[
+                    "major_offsets"
+                ][0].clone()
+
+            self.__num_samples_remaining = end_inclusive - start_inclusive + 1
+            self.__index = 0
+
+        out = self._decode(self.__raw_sample_data, self.__index)
+        self.__index += 1
+        self.__num_samples_remaining -= 1
+        return out
+
+    def __iter__(self):
+        return self
+
+
+class HomogeneousSampleReader(SampleReader):
+    def __init__(self, base_reader: DistSampleReader):
+        super().__init__(base_reader)
+
+    def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
+        fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // (
+            raw_sample_data["renumber_map_offsets"].numel() - 1
+        )
+
+        major_offsets_start_incl = raw_sample_data["label_hop_offsets"][
+            index * fanout_length
+        ]
+        major_offsets_end_incl = raw_sample_data["label_hop_offsets"][
+            (index + 1) * fanout_length
+        ]
+
+        major_offsets = raw_sample_data["major_offsets"][
+            major_offsets_start_incl : major_offsets_end_incl + 1
+        ].clone()
+        minors = raw_sample_data["minors"][major_offsets[0] : major_offsets[-1]]
+        edge_id = raw_sample_data["edge_id"][major_offsets[0] : major_offsets[-1]]
+        # don't retrieve edge type for a homogeneous graph
+
+        major_offsets -= major_offsets[0].clone()
+
+        renumber_map_start = raw_sample_data["renumber_map_offsets"][index]
+        renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1]
+
+        renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end]
+
+        current_label_hop_offsets = raw_sample_data["label_hop_offsets"][
+            index * fanout_length : (index + 1) * fanout_length + 1
+        ].clone()
+        current_label_hop_offsets -= current_label_hop_offsets[0].clone()
+
+        num_sampled_edges = major_offsets[current_label_hop_offsets].diff()
+
+        num_sampled_nodes_hops = torch.tensor(
+            [
+                minors[: num_sampled_edges[:i].sum()].max() + 1
+                for i in range(1, fanout_length + 1)
+            ],
+            device="cpu",
+        )
+
+        num_seeds = (
+            torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu()
+        )
+        num_sampled_nodes = torch.concat(
+            [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)]
+        )
+
+        return torch_geometric.sampler.SamplerOutput(
+            node=renumber_map.cpu(),
+            row=minors,
+            col=major_offsets,
+            edge=edge_id,
+            batch=renumber_map[:num_seeds],
+            num_sampled_nodes=num_sampled_nodes.cpu(),
+            num_sampled_edges=num_sampled_edges.cpu(),
+        )
+
+    def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
+        fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // (
+            raw_sample_data["renumber_map_offsets"].numel() - 1
+        )
+
+        major_minor_start = raw_sample_data["label_hop_offsets"][index * fanout_length]
+        ix_end = (index + 1) * fanout_length
+        if ix_end == raw_sample_data["label_hop_offsets"].numel():
+            major_minor_end = raw_sample_data["majors"].numel()
+        else:
+            major_minor_end = raw_sample_data["label_hop_offsets"][ix_end]
+
+        majors = raw_sample_data["majors"][major_minor_start:major_minor_end]
+        minors = raw_sample_data["minors"][major_minor_start:major_minor_end]
+        edge_id = raw_sample_data["edge_id"][major_minor_start:major_minor_end]
+        # don't retrieve edge type for a homogeneous graph
+
+        renumber_map_start = raw_sample_data["renumber_map_offsets"][index]
+        renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1]
+
+        renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end]
+
+        num_sampled_edges = (
+            raw_sample_data["label_hop_offsets"][
+                index * fanout_length : (index + 1) * fanout_length + 1
+            ]
+            .diff()
+            .cpu()
+        )
+
+        num_seeds = (majors[: num_sampled_edges[0]].max() + 1).reshape((1,)).cpu()
+        num_sampled_nodes_hops = torch.tensor(
+            [
+                minors[: num_sampled_edges[:i].sum()].max() + 1
+                for i in range(1, fanout_length + 1)
+            ],
+            device="cpu",
+        )
+
+        num_sampled_nodes = torch.concat(
+            [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)]
+        )
+
+        return torch_geometric.sampler.SamplerOutput(
+            node=renumber_map.cpu(),
+            row=minors,
+            col=majors,
+            edge=edge_id,
+            batch=renumber_map[:num_seeds],
+            num_sampled_nodes=num_sampled_nodes,
+            num_sampled_edges=num_sampled_edges,
+        )
+
+    def _decode(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
+        if "major_offsets" in raw_sample_data:
+            return self.__decode_csc(raw_sample_data, index)
+        else:
+            return self.__decode_coo(raw_sample_data, index)
+
+
+class BaseSampler:
+    def __init__(
+        self,
+        sampler: DistSampler,
+        data: Tuple[
+            "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
+        ],
+        batch_size: int = 16,
+    ):
+        self.__sampler = sampler
+        self.__feature_store, self.__graph_store = data
+        self.__batch_size = batch_size
+
+    def sample_from_nodes(
+        self, index: "torch_geometric.sampler.NodeSamplerInput", **kwargs
+    ) -> Iterator[
+        Union[
+            "torch_geometric.sampler.HeteroSamplerOutput",
+            "torch_geometric.sampler.SamplerOutput",
+        ]
+    ]:
+        self.__sampler.sample_from_nodes(
+            index.node, batch_size=self.__batch_size, **kwargs
+        )
+
+        edge_attrs = self.__graph_store.get_all_edge_attrs()
+        if (
+            len(edge_attrs) == 1
+            and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2]
+        ):
+            return HomogeneousSampleReader(self.__sampler.get_reader())
+        else:
+            # TODO implement heterogeneous sampling
+            raise NotImplementedError(
+                "Sampling heterogeneous graphs is currently"
+                " unsupported in the non-dask API"
+            )
+
+    def sample_from_edges(
+        self,
+        index: "torch_geometric.sampler.EdgeSamplerInput",
+        neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"],
+        **kwargs,
+    ) -> Iterator[
+        Union[
+            "torch_geometric.sampler.HeteroSamplerOutput",
+            "torch_geometric.sampler.SamplerOutput",
+        ]
+    ]:
+        raise NotImplementedError("Edge sampling is currently unimplemented.")
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py
similarity index 89%
rename from python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
rename to python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py
index 8bcfb783ae1..c3e19393970 100644
--- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py
@@ -14,7 +14,7 @@
 
 from typing import Sequence, Dict, Tuple
 
-from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.data import DaskGraphStore
 
 from cugraph.utilities.utils import import_optional
 import cudf
@@ -28,7 +28,7 @@
 
 def _get_unique_nodes(
     sampling_results: cudf.DataFrame,
-    graph_store: CuGraphStore,
+    graph_store: DaskGraphStore,
     node_type: str,
     node_position: str,
 ) -> int:
@@ -40,7 +40,7 @@ def _get_unique_nodes(
     sampling_results: cudf.DataFrame
         The dataframe containing sampling results or filtered sampling results
         (i.e. sampling results for hop 2)
-    graph_store: CuGraphStore
+    graph_store: DaskGraphStore
         The graph store containing the structure of the sampled graph.
     node_type: str
         The node type to count the number of unique nodes of.
@@ -81,7 +81,7 @@ def _get_unique_nodes(
 def _sampler_output_from_sampling_results_homogeneous_coo(
     sampling_results: cudf.DataFrame,
     renumber_map: torch.Tensor,
-    graph_store: CuGraphStore,
+    graph_store: DaskGraphStore,
     data_index: Dict[Tuple[int, int], Dict[str, int]],
     batch_id: int,
     metadata: Sequence = None,
@@ -94,7 +94,7 @@ def _sampler_output_from_sampling_results_homogeneous_coo(
     renumber_map: torch.Tensor
         The tensor containing the renumber map, or None if there
         is no renumber map.
-    graph_store: CuGraphStore
+    graph_store: DaskGraphStore
         The graph store containing the structure of the sampled graph.
     data_index: Dict[Tuple[int, int], Dict[str, int]]
         Dictionary where keys are the batch id and hop id,
@@ -181,7 +181,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr(
     major_offsets: torch.Tensor,
     minors: torch.Tensor,
     renumber_map: torch.Tensor,
-    graph_store: CuGraphStore,
+    graph_store: DaskGraphStore,
     label_hop_offsets: torch.Tensor,
     batch_id: int,
     metadata: Sequence = None,
@@ -196,7 +196,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr(
     renumber_map: torch.Tensor
         The tensor containing the renumber map.
         Required.
-    graph_store: CuGraphStore
+    graph_store: DaskGraphStore
         The graph store containing the structure of the sampled graph.
     label_hop_offsets: torch.Tensor
         The tensor containing the label-hop offsets.
@@ -263,7 +263,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr(
 def _sampler_output_from_sampling_results_heterogeneous(
     sampling_results: cudf.DataFrame,
     renumber_map: cudf.Series,
-    graph_store: CuGraphStore,
+    graph_store: DaskGraphStore,
     metadata: Sequence = None,
 ) -> HeteroSamplerOutput:
     """
@@ -274,7 +274,7 @@ def _sampler_output_from_sampling_results_heterogeneous(
     renumber_map: cudf.Series
         The series containing the renumber map, or None if there
         is no renumber map.
-    graph_store: CuGraphStore
+    graph_store: DaskGraphStore
         The graph store containing the structure of the sampled graph.
     metadata: Tensor
         The metadata for the sampled batch.
@@ -403,41 +403,3 @@ def _sampler_output_from_sampling_results_heterogeneous(
         num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()},
         metadata=metadata,
     )
-
-
-def filter_cugraph_store_csc(
-    feature_store: torch_geometric.data.FeatureStore,
-    graph_store: torch_geometric.data.GraphStore,
-    node_dict: Dict[str, torch.Tensor],
-    row_dict: Dict[str, torch.Tensor],
-    col_dict: Dict[str, torch.Tensor],
-    edge_dict: Dict[str, Tuple[torch.Tensor]],
-) -> torch_geometric.data.HeteroData:
-    """
-    Deprecated
-    """
-
-    data = torch_geometric.data.HeteroData()
-
-    for attr in graph_store.get_all_edge_attrs():
-        key = attr.edge_type
-        if key in row_dict and key in col_dict:
-            data.put_edge_index(
-                (row_dict[key], col_dict[key]),
-                edge_type=key,
-                layout="csc",
-                is_sorted=True,
-            )
-
-    required_attrs = []
-    for attr in feature_store.get_all_tensor_attrs():
-        if attr.group_name in node_dict:
-            attr.index = node_dict[attr.group_name]
-            required_attrs.append(attr)
-            data[attr.group_name].num_nodes = attr.index.size(0)
-
-    tensors = feature_store.multi_get_tensor(required_attrs)
-    for i, attr in enumerate(required_attrs):
-        data[attr.group_name][attr.attr_name] = tensors[i]
-
-    return data
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py
similarity index 92%
rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py
rename to python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py
index c99fd447aa0..0a997a960b8 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py
@@ -12,12 +12,12 @@
 # limitations under the License.
 
 import cugraph
-from cugraph_pyg.data.cugraph_store import (
+from cugraph_pyg.data.dask_graph_store import (
     CuGraphTensorAttr,
     CuGraphEdgeAttr,
     EdgeLayout,
 )
-from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.data import DaskGraphStore
 
 import cudf
 import cupy
@@ -33,6 +33,7 @@
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_tensor_attr():
     ta = CuGraphTensorAttr("group0", "property1")
     assert not ta.is_fully_specified()
@@ -63,6 +64,7 @@ def test_tensor_attr():
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_edge_attr():
     ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10)
     assert ea.edge_type == "type0"
@@ -98,6 +100,7 @@ def single_vertex_graph(request):
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 @pytest.mark.parametrize("edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf"])
+@pytest.mark.sg
 def test_get_edge_index(graph, edge_index_type):
     F, G, N = graph
     if "torch" in edge_index_type:
@@ -113,7 +116,7 @@ def test_get_edge_index(graph, edge_index_type):
             G[et][0] = cudf.Series(G[et][0])
             G[et][1] = cudf.Series(G[et][1])
 
-    cugraph_store = CuGraphStore(F, G, N, order="CSC")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSC")
 
     for pyg_can_edge_type in G:
         src, dst = cugraph_store.get_edge_index(
@@ -129,9 +132,10 @@ def test_get_edge_index(graph, edge_index_type):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_edge_types(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     eta = cugraph_store._edge_types_to_attrs
     assert eta.keys() == G.keys()
@@ -145,9 +149,10 @@ def test_edge_types(graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_subgraph(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     if len(G.keys()) > 1:
         for edge_type in G.keys():
@@ -163,9 +168,10 @@ def test_get_subgraph(graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_renumber_vertices_basic(single_vertex_graph):
     F, G, N = single_vertex_graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     nodes_of_interest = torch.as_tensor(
         cupy.random.randint(0, sum(N.values()), 3), device="cuda"
@@ -176,9 +182,10 @@ def test_renumber_vertices_basic(single_vertex_graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph_1):
     F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     nodes_of_interest = torch.as_tensor(
         cupy.random.randint(0, sum(N.values()), 3), device="cuda"
@@ -196,10 +203,11 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_renumber_edges(abc_graph):
     F, G, N = abc_graph
 
-    graph_store = CuGraphStore(F, G, N, order="CSR")
+    graph_store = DaskGraphStore(F, G, N, order="CSR")
 
     # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
     mock_sampling_results = cudf.DataFrame(
@@ -232,9 +240,10 @@ def test_renumber_edges(abc_graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_tensor(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     for feature_name, feature_on_types in F.get_feature_list().items():
         for type_name in feature_on_types:
@@ -253,9 +262,10 @@ def test_get_tensor(graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_tensor_empty_idx(karate_gnn):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     t = cugraph_store.get_tensor(
         CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
@@ -264,9 +274,10 @@ def test_get_tensor_empty_idx(karate_gnn):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_multi_get_tensor(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     for vertex_type in sorted(N.keys()):
         v_ids = np.arange(N[vertex_type])
@@ -291,9 +302,10 @@ def test_multi_get_tensor(graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_all_tensor_attrs(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     tensor_attrs = []
     for vertex_type in sorted(N.keys()):
@@ -320,20 +332,11 @@ def test_get_all_tensor_attrs(graph):
     )
 
 
-@pytest.mark.skip("not implemented")
-def test_get_tensor_spec_props(graph):
-    raise NotImplementedError("not implemented")
-
-
-@pytest.mark.skip("not implemented")
-def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1):
-    raise NotImplementedError("not implemented")
-
-
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_tensor_from_tensor_attrs(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     tensor_attrs = cugraph_store.get_all_tensor_attrs()
     for tensor_attr in tensor_attrs:
@@ -345,9 +348,10 @@ def test_get_tensor_from_tensor_attrs(graph):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_get_tensor_size(graph):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     tensor_attrs = cugraph_store.get_all_tensor_attrs()
     for tensor_attr in tensor_attrs:
@@ -361,9 +365,10 @@ def test_get_tensor_size(graph):
 @pytest.mark.skipif(
     isinstance(torch_geometric, MissingModule), reason="pyg not available"
 )
+@pytest.mark.sg
 def test_get_input_nodes(karate_gnn):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     input_node_info = torch_geometric.loader.utils.get_input_nodes(
         (cugraph_store, cugraph_store), "type0"
@@ -383,11 +388,12 @@ def test_get_input_nodes(karate_gnn):
     assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist()
 
 
+@pytest.mark.sg
 def test_serialize(multi_edge_multi_vertex_no_graph_1):
     import pickle
 
     F, G, N = multi_edge_multi_vertex_no_graph_1
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     cugraph_store_copy = pickle.loads(pickle.dumps(cugraph_store))
 
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py
similarity index 90%
rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py
rename to python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py
index 85acbebc3ec..65cb8984586 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py
@@ -12,12 +12,12 @@
 # limitations under the License.
 
 import cugraph
-from cugraph_pyg.data.cugraph_store import (
+from cugraph_pyg.data.dask_graph_store import (
     CuGraphTensorAttr,
     CuGraphEdgeAttr,
     EdgeLayout,
 )
-from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.data import DaskGraphStore
 
 import cudf
 import dask_cudf
@@ -101,6 +101,7 @@ def single_vertex_graph(request):
 @pytest.mark.parametrize(
     "edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf", "dask-cudf"]
 )
+@pytest.mark.mg
 def test_get_edge_index(graph, edge_index_type, dask_client):
     F, G, N = graph
     if "torch" in edge_index_type:
@@ -120,7 +121,7 @@ def test_get_edge_index(graph, edge_index_type, dask_client):
             G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1)
             G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1)
 
-    cugraph_store = CuGraphStore(F, G, N, order="CSC", multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, order="CSC", multi_gpu=True)
 
     for pyg_can_edge_type in G:
         src, dst = cugraph_store.get_edge_index(
@@ -143,9 +144,10 @@ def test_get_edge_index(graph, edge_index_type, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_edge_types(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     eta = cugraph_store._edge_types_to_attrs
     assert eta.keys() == G.keys()
@@ -159,9 +161,10 @@ def test_edge_types(graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_subgraph(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     if len(G.keys()) > 1:
         for edge_type in G.keys():
@@ -177,9 +180,10 @@ def test_get_subgraph(graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_renumber_vertices_basic(single_vertex_graph, dask_client):
     F, G, N = single_vertex_graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     nodes_of_interest = torch.as_tensor(
         cupy.random.randint(0, sum(N.values()), 3), device="cuda"
@@ -190,11 +194,12 @@ def test_renumber_vertices_basic(single_vertex_graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_renumber_vertices_multi_edge_multi_vertex(
     multi_edge_multi_vertex_graph_1, dask_client
 ):
     F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     nodes_of_interest = torch.as_tensor(
         cupy.random.randint(0, sum(N.values()), 3), device="cuda"
@@ -212,10 +217,11 @@ def test_renumber_vertices_multi_edge_multi_vertex(
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_renumber_edges(abc_graph, dask_client):
     F, G, N = abc_graph
 
-    graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
 
     # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
     mock_sampling_results = cudf.DataFrame(
@@ -248,9 +254,10 @@ def test_renumber_edges(abc_graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_tensor(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     for feature_name, feature_on_types in F.get_feature_list().items():
         for type_name in feature_on_types:
@@ -269,9 +276,10 @@ def test_get_tensor(graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_tensor_empty_idx(karate_gnn, dask_client):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     t = cugraph_store.get_tensor(
         CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
@@ -280,9 +288,10 @@ def test_get_tensor_empty_idx(karate_gnn, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_multi_get_tensor(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     for vertex_type in sorted(N.keys()):
         v_ids = np.arange(N[vertex_type])
@@ -307,9 +316,10 @@ def test_multi_get_tensor(graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_all_tensor_attrs(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     tensor_attrs = []
     for vertex_type in sorted(N.keys()):
@@ -328,20 +338,11 @@ def test_get_all_tensor_attrs(graph, dask_client):
     )
 
 
-@pytest.mark.skip("not implemented")
-def test_get_tensor_spec_props(graph, dask_client):
-    raise NotImplementedError("not implemented")
-
-
-@pytest.mark.skip("not implemented")
-def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1, dask_client):
-    raise NotImplementedError("not implemented")
-
-
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_tensor_from_tensor_attrs(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     tensor_attrs = cugraph_store.get_all_tensor_attrs()
     for tensor_attr in tensor_attrs:
@@ -353,9 +354,10 @@ def test_get_tensor_from_tensor_attrs(graph, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_get_tensor_size(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     tensor_attrs = cugraph_store.get_all_tensor_attrs()
     for tensor_attr in tensor_attrs:
@@ -369,9 +371,10 @@ def test_get_tensor_size(graph, dask_client):
 @pytest.mark.skipif(
     isinstance(torch_geometric, MissingModule), reason="pyg not available"
 )
+@pytest.mark.mg
 def test_get_input_nodes(karate_gnn, dask_client):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
 
     nodes = torch_geometric.loader.utils.get_input_nodes(
         (cugraph_store, cugraph_store), "type0"
@@ -387,13 +390,15 @@ def test_get_input_nodes(karate_gnn, dask_client):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_mg_frame_handle(graph, dask_client):
     F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
-    assert isinstance(cugraph_store._CuGraphStore__graph._plc_graph, dict)
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
+    assert isinstance(cugraph_store._DaskGraphStore__graph._plc_graph, dict)
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_cugraph_loader_large_index(dask_client):
     large_index = (
         np.random.randint(0, 1_000_000, (100_000_000,)),
@@ -404,7 +409,7 @@ def test_cugraph_loader_large_index(dask_client):
     F = cugraph.gnn.FeatureStore(backend="torch")
     F.add_data(large_features, "N", "f")
 
-    store = CuGraphStore(
+    store = DaskGraphStore(
         F,
         {("N", "e", "N"): large_index},
         {"N": 1_000_000},
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py
new file mode 100644
index 00000000000..ab5f1e217bb
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import TensorDictFeatureStore
+
+torch = import_optional("torch")
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg  # presumably the single-GPU marker; confirm in pytest config
+def test_tensordict_feature_store_basic_api():
+    feature_store = TensorDictFeatureStore()
+
+    node_features_0 = torch.randint(128, (100, 1000))  # random ints in [0, 128)
+    node_features_1 = torch.randint(256, (100, 10))  # random ints in [0, 256)
+
+    other_features = torch.randint(1024, (10, 5))  # different row count than "node"
+
+    feature_store["node", "feat0"] = node_features_0  # two entries under "node"
+    feature_store["node", "feat1"] = node_features_1
+    feature_store["other", "feat"] = other_features  # one entry under "other"
+
+    assert (feature_store["node"]["feat0"][:] == node_features_0).all()  # round-trip
+    assert (feature_store["node"]["feat1"][:] == node_features_1).all()
+    assert (feature_store["other"]["feat"][:] == other_features).all()
+
+    assert len(feature_store.get_all_tensor_attrs()) == 3  # one per stored tensor
+
+    del feature_store["node", "feat0"]  # drop a single (key, name) entry
+    assert len(feature_store.get_all_tensor_attrs()) == 2  # only that one is gone
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py
new file mode 100644
index 00000000000..a8b93665aad
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cugraph.datasets import karate
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import GraphStore
+
+torch = import_optional("torch")
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg  # presumably the single-GPU marker; confirm in pytest config
+def test_graph_store_basic_api():
+    df = karate.get_edgelist()  # edge list with "src"/"dst" columns (read below)
+    src = torch.as_tensor(df["src"], device="cuda")
+    dst = torch.as_tensor(df["dst"], device="cuda")
+
+    ei = torch.stack([dst, src])  # two rows: row 0 = dst, row 1 = src
+
+    graph_store = GraphStore()
+    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")
+
+    rei = graph_store.get_edge_index(("person", "knows", "person"), "coo")
+
+    assert (ei == rei).all()  # stored edge index must come back unchanged
+
+    edge_attrs = graph_store.get_all_edge_attrs()
+    assert len(edge_attrs) == 1  # exactly the one edge type added above
+
+    graph_store.remove_edge_index(("person", "knows", "person"), "coo")
+    edge_attrs = graph_store.get_all_edge_attrs()
+    assert len(edge_attrs) == 0  # removal leaves no edge attrs behind
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py
new file mode 100644
index 00000000000..14540b7e17d
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cugraph.datasets import karate
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import GraphStore
+
+torch = import_optional("torch")
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg  # presumably the multi-GPU marker; confirm in pytest config
+def test_graph_store_basic_api_mg():
+    df = karate.get_edgelist()  # edge list with "src"/"dst" columns (read below)
+    src = torch.as_tensor(df["src"], device="cuda")
+    dst = torch.as_tensor(df["dst"], device="cuda")
+
+    ei = torch.stack([dst, src])  # two rows: row 0 = dst, row 1 = src
+
+    graph_store = GraphStore(is_multi_gpu=True)  # NOTE(review): no dist fixture here
+    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")
+
+    rei = graph_store.get_edge_index(("person", "knows", "person"), "coo")
+
+    assert (ei == rei).all()  # stored edge index must come back unchanged
+
+    edge_attrs = graph_store.get_all_edge_attrs()
+    assert len(edge_attrs) == 1  # exactly the one edge type added above
+
+    graph_store.remove_edge_index(("person", "knows", "person"), "coo")
+    edge_attrs = graph_store.get_all_edge_attrs()
+    assert len(edge_attrs) == 0  # removal leaves no edge attrs behind
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py
similarity index 95%
rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
rename to python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py
index ab20ef01fd3..34ef6a59511 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py
@@ -20,9 +20,9 @@
 import cupy
 import numpy as np
 
-from cugraph_pyg.loader import CuGraphNeighborLoader
+from cugraph_pyg.loader import DaskNeighborLoader
 from cugraph_pyg.loader import BulkSampleLoader
-from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.data import DaskGraphStore
 from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
 
 from cugraph.gnn import FeatureStore
@@ -47,14 +47,15 @@
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_cugraph_loader_basic(
     karate_gnn: Tuple[
         FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
     ]
 ):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
-    loader = CuGraphNeighborLoader(
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
+    loader = DaskNeighborLoader(
         (cugraph_store, cugraph_store),
         torch.arange(N["type0"] + N["type1"], dtype=torch.int64),
         10,
@@ -77,14 +78,15 @@ def test_cugraph_loader_basic(
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_cugraph_loader_hetero(
     karate_gnn: Tuple[
         FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
     ]
 ):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
-    loader = CuGraphNeighborLoader(
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
+    loader = DaskNeighborLoader(
         (cugraph_store, cugraph_store),
         input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
         batch_size=2,
@@ -107,6 +109,7 @@ def test_cugraph_loader_hetero(
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_cugraph_loader_from_disk():
     m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
     n = torch.arange(1, 1 + len(m), dtype=torch.int32)
@@ -118,7 +121,7 @@ def test_cugraph_loader_from_disk():
     G = {("t0", "knows", "t0"): 9080}
     N = {"t0": 256}
 
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
 
     bogus_samples = cudf.DataFrame(
         {
@@ -164,6 +167,7 @@ def test_cugraph_loader_from_disk():
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_cugraph_loader_from_disk_subset():
     m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
     n = torch.arange(1, 1 + len(m), dtype=torch.int32)
@@ -175,7 +179,7 @@ def test_cugraph_loader_from_disk_subset():
     G = {("t0", "knows", "t0"): 9080}
     N = {"t0": 256}
 
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
 
     bogus_samples = cudf.DataFrame(
         {
@@ -223,6 +227,7 @@ def test_cugraph_loader_from_disk_subset():
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 @pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available")
+@pytest.mark.sg
 def test_cugraph_loader_from_disk_subset_csr():
     m = [2, 9, 99, 82, 11, 13]
     n = torch.arange(1, 1 + len(m), dtype=torch.int32)
@@ -234,7 +239,7 @@ def test_cugraph_loader_from_disk_subset_csr():
     G = {("t0", "knows", "t0"): 9080}
     N = {"t0": 256}
 
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     bogus_samples = cudf.DataFrame(
         {
@@ -289,6 +294,7 @@ def test_cugraph_loader_from_disk_subset_csr():
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_cugraph_loader_e2e_coo():
     m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
     x = torch.randint(3000, (256, 256)).to(torch.float32)
@@ -298,7 +304,7 @@ def test_cugraph_loader_e2e_coo():
     G = {("t0", "knows", "t0"): 9999}
     N = {"t0": 256}
 
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
 
     bogus_samples = cudf.DataFrame(
         {
@@ -357,6 +363,7 @@ def test_cugraph_loader_e2e_coo():
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 @pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available")
 @pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"])
+@pytest.mark.sg
 def test_cugraph_loader_e2e_csc(framework: str):
     m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
     x = torch.randint(3000, (256, 256)).to(torch.float32)
@@ -366,7 +373,7 @@ def test_cugraph_loader_e2e_csc(framework: str):
     G = {("t0", "knows", "t0"): 9999}
     N = {"t0": 256}
 
-    cugraph_store = CuGraphStore(F, G, N)
+    cugraph_store = DaskGraphStore(F, G, N)
 
     bogus_samples = cudf.DataFrame(
         {
@@ -461,6 +468,7 @@ def test_cugraph_loader_e2e_csc(framework: str):
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 @pytest.mark.parametrize("drop_last", [True, False])
+@pytest.mark.sg
 def test_drop_last(drop_last):
     N = {"N": 10}
     G = {
@@ -471,9 +479,9 @@ def test_drop_last(drop_last):
     F = FeatureStore(backend="torch")
     F.add_data(torch.arange(10), "N", "z")
 
-    store = CuGraphStore(F, G, N)
+    store = DaskGraphStore(F, G, N)
     with tempfile.TemporaryDirectory() as dir:
-        loader = CuGraphNeighborLoader(
+        loader = DaskNeighborLoader(
             (store, store),
             input_nodes=torch.tensor([0, 1, 2, 3, 4]),
             num_neighbors=[1],
@@ -499,6 +507,7 @@ def test_drop_last(drop_last):
 
 
 @pytest.mark.parametrize("directory", ["local", "temp"])
+@pytest.mark.sg
 def test_load_directory(
     karate_gnn: Tuple[
         FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
@@ -508,8 +517,8 @@ def test_load_directory(
     if directory == "local":
         local_dir = tempfile.TemporaryDirectory(dir=".")
 
-    cugraph_store = CuGraphStore(*karate_gnn)
-    cugraph_loader = CuGraphNeighborLoader(
+    cugraph_store = DaskGraphStore(*karate_gnn)
+    cugraph_loader = DaskNeighborLoader(
         (cugraph_store, cugraph_store),
         torch.arange(8, dtype=torch.int64),
         2,
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py
similarity index 85%
rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py
rename to python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py
index f5035a38621..9e8a85a5b67 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,18 +13,19 @@
 
 import pytest
 
-from cugraph_pyg.loader import CuGraphNeighborLoader
-from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.loader import DaskNeighborLoader
+from cugraph_pyg.data import DaskGraphStore
 from cugraph.utilities.utils import import_optional, MissingModule
 
 torch = import_optional("torch")
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_cugraph_loader_basic(dask_client, karate_gnn):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
-    loader = CuGraphNeighborLoader(
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    loader = DaskNeighborLoader(
         (cugraph_store, cugraph_store),
         torch.arange(N["type0"] + N["type1"], dtype=torch.int64),
         10,
@@ -49,10 +50,11 @@ def test_cugraph_loader_basic(dask_client, karate_gnn):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_cugraph_loader_hetero(dask_client, karate_gnn):
     F, G, N = karate_gnn
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
-    loader = CuGraphNeighborLoader(
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    loader = DaskNeighborLoader(
         (cugraph_store, cugraph_store),
         input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
         batch_size=2,
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py
new file mode 100644
index 00000000000..8edb5276953
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cugraph.datasets import karate
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import TensorDictFeatureStore, GraphStore
+from cugraph_pyg.loader import NeighborLoader
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
+def test_neighbor_loader():
+    """
+    End-to-end smoke test: build single-GPU stores from the karate
+    dataset, sample with NeighborLoader, and check every batch.
+    """
+    edgelist = karate.get_edgelist()
+
+    # Row 0 holds the "dst" column and row 1 the "src" column.
+    edge_index = torch.stack(
+        [
+            torch.as_tensor(edgelist["dst"], device="cuda"),
+            torch.as_tensor(edgelist["src"], device="cuda"),
+        ]
+    )
+
+    feature_store = TensorDictFeatureStore()
+    feature_store["person", "feat"] = torch.randint(128, (34, 16))
+
+    graph_store = GraphStore()
+    graph_store.put_edge_index(edge_index, ("person", "knows", "person"), "coo")
+
+    loader = NeighborLoader(
+        (feature_store, graph_store),
+        [5, 5],
+        input_nodes=torch.arange(34),
+        directory=".",
+    )
+
+    expected_feat = feature_store["person", "feat"]
+    for batch in loader:
+        assert isinstance(batch, torch_geometric.data.Data)
+        assert (expected_feat[batch.n_id] == batch.feat).all()
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py
new file mode 100644
index 00000000000..6a5f46b0940
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import os
+
+from cugraph.datasets import karate
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import TensorDictFeatureStore, GraphStore
+from cugraph_pyg.loader import NeighborLoader
+
+from cugraph.gnn import (
+    cugraph_comms_init,
+    cugraph_comms_shutdown,
+    cugraph_comms_create_unique_id,
+)
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+
+def init_pytorch_worker(rank, world_size, cugraph_id):
+    """
+    Set up one spawned worker process: bind RMM, cupy and torch to GPU
+    ``rank``, join the NCCL process group, and initialize cugraph comms.
+    """
+    import rmm
+
+    rmm.reinitialize(devices=rank)
+
+    import cupy
+
+    cupy.cuda.Device(rank).use()
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
+    from cugraph.testing.mg_utils import enable_spilling
+
+    enable_spilling()
+
+    torch.cuda.set_device(rank)
+
+    # Rendezvous address for torch.distributed.
+    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "12355"})
+    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
+
+
+def run_test_neighbor_loader_mg(rank, uid, world_size, specify_size):
+    """
+    Per-rank body of the multi-GPU e2e test covering loading and sampling.
+    """
+    init_pytorch_worker(rank, world_size, uid)
+
+    edgelist = karate.get_edgelist()
+
+    # Row 0 holds the "dst" column and row 1 the "src" column; each rank
+    # keeps only its own column-wise slice of the edges.
+    edge_index = torch.stack(
+        [
+            torch.as_tensor(edgelist["dst"], device="cuda"),
+            torch.as_tensor(edgelist["src"], device="cuda"),
+        ]
+    )
+    shards = torch.tensor_split(edge_index.clone(), world_size, axis=1)
+
+    graph_store = GraphStore(is_multi_gpu=True)
+    graph_store.put_edge_index(
+        shards[rank],
+        ("person", "knows", "person"),
+        "coo",
+        False,
+        (34, 34) if specify_size else None,
+    )
+
+    feature_store = TensorDictFeatureStore()
+    feature_store["person", "feat"] = torch.randint(128, (34, 16))
+
+    # Seed nodes are likewise split across ranks.
+    seed_shards = torch.tensor_split(torch.arange(34), world_size, axis=0)
+
+    loader = NeighborLoader(
+        (feature_store, graph_store),
+        [5, 5],
+        input_nodes=seed_shards[rank],
+    )
+
+    for batch in loader:
+        assert isinstance(batch, torch_geometric.data.Data)
+        assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all()
+
+    cugraph_comms_shutdown()
+
+
+@pytest.mark.parametrize("specify_size", [True, False])
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
+def test_neighbor_loader_mg(specify_size):
+    """
+    Spawn one worker process per visible GPU and run the e2e test in each.
+    """
+    comms_uid = cugraph_comms_create_unique_id()
+    num_gpus = torch.cuda.device_count()
+
+    torch.multiprocessing.spawn(
+        run_test_neighbor_loader_mg,
+        args=(comms_uid, num_gpus, specify_size),
+        nprocs=num_gpus,
+    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
index a26063f62fa..92d216fefa3 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
@@ -30,6 +30,7 @@
 @pytest.mark.parametrize("max_num_neighbors", [8, None])
 @pytest.mark.parametrize("use_edge_attr", [True, False])
 @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
+@pytest.mark.sg
 def test_gat_conv_equality(
     use_edge_index,
     bias,
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
index a62f2fed2f7..2e221922add 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
@@ -24,6 +24,7 @@
 @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
 @pytest.mark.parametrize("use_edge_attr", [True, False])
 @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
+@pytest.mark.sg
 def test_gatv2_conv_equality(
     use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request
 ):
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
index d8190ea345f..f182869002a 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
@@ -25,6 +25,7 @@
 )
 @pytest.mark.parametrize("heads", [1, 3, 10])
 @pytest.mark.parametrize("aggr", ["sum", "mean"])
+@pytest.mark.sg
 def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads):
     import torch
     from torch_geometric.data import HeteroData
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
index fc0aaf25b7b..8b06cb2e180 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
@@ -25,6 +25,7 @@
 @pytest.mark.parametrize("num_bases", [1, 2, None])
 @pytest.mark.parametrize("root_weight", [True, False])
 @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
+@pytest.mark.sg
 def test_rgcn_conv_equality(
     use_edge_index,
     aggr,
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
index 9d8d413c590..878ceff632a 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
@@ -26,6 +26,7 @@
 @pytest.mark.parametrize("normalize", [True, False])
 @pytest.mark.parametrize("root_weight", [True, False])
 @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
+@pytest.mark.sg
 def test_sage_conv_equality(
     use_edge_index,
     aggr,
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
index 1776b691c87..d207a4d7947 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
@@ -24,6 +24,7 @@
 @pytest.mark.parametrize("concat", [True, False])
 @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
 @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
+@pytest.mark.sg
 def test_transformer_conv_equality(
     use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request
 ):
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py
similarity index 93%
rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
rename to python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py
index ed011a658a9..7659fdc386f 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py
@@ -16,8 +16,8 @@
 
 import pytest
 
-from cugraph_pyg.data import CuGraphStore
-from cugraph_pyg.sampler.cugraph_sampler import (
+from cugraph_pyg.data import DaskGraphStore
+from cugraph_pyg.sampler.sampler_utils import (
     _sampler_output_from_sampling_results_heterogeneous,
 )
 
@@ -29,9 +29,10 @@
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_neighbor_sample(basic_graph_1):
     F, G, N = basic_graph_1
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
 
     batches = cudf.DataFrame(
         {
@@ -88,9 +89,10 @@ def test_neighbor_sample(basic_graph_1):
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):
     F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = CuGraphStore(F, G, N, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
 
     batches = cudf.DataFrame(
         {
@@ -148,10 +150,11 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.sg
 def test_neighbor_sample_mock_sampling_results(abc_graph):
     F, G, N = abc_graph
 
-    graph_store = CuGraphStore(F, G, N, order="CSR")
+    graph_store = DaskGraphStore(F, G, N, order="CSR")
 
     # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
     mock_sampling_results = cudf.DataFrame(
@@ -191,9 +194,3 @@ def test_neighbor_sample_mock_sampling_results(abc_graph):
     assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0]
     assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1]
     assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skip("needs to be written")
-def test_neighbor_sample_renumbered():
-    pass
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py
similarity index 86%
rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
rename to python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py
index 80a2d0a6c79..91e0668b3c1 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -16,8 +16,8 @@
 
 import pytest
 
-from cugraph_pyg.data import CuGraphStore
-from cugraph_pyg.sampler.cugraph_sampler import (
+from cugraph_pyg.data import DaskGraphStore
+from cugraph_pyg.sampler.sampler_utils import (
     _sampler_output_from_sampling_results_heterogeneous,
 )
 
@@ -31,9 +31,10 @@
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_neighbor_sample(dask_client, basic_graph_1):
     F, G, N = basic_graph_1
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
 
     batches = cudf.DataFrame(
         {
@@ -87,18 +88,19 @@ def test_neighbor_sample(dask_client, basic_graph_1):
 
     # check the hop dictionaries
     assert len(out.num_sampled_nodes) == 1
-    assert out.num_sampled_nodes["vt1"].tolist() == [4, 1]
+    assert out.num_sampled_nodes["vt1"] == [4, 1]
 
     assert len(out.num_sampled_edges) == 1
-    assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6]
+    assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6]
 
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 @pytest.mark.skip(reason="broken")
+@pytest.mark.mg
 def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1):
     F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
 
     batches = cudf.DataFrame(
         {
@@ -160,6 +162,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.mg
 def test_neighbor_sample_mock_sampling_results(dask_client):
     N = {
         "A": 2,  # 0, 1
@@ -190,7 +193,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client):
         torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1"
     )
 
-    graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR")
+    graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
 
     # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
     mock_sampling_results = cudf.DataFrame(
@@ -222,17 +225,11 @@ def test_neighbor_sample_mock_sampling_results(dask_client):
     assert out.col[("B", "ba", "A")].tolist() == [1, 1]
 
     assert len(out.num_sampled_nodes) == 3
-    assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0]
-    assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0]
-    assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1]
+    assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0]
+    assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0]
+    assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1]
 
     assert len(out.num_sampled_edges) == 3
-    assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0]
-    assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1]
-    assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skip("needs to be written")
-def test_neighbor_sample_renumbered(dask_client):
-    pass
+    assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0]
+    assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1]
+    assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2]
diff --git a/python/cugraph-pyg/pytest.ini b/python/cugraph-pyg/pytest.ini
index 579b2245308..db99a54ae49 100644
--- a/python/cugraph-pyg/pytest.ini
+++ b/python/cugraph-pyg/pytest.ini
@@ -23,6 +23,8 @@ addopts =
 markers =
           slow: slow-running tests/benchmarks
           cugraph_ops: Tests requiring cugraph-ops
+          mg: Test MG code paths - number of gpu > 1
+          sg: Test SG code paths and dask sg tests - number of gpu == 1
 
 python_classes =
           Bench*
diff --git a/python/cugraph/cugraph/gnn/__init__.py b/python/cugraph/cugraph/gnn/__init__.py
index 1f4d98f0230..b6c8e1981d0 100644
--- a/python/cugraph/cugraph/gnn/__init__.py
+++ b/python/cugraph/cugraph/gnn/__init__.py
@@ -16,6 +16,7 @@
 from .data_loading.dist_sampler import (
     DistSampler,
     DistSampleWriter,
+    DistSampleReader,
     UniformNeighborSampler,
 )
 from .comms.cugraph_nccl_comms import (
diff --git a/python/cugraph/cugraph/gnn/data_loading/__init__.py b/python/cugraph/cugraph/gnn/data_loading/__init__.py
index a50f6085e9a..98c547a0083 100644
--- a/python/cugraph/cugraph/gnn/data_loading/__init__.py
+++ b/python/cugraph/cugraph/gnn/data_loading/__init__.py
@@ -15,5 +15,6 @@
 from cugraph.gnn.data_loading.dist_sampler import (
     DistSampler,
     DistSampleWriter,
+    DistSampleReader,
     UniformNeighborSampler,
 )
diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py
index e57e195a4b8..52638230b9b 100644
--- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py
+++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py
@@ -12,15 +12,18 @@
 # limitations under the License.
 
 import os
+import re
 import warnings
 from math import ceil
+from functools import reduce
 
 import pylibcugraph
 import numpy as np
 import cupy
 import cudf
 
-from typing import Union, List, Dict, Tuple
+from typing import Union, List, Dict, Tuple, Iterator, Optional
+
 from cugraph.utilities import import_optional
 from cugraph.gnn.comms import cugraph_comms_get_raft_handle
 
@@ -32,6 +35,118 @@
 TensorType = Union["torch.Tensor", cupy.ndarray, cudf.Series]
 
 
+class DistSampleReader:
+    """
+    Iterator over the parquet partitions written by a distributed sampler.
+
+    Partition files are discovered by name, using the pattern
+    ``batch=<rank>.<start>-<rank>.<end>.parquet``, and are yielded in
+    ascending order of their starting batch id.
+
+    Parameters
+    ----------
+    directory: str
+        Directory that contains the partition files.
+    format: str (optional, default="parquet")
+        On-disk format.  Only "parquet" is currently supported.
+    rank: int (optional, default=None)
+        If given, only files whose name carries this rank are read, and
+        the batch count is reduced to the minimum across all ranks
+        using torch.distributed.
+    filelist: iterable of str (optional, default=None)
+        Names of the files (relative to ``directory``) to read.  If not
+        given, the contents of ``directory`` are listed instead.
+    """
+
+    def __init__(
+        self,
+        directory: str,
+        *,
+        format: str = "parquet",
+        rank: Optional[int] = None,
+        filelist=None,
+    ):
+        if format != "parquet":
+            raise ValueError("Invalid format (currently supported: 'parquet')")
+
+        self.__format = format
+        self.__directory = directory
+
+        # A caller-supplied file list goes through the same parsing as a
+        # directory listing, so the batch count is always defined and
+        # __next__ always receives match objects.
+        files = os.listdir(directory) if filelist is None else list(filelist)
+
+        # Match the whole name so that files that merely start with a
+        # partition name (e.g. temporary files) are ignored.
+        ex = re.compile(r"batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet")
+        filematch = [m for m in map(ex.fullmatch, files) if m]
+
+        if rank is not None:
+            filematch = [m for m in filematch if int(m[1]) == rank]
+
+        # Batch ids in a file name are inclusive on both ends.
+        batch_count = sum(int(m[4]) - int(m[2]) + 1 for m in filematch)
+
+        # Sorted in descending order of starting batch id because
+        # __next__ pops from the end of the list.
+        self.__files = sorted(filematch, key=lambda m: int(m[2]), reverse=True)
+
+        if rank is None:
+            self.__batch_count = batch_count
+        else:
+            # Use the smallest batch count found on any rank (presumably
+            # so that all ranks iterate over the same number of batches).
+            batch_count = torch.tensor([batch_count], device="cuda")
+            torch.distributed.all_reduce(batch_count, torch.distributed.ReduceOp.MIN)
+            self.__batch_count = int(batch_count)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Read the next partition file.
+
+        Returns
+        -------
+        Tuple of (tensors, start_inclusive, end_inclusive), where tensors
+        maps each non-empty column name to a torch tensor on the GPU, and
+        the two integers are the first and (possibly clipped) last batch id.
+
+        Raises
+        ------
+        StopIteration
+            When all files have been read.
+        """
+        if len(self.__files) == 0:
+            raise StopIteration
+
+        f = self.__files.pop()
+        fname = f[0]
+        start_inclusive = int(f[2])
+        end_inclusive = int(f[4])
+
+        # Clip the last batch id so that no more than the remaining
+        # number of batches is reported.
+        if (end_inclusive - start_inclusive + 1) > self.__batch_count:
+            end_inclusive = start_inclusive + self.__batch_count - 1
+            self.__batch_count = 0
+        else:
+            self.__batch_count -= end_inclusive - start_inclusive + 1
+
+        df = cudf.read_parquet(os.path.join(self.__directory, fname))
+        tensors = {}
+        for col in list(df.columns):
+            s = df[col].dropna()
+            if len(s) > 0:
+                tensors[col] = torch.as_tensor(s, device="cuda")
+            # Drop each converted column (presumably to free its memory).
+            df.drop(col, axis=1, inplace=True)
+
+        return tensors, start_inclusive, end_inclusive
+
+
 class DistSampleWriter:
     def __init__(
         self,
@@ -72,6 +142,26 @@ def _directory(self):
     def _batches_per_partition(self):
         return self.__batches_per_partition
 
+    def get_reader(
+        self, rank: int
+    ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]:
+        """
+        Returns an iterator over sampled data.
+
+        Parameters
+        ----------
+        rank: int
+            Forwarded to DistSampleReader as its ``rank`` argument.
+
+        Returns
+        -------
+        A DistSampleReader constructed from this writer's directory
+        and format.
+        """
+
+        # currently only disk reading is supported
+        return DistSampleReader(self._directory, format=self._format, rank=rank)
+
     def __write_minibatches_coo(self, minibatch_dict):
         has_edge_ids = minibatch_dict["edge_id"] is not None
         has_edge_types = minibatch_dict["edge_type"] is not None
@@ -166,10 +246,125 @@ def __write_minibatches_coo(self, minibatch_dict):
             )
 
     def __write_minibatches_csr(self, minibatch_dict):
+        """
+        Writes the sampled minibatches in minibatch_dict, which must hold
+        compressed ("major_offsets"/"minors") output, to parquet files in
+        this writer's directory.  One file is written per partition of
+        up to batches_per_partition batches.
+
+        Raises
+        ------
+        ValueError
+            If minibatch_dict["renumber_map"] is None.
+        """
-        raise NotImplementedError(
-            "CSR format currently not supported for distributed sampling"
+        has_edge_ids = minibatch_dict["edge_id"] is not None
+        has_edge_types = minibatch_dict["edge_type"] is not None
+        has_weights = minibatch_dict["weight"] is not None
+
+        if minibatch_dict["renumber_map"] is None:
+            raise ValueError(
+                "Distributed sampling without renumbering is not supported"
+            )
+
+        # Quit if there are no batches to write.
+        if len(minibatch_dict["batch_id"]) == 0:
+            return
+
+        # Number of label_hop_offsets entries per batch, not counting
+        # the final offset.
+        fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len(
+            minibatch_dict["batch_id"]
         )
 
+        # Each iteration writes one partition file.
+        for p in range(
+            0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition))
+        ):
+            partition_start = p * (self.__batches_per_partition)
+            partition_end = (p + 1) * (self.__batches_per_partition)
+
+            label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][
+                partition_start * fanout_length : partition_end * fanout_length + 1
+            ]
+
+            batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end]
+            start_batch_id = batch_id_array_p[0]
+
+            # major offsets and minors
+            (
+                major_offsets_start_incl,
+                major_offsets_end_incl,
+            ) = label_hop_offsets_array_p[[0, -1]]
+
+            start_ix, end_ix = minibatch_dict["major_offsets"][
+                [major_offsets_start_incl, major_offsets_end_incl]
+            ]
+
+            major_offsets_array_p = minibatch_dict["major_offsets"][
+                major_offsets_start_incl : major_offsets_end_incl + 1
+            ]
+
+            minors_array_p = minibatch_dict["minors"][start_ix:end_ix]
+            # Properties that are absent are written as empty arrays.
+            edge_id_array_p = (
+                minibatch_dict["edge_id"][start_ix:end_ix]
+                if has_edge_ids
+                else cupy.array([], dtype="int64")
+            )
+            edge_type_array_p = (
+                minibatch_dict["edge_type"][start_ix:end_ix]
+                if has_edge_types
+                else cupy.array([], dtype="int32")
+            )
+            weight_array_p = (
+                minibatch_dict["weight"][start_ix:end_ix]
+                if has_weights
+                else cupy.array([], dtype="float32")
+            )
+
+            # create the renumber map offsets
+            renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][
+                partition_start : partition_end + 1
+            ]
+
+            renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[
+                [0, -1]
+            ]
+
+            renumber_map_array_p = minibatch_dict["renumber_map"][
+                renumber_map_start_ix:renumber_map_end_ix
+            ]
+
+            results_dataframe_p = create_df_from_disjoint_arrays(
+                {
+                    "major_offsets": major_offsets_array_p,
+                    "minors": minors_array_p,
+                    "map": renumber_map_array_p,
+                    "label_hop_offsets": label_hop_offsets_array_p,
+                    "weight": weight_array_p,
+                    "edge_id": edge_id_array_p,
+                    "edge_type": edge_type_array_p,
+                    "renumber_map_offsets": renumber_map_offsets_array_p,
+                }
+            )
+
+            # The end batch id is inclusive and assumes consecutive batch ids.
+            end_batch_id = start_batch_id + len(batch_id_array_p) - 1
+            rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0
+
+            full_output_path = os.path.join(
+                self.__directory,
+                f"batch={rank:05d}.{start_batch_id:08d}-"
+                f"{rank:05d}.{end_batch_id:08d}.parquet",
+            )
+
+            results_dataframe_p.to_parquet(
+                full_output_path,
+                compression=None,
+                index=False,
+                force_nullable_schema=True,
+            )
+
     def write_minibatches(self, minibatch_dict):
         if (minibatch_dict["majors"] is not None) and (
             minibatch_dict["minors"] is not None
@@ -188,8 +367,8 @@ def __init__(
         self,
         graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph],
         writer: DistSampleWriter,
-        local_seeds_per_call: int = 32768,
-        retain_original_seeds: bool = False,  # TODO See #4329, needs C API
+        local_seeds_per_call: int,
+        retain_original_seeds: bool = False,
     ):
         """
         Parameters
@@ -199,14 +378,16 @@ def __init__(
         writer: DistSampleWriter (required)
             The writer responsible for writing samples to disk
             or, in the future, device or host memory.
-        local_seeds_per_call: int (optional, default=32768)
+        local_seeds_per_call: int
             The number of seeds on this rank this sampler will
             process in a single sampling call.  Batches will
             get split into multiple sampling calls based on
             this parameter.  This parameter must
             be the same across all ranks.  The total number
             of seeds processed per sampling call is this
-            parameter times the world size.
+            parameter times the world size. Subclasses should
+            generally calculate the appropriate number of
+            seeds.
         retain_original_seeds: bool (optional, default=False)
             Whether to retain the original seeds even if they
             do not appear in the output minibatch.  This will
@@ -219,6 +400,21 @@ def __init__(
         self.__handle = None
         self.__retain_original_seeds = retain_original_seeds
 
+    def get_reader(self) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]:
+        """
+        Returns an iterator over sampled data, obtained from this
+        sampler's writer.
+
+        When the sampler is multi-GPU, the torch.distributed rank of the
+        calling process is passed to the writer; otherwise None is passed.
+        """
+        if self.is_multi_gpu:
+            rank = torch.distributed.get_rank()
+        else:
+            rank = None
+
+        return self.__writer.get_reader(rank)
+
     def sample_batches(
         self,
         seeds: TensorType,
@@ -438,13 +626,6 @@ def sample_from_nodes(
                 : len(current_seeds)
             ]
 
-            # Handle the case where not all ranks have the same number of call groups,
-            # in which case there will be some empty groups that get submitted on the
-            # ranks with fewer call groups.
-            label_start, label_end = (
-                current_batches[[0, -1]] if len(current_batches) > 0 else (0, -1)
-            )
-
             minibatch_dict = self.sample_batches(
                 seeds=current_seeds,
                 batch_ids=current_batches,
@@ -482,12 +663,20 @@ def _retain_original_seeds(self):
 
 
 class UniformNeighborSampler(DistSampler):
+    # Estimated output minibatch vertices per byte of GPU
+    # memory, based on benchmarking.
+    BASE_VERTICES_PER_BYTE = 0.1107662486009992
+
+    # Default number of seeds if the output minibatch
+    # size can't be estimated.
+    UNKNOWN_VERTICES_DEFAULT = 32768
+
     def __init__(
         self,
         graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph],
         writer: DistSampleWriter,
         *,
-        local_seeds_per_call: int = 32768,
+        local_seeds_per_call: Optional[int] = None,
         retain_original_seeds: bool = False,
         fanout: List[int] = [-1],
         prior_sources_behavior: str = "exclude",
@@ -496,12 +685,6 @@ def __init__(
         compress_per_hop: bool = False,
         with_replacement: bool = False,
     ):
-        super().__init__(
-            graph,
-            writer,
-            local_seeds_per_call=local_seeds_per_call,
-            retain_original_seeds=retain_original_seeds,
-        )
         self.__fanout = fanout
         self.__prior_sources_behavior = prior_sources_behavior
         self.__deduplicate_sources = deduplicate_sources
@@ -509,6 +692,42 @@ def __init__(
         self.__compression = compression
         self.__with_replacement = with_replacement
 
+        super().__init__(
+            graph,
+            writer,
+            local_seeds_per_call=self.__calc_local_seeds_per_call(local_seeds_per_call),
+            retain_original_seeds=retain_original_seeds,
+        )
+
+    def __calc_local_seeds_per_call(self, local_seeds_per_call: Optional[int] = None):
+        """
+        Returns the number of seeds to process per sampling call.
+
+        An explicitly given value is returned unchanged.  Otherwise the
+        value is estimated from the total memory of device 0 and the
+        product of the fanout values; if any fanout value is not
+        positive, UNKNOWN_VERTICES_DEFAULT is returned instead.
+        """
+        if local_seeds_per_call is not None:
+            return local_seeds_per_call
+
+        if any(x <= 0 for x in self.__fanout):
+            return UniformNeighborSampler.UNKNOWN_VERTICES_DEFAULT
+
+        total_memory = torch.cuda.get_device_properties(0).total_memory
+        fanout_prod = reduce(lambda x, y: x * y, self.__fanout)
+
+        # Never return 0 seeds, which a very large fanout product would
+        # otherwise produce after truncation.
+        return max(
+            1,
+            int(
+                UniformNeighborSampler.BASE_VERTICES_PER_BYTE
+                * total_memory
+                / fanout_prod
+            ),
+        )
+
     def sample_batches(
         self,
         seeds: TensorType,
@@ -526,12 +731,17 @@ def sample_batches(
                 local_label_list, assume_equal_input_size=assume_equal_input_size
             )
 
-            # TODO add calculation of seed vertex label offsets
             if self._retain_original_seeds:
-                warnings.warn(
-                    "The 'retain_original_seeds` parameter is currently ignored "
-                    "since seed retention is not implemented yet."
+                label_offsets = torch.concat(
+                    [
+                        torch.searchsorted(batch_ids, local_label_list),
+                        torch.tensor(
+                            [batch_ids.shape[0]], device="cuda", dtype=torch.int64
+                        ),
+                    ]
                 )
+            else:
+                label_offsets = None
 
             sampling_results_dict = pylibcugraph.uniform_neighbor_sample(
                 self._resource_handle,
@@ -542,7 +752,7 @@ def sample_batches(
                 label_to_output_comm_rank=cupy.asarray(label_to_output_comm_rank),
                 h_fan_out=np.array(self.__fanout, dtype="int32"),
                 with_replacement=self.__with_replacement,
-                do_expensive_check=False,
+                do_expensive_check=True,
                 with_edge_properties=True,
                 random_state=random_state + rank,
                 prior_sources_behavior=self.__prior_sources_behavior,
@@ -551,10 +761,28 @@ def sample_batches(
                 renumber=True,
                 compression=self.__compression,
                 compress_per_hop=self.__compress_per_hop,
+                retain_seeds=self._retain_original_seeds,
+                label_offsets=None
+                if label_offsets is None
+                else cupy.asarray(label_offsets),
                 return_dict=True,
             )
             sampling_results_dict["rank"] = rank
         else:
+            if self._retain_original_seeds:
+                batch_ids = batch_ids.to(device="cuda", dtype=torch.int32)
+                local_label_list = torch.unique(batch_ids)
+                label_offsets = torch.concat(
+                    [
+                        torch.searchsorted(batch_ids, local_label_list),
+                        torch.tensor(
+                            [batch_ids.shape[0]], device="cuda", dtype=torch.int64
+                        ),
+                    ]
+                )
+            else:
+                label_offsets = None
+
             sampling_results_dict = pylibcugraph.uniform_neighbor_sample(
                 self._resource_handle,
                 self._graph,
@@ -571,6 +799,8 @@ def sample_batches(
                 renumber=True,
                 compression=self.__compression,
                 compress_per_hop=self.__compress_per_hop,
+                retain_seeds=self._retain_original_seeds,
+                label_offsets=cupy.asarray(label_offsets),
                 return_dict=True,
             )
 

From 1667f7a43bcc0e9289fa91db2a56ed8907ec92c6 Mon Sep 17 00:00:00 2001
From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com>
Date: Thu, 30 May 2024 15:40:17 -0700
Subject: [PATCH 23/23] [FEA] New WholeGraph Feature Store for PyG (#4432)

Reimplements the WG feature store for PyG using the `FeatureStore` interface.
Merge after #4384

Closes rapidsai/wholegraph#47
Closes #4399

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Tingyu Wang (https://github.com/tingyu66)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Tingyu Wang (https://github.com/tingyu66)
  - Vibhu Jawa (https://github.com/VibhuJawa)
  - Brad Rees (https://github.com/BradReesWork)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/4432
---
 ci/build_wheel.sh                             |   2 +-
 dependencies.yaml                             |  30 +-
 .../api_docs/cugraph-pyg/cugraph_pyg.rst      |   1 +
 python/cugraph-dgl/pyproject.toml             |   1 +
 .../cugraph-pyg/cugraph_pyg/data/__init__.py  |   5 +-
 .../cugraph_pyg/data/feature_store.py         | 147 ++++++
 .../cugraph_pyg/examples/gcn_dist_mnmg.py     | 434 ++++++++++++++++++
 .../cugraph_pyg/examples/gcn_dist_sg.py       | 269 ++++++-----
 .../tests/data/test_feature_store_mg.py       |  85 ++++
 python/cugraph-pyg/pyproject.toml             |   1 +
 python/cugraph/pyproject.toml                 |   1 +
 11 files changed, 855 insertions(+), 121 deletions(-)
 create mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py
 create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 587c5fb38e7..c980ed320dc 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -40,7 +40,7 @@ if ! rapids-is-release-build; then
     alpha_spec=',>=0.0.0a0'
 fi
 
-for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibraft ucx-py; do
+for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibwholegraph pylibraft ucx-py; do
     sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
 done
 
diff --git a/dependencies.yaml b/dependencies.yaml
index 3c2622fde9f..19634420520 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -20,6 +20,7 @@ files:
       - depends_on_pylibraft
       - depends_on_raft_dask
       - depends_on_pylibcugraphops
+      - depends_on_pylibwholegraph
       - depends_on_cupy
       - python_run_cugraph
       - python_run_nx_cugraph
@@ -60,6 +61,7 @@ files:
     includes:
       - cuda_version
       - depends_on_cudf
+      - depends_on_pylibwholegraph
       - py_version
       - test_python_common
       - test_python_cugraph
@@ -98,6 +100,7 @@ files:
     includes:
       - test_python_common
       - test_python_cugraph
+      - depends_on_pylibwholegraph
   py_build_pylibcugraph:
     output: pyproject
     pyproject_dir: python/pylibcugraph
@@ -175,6 +178,7 @@ files:
       key: test
     includes:
       - test_python_common
+      - depends_on_pylibwholegraph
   py_build_cugraph_pyg:
     output: pyproject
     pyproject_dir: python/cugraph-pyg
@@ -198,6 +202,7 @@ files:
       key: test
     includes:
       - test_python_common
+      - depends_on_pylibwholegraph
   py_build_cugraph_equivariant:
     output: pyproject
     pyproject_dir: python/cugraph-equivariant
@@ -535,9 +540,7 @@ dependencies:
           - *numpy
           - python-louvain
           - scikit-learn>=0.23.1
-      - output_types: [conda]
-        packages:
-          - pylibwholegraph==24.6.*
+
   test_python_pylibcugraph:
     common:
       - output_types: [conda, pyproject]
@@ -568,6 +571,27 @@ dependencies:
           - tensordict>=0.1.2
           - pyg>=2.5,<2.6
 
+  depends_on_pylibwholegraph:
+    common:
+      - output_types: conda
+        packages:
+          - &pylibwholegraph_conda pylibwholegraph==24.6.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - pylibwholegraph-cu12==24.6.*
+          - matrix: {cuda: "11.*"}
+            packages:
+              - pylibwholegraph-cu11==24.6.*
+          - {matrix: null, packages: [*pylibwholegraph_conda]}
+
   depends_on_rmm:
     common:
       - output_types: conda
diff --git a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
index 5475fd6c581..d2b1d124ccb 100644
--- a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
+++ b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
@@ -20,6 +20,7 @@ Feature Storage
    :toctree: ../api/cugraph-pyg/
 
    cugraph_pyg.data.feature_store.TensorDictFeatureStore
+   cugraph_pyg.data.feature_store.WholeFeatureStore
 
 Data Loaders
 ------------
diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index 37ea8b850bd..534106eb87f 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
 [project.optional-dependencies]
 test = [
     "pandas",
+    "pylibwholegraph==24.6.*",
     "pytest",
     "pytest-benchmark",
     "pytest-cov",
diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
index 4c6f267410d..6d51fd5ea01 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
@@ -15,7 +15,10 @@
 
 from cugraph_pyg.data.dask_graph_store import DaskGraphStore
 from cugraph_pyg.data.graph_store import GraphStore
-from cugraph_pyg.data.feature_store import TensorDictFeatureStore
+from cugraph_pyg.data.feature_store import (
+    TensorDictFeatureStore,
+    WholeFeatureStore,
+)
 
 
 def CuGraphStore(*args, **kwargs):
diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
index 42dda42a9e1..0adef9f9135 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
@@ -20,6 +20,7 @@
 torch = import_optional("torch")
 torch_geometric = import_optional("torch_geometric")
 tensordict = import_optional("tensordict")
+wgth = import_optional("pylibwholegraph.torch")
 
 
 class TensorDictFeatureStore(
@@ -127,3 +128,149 @@ def get_all_tensor_attrs(
                 )
 
         return attrs
+
+
+class WholeFeatureStore(
+    object
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.data.FeatureStore
+):
+    """
+    A basic implementation of the PyG FeatureStore interface that stores
+    feature data in WholeGraph WholeMemory.  This type of feature store is
+    distributed, and avoids data replication across workers.
+
+    Data should be sliced before being passed into this feature store.
+    That means each worker should have its own partition.
+    """
+
+    def __init__(self, memory_type="distributed", location="cpu"):
+        """
+        Parameters
+        ----------
+        memory_type: str (optional, default='distributed')
+            The memory type of this store.
+        location: str (optional, default='cpu')
+            The location ('cpu' or 'cuda') where data is stored.
+        """
+        super().__init__()
+
+        self.__features = {}
+
+        self.__wg_comm = wgth.get_local_node_communicator()
+        self.__wg_type = memory_type
+        self.__wg_location = location
+
+    def _put_tensor(
+        self,
+        tensor: "torch_geometric.typing.FeatureTensorType",
+        attr: "torch_geometric.data.feature_store.TensorAttr",
+    ) -> bool:
+        """Collectively create a WholeMemory tensor and scatter this worker's rows."""
+        wg_comm_obj = self.__wg_comm
+
+        if attr.is_set("index"):
+            if (attr.group_name, attr.attr_name) in self.__features:
+                raise NotImplementedError(
+                    "Updating an embedding from an index"
+                    " is not supported by WholeGraph."
+                )
+            warnings.warn(
+                "Ignoring index parameter "
+                f"(attribute does not exist for group {attr.group_name})"
+            )
+
+        if len(tensor.shape) > 2:
+            raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")
+
+        rank = torch.distributed.get_rank()
+        world_size = torch.distributed.get_world_size()
+
+        # Gather every worker's row count to size the global embedding.
+        ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64)
+        sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64)
+        torch.distributed.all_gather_into_tensor(sizes, ld)
+
+        sizes = sizes.cpu()
+        ld = sizes.sum()
+
+        # td < 0 marks a 1-D input, which is stored as a single column.
+        td = -1 if len(tensor.shape) == 1 else tensor.shape[1]
+        global_shape = [int(ld), td if td > 0 else 1]
+
+        if td < 0:
+            tensor = tensor.reshape((tensor.shape[0], 1))
+
+        wg_embedding = wgth.create_wholememory_tensor(
+            wg_comm_obj,
+            self.__wg_type,
+            self.__wg_location,
+            global_shape,
+            tensor.dtype,
+            [global_shape[1], 1],
+        )
+
+        # This worker's rows start after those of all lower ranks.
+        offset = sizes[:rank].sum() if rank > 0 else 0
+
+        wg_embedding.scatter(
+            tensor.clone(memory_format=torch.contiguous_format).cuda(),
+            torch.arange(
+                offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda"
+            ).contiguous(),
+        )
+
+        wg_comm_obj.barrier()
+
+        self.__features[attr.group_name, attr.attr_name] = (wg_embedding, td)
+        return True
+
+    def _get_tensor(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> Optional["torch_geometric.typing.FeatureTensorType"]:
+        """Gather the requested rows, or return None for an unknown attribute."""
+        if (attr.group_name, attr.attr_name) not in self.__features:
+            return None
+
+        emb, td = self.__features[attr.group_name, attr.attr_name]
+
+        # Without an index, gather every row of the embedding.
+        if attr.index is None or (not attr.is_set("index")):
+            attr.index = torch.arange(emb.shape[0], dtype=torch.int64)
+
+        attr.index = attr.index.cuda()
+        t = emb.gather(attr.index, force_dtype=emb.dtype)
+
+        # Undo the single-column storage used for 1-D tensors.
+        if td < 0:
+            t = t.reshape((t.shape[0],))
+
+        return t
+
+    def _remove_tensor(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> bool:
+        """Drop the stored entry; return False if there was none."""
+        if (attr.group_name, attr.attr_name) not in self.__features:
+            return False
+        del self.__features[attr.group_name, attr.attr_name]
+        return True
+
+    def _get_tensor_size(
+        self, attr: "torch_geometric.data.feature_store.TensorAttr"
+    ) -> Tuple:
+        """Return the full shape of the tensor stored under attr's group and name."""
+        # Entries are (embedding, td) pairs.  td < 0 marks a tensor that was
+        # 1-D when stored, so report it without the storage column.
+        emb, td = self.__features[attr.group_name, attr.attr_name]
+        return (emb.shape[0],) if td < 0 else tuple(emb.shape)
+
+    def get_all_tensor_attrs(
+        self,
+    ) -> List["torch_geometric.data.feature_store.TensorAttr"]:
+        """List a TensorAttr (group and attribute name) for every stored entry."""
+        attr_cls = torch_geometric.data.feature_store.TensorAttr
+        return [
+            attr_cls(group_name=group_name, attr_name=attr_name)
+            for (group_name, attr_name) in self.__features
+        ]
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py
new file mode 100644
index 00000000000..be6447208ce
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py
@@ -0,0 +1,434 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Multi-node, multi-GPU example with WholeGraph feature storage.
+# Can be run with torchrun.
+
+import argparse
+import os
+import warnings
+import tempfile
+import time
+import json
+
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from ogb.nodeproppred import PygNodePropPredDataset
+from torch.nn.parallel import DistributedDataParallel
+
+import torch_geometric
+
+from cugraph.gnn import (
+    cugraph_comms_init,
+    cugraph_comms_shutdown,
+    cugraph_comms_create_unique_id,
+)
+
+from pylibwholegraph.torch.initialize import (
+    init as wm_init,
+    finalize as wm_finalize,
+)
+
+# Allow computation on objects that are larger than GPU memory
+# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
+os.environ["CUDF_SPILL"] = "1"
+
+# Ensures that a CUDA context is not created on import of rapids.
+# Allows pytorch to create the context instead
+os.environ["RAPIDS_NO_INITIALIZE"] = "1"
+
+
+def init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id):
+    import rmm
+    # Reinitialize RMM for this worker's GPU with a managed-memory pool.
+    rmm.reinitialize(
+        devices=local_rank,
+        managed_memory=True,
+        pool_allocator=True,
+    )
+
+    import cupy
+
+    cupy.cuda.Device(local_rank).use()
+    from rmm.allocators.cupy import rmm_cupy_allocator
+    # Make cupy allocate through RMM.
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
+    from cugraph.testing.mg_utils import enable_spilling
+
+    enable_spilling()
+
+    torch.cuda.set_device(local_rank)
+    # Start cuGraph comms for this rank using the unique id passed in.
+    cugraph_comms_init(
+        rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank
+    )
+    # Initialize WholeGraph with the ranks and the number of visible GPUs.
+    wm_init(global_rank, world_size, local_rank, torch.cuda.device_count())
+
+
+def partition_data(
+    dataset, split_idx, edge_path, feature_path, label_path, meta_path, world_size=None
+):
+    """
+    Split the dataset into one partition per worker and save it to disk.
+
+    For every rank ``r`` this writes the r-th slice (along dim 1) of the edge
+    index under ``edge_path``, the r-th slices of ``x`` and ``y`` under
+    ``feature_path``, and the r-th slice of each split under ``label_path``.
+    The class, feature and node counts are written to ``meta_path`` as JSON.
+
+    Parameters
+    ----------
+    dataset
+        Dataset whose first element provides ``edge_index``, ``x`` and ``y``.
+    split_idx: dict
+        Maps each split name to the tensor to partition for that split.
+    world_size: int (optional, default=None)
+        Number of partitions.  Defaults to the world size of the default
+        torch.distributed process group.
+    """
+    if world_size is None:
+        world_size = dist.get_world_size()
+
+    data = dataset[0]
+
+    # Split and save edge index
+    os.makedirs(edge_path, exist_ok=True)
+    for r, e in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)):
+        torch.save(e.clone(), os.path.join(edge_path, f"rank={r}.pt"))
+
+    # Split and save features
+    os.makedirs(feature_path, exist_ok=True)
+    for r, f in enumerate(torch.tensor_split(data.x, world_size)):
+        torch.save(f.clone(), os.path.join(feature_path, f"rank={r}_x.pt"))
+    for r, f in enumerate(torch.tensor_split(data.y, world_size)):
+        torch.save(f.clone(), os.path.join(feature_path, f"rank={r}_y.pt"))
+
+    # Split and save labels
+    os.makedirs(label_path, exist_ok=True)
+    for d, i in split_idx.items():
+        for r, i_part in enumerate(torch.tensor_split(i, world_size)):
+            rank_path = os.path.join(label_path, f"rank={r}")
+            os.makedirs(rank_path, exist_ok=True)
+            torch.save(i_part, os.path.join(rank_path, f"{d}.pt"))
+
+    # Save metadata
+    meta = {
+        "num_classes": int(dataset.num_classes),
+        "num_features": int(dataset.num_features),
+        "num_nodes": int(data.num_nodes),
+    }
+    with open(meta_path, "w") as f:
+        json.dump(meta, f)
+
+
+def load_partitioned_data(
+    rank, edge_path, feature_path, label_path, meta_path, wg_mem_type
+):
+    """
+    Load this rank's partition and return
+    ``((feature_store, graph_store), split_idx, meta)``.
+    """
+    from cugraph_pyg.data import GraphStore, WholeFeatureStore
+
+    graph_store = GraphStore(is_multi_gpu=True)
+    feature_store = WholeFeatureStore(memory_type=wg_mem_type)
+
+    # Load metadata
+    with open(meta_path, "r") as meta_file:
+        meta = json.load(meta_file)
+
+    # Load labels
+    split_idx = {
+        split: torch.load(os.path.join(label_path, f"rank={rank}", f"{split}.pt"))
+        for split in ("train", "test", "valid")
+    }
+
+    # Load features
+    x_path = os.path.join(feature_path, f"rank={rank}_x.pt")
+    y_path = os.path.join(feature_path, f"rank={rank}_y.pt")
+    feature_store["node", "x"] = torch.load(x_path)
+    feature_store["node", "y"] = torch.load(y_path)
+
+    # Load edge index
+    eix = torch.load(os.path.join(edge_path, f"rank={rank}.pt"))
+    num_nodes = meta["num_nodes"]
+    graph_store[("node", "rel", "node"), "coo", False, (num_nodes, num_nodes)] = eix
+
+    return (feature_store, graph_store), split_idx, meta
+
+
+def run_train(
+    global_rank,
+    data,
+    split_idx,
+    world_size,
+    device,
+    model,
+    epochs,
+    batch_size,
+    fan_out,
+    num_classes,
+    wall_clock_start,
+    tempdir=None,
+    num_layers=3,
+):
+    """
+    Train the model with neighbor sampling, then report test accuracy.
+
+    Builds train, test and validation ``NeighborLoader`` instances that
+    write to per-rank subdirectories of ``tempdir``, runs ``epochs``
+    training epochs with a validation pass after each, evaluates on the
+    test loader, and finally shuts down WholeGraph and the cuGraph comms.
+    ``tempdir`` must be an existing directory despite its ``None`` default.
+    """
+    # NOTE(review): lr is hard-coded; the script's --lr option never reaches here.
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
+
+    kwargs = dict(num_neighbors=[fan_out] * num_layers, batch_size=batch_size)
+    # Set Up Neighbor Loading
+    from cugraph_pyg.loader import NeighborLoader
+
+    ix_train = split_idx["train"].cuda()
+    train_path = os.path.join(tempdir, f"train_{global_rank}")
+    os.mkdir(train_path)
+    train_loader = NeighborLoader(
+        data,
+        input_nodes=ix_train,
+        directory=train_path,
+        shuffle=True,
+        drop_last=True,
+        **kwargs,
+    )
+
+    ix_test = split_idx["test"].cuda()
+    test_path = os.path.join(tempdir, f"test_{global_rank}")
+    os.mkdir(test_path)
+    test_loader = NeighborLoader(
+        data,
+        input_nodes=ix_test,
+        directory=test_path,
+        shuffle=True,
+        drop_last=True,
+        local_seeds_per_call=80000,
+        **kwargs,
+    )
+
+    ix_valid = split_idx["valid"].cuda()
+    valid_path = os.path.join(tempdir, f"valid_{global_rank}")
+    os.mkdir(valid_path)
+    valid_loader = NeighborLoader(
+        data,
+        input_nodes=ix_valid,
+        directory=valid_path,
+        shuffle=True,
+        drop_last=True,
+        **kwargs,
+    )
+
+    dist.barrier()
+
+    eval_steps = 1000
+    warmup_steps = 20
+    dist.barrier()
+    torch.cuda.synchronize()
+
+    if global_rank == 0:
+        prep_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total time before training begins (prep_time) =", prep_time, "seconds")
+        print("Beginning training...")
+
+    for epoch in range(epochs):
+        # Reset per epoch; start stays None until warmup_steps batches have run.
+        start = None
+        i = -1  # keeps nb defined when the loader yields nothing
+        for i, batch in enumerate(train_loader):
+            if i == warmup_steps:
+                torch.cuda.synchronize()
+                start = time.time()
+
+            batch = batch.to(device)
+            batch_size = batch.batch_size
+
+            batch.y = batch.y.view(-1).to(torch.long)
+            optimizer.zero_grad()
+            out = model(batch.x, batch.edge_index)
+            # Only the first batch_size output rows are scored against labels.
+            loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size])
+            loss.backward()
+            optimizer.step()
+            if global_rank == 0 and i % 10 == 0:
+                print(f"Epoch: {epoch}, Iteration: {i}, Loss: {loss!s}")
+        nb = i + 1.0
+
+        if global_rank == 0 and start is not None:
+            print(
+                "Average Training Iteration Time:",
+                (time.time() - start) / (nb - warmup_steps),
+                "s/iter",
+            )
+
+        with torch.no_grad():
+            total_correct = total_examples = 0
+            for i, batch in enumerate(valid_loader):
+                if i >= eval_steps:
+                    break
+
+                batch = batch.to(device)
+                batch_size = batch.batch_size
+
+                batch.y = batch.y.to(torch.long)
+                out = model(batch.x, batch.edge_index)[:batch_size]
+
+                pred = out.argmax(dim=-1)
+                y = batch.y[:batch_size].view(-1).to(torch.long)
+
+                total_correct += int((pred == y).sum())
+                total_examples += y.size(0)
+
+            acc_val = total_correct / max(total_examples, 1)
+            if global_rank == 0:
+                print(f"Validation Accuracy: {acc_val * 100.0:.4f}%")
+
+        torch.cuda.synchronize()
+
+    with torch.no_grad():
+        total_correct = total_examples = 0
+        for i, batch in enumerate(test_loader):
+            batch = batch.to(device)
+            batch_size = batch.batch_size
+
+            batch.y = batch.y.to(torch.long)
+            out = model(batch.x, batch.edge_index)[:batch_size]
+
+            pred = out.argmax(dim=-1)
+            y = batch.y[:batch_size].view(-1).to(torch.long)
+
+            total_correct += int((pred == y).sum())
+            total_examples += y.size(0)
+
+        acc_test = total_correct / max(total_examples, 1)
+        if global_rank == 0:
+            print(f"Test Accuracy: {acc_test * 100.0:.4f}%")
+
+    if global_rank == 0:
+        total_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total Program Runtime (total_time) =", total_time, "seconds")
+        print("total_time - prep_time =", total_time - prep_time, "seconds")
+
+    wm_finalize()
+    cugraph_comms_shutdown()
+
+
+def parse_args():
+    """Parse this example's command-line options."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hidden_channels", type=int, default=256)
+    parser.add_argument("--num_layers", type=int, default=2)
+    parser.add_argument("--lr", type=float, default=0.001)
+    parser.add_argument("--epochs", type=int, default=4)
+    parser.add_argument("--batch_size", type=int, default=1024)
+    parser.add_argument("--fan_out", type=int, default=30)
+    parser.add_argument("--tempdir_root", type=str, default=None)
+    parser.add_argument("--dataset_root", type=str, default="dataset")
+    parser.add_argument("--dataset", type=str, default="ogbn-products")
+    parser.add_argument("--skip_partition", action="store_true")
+    parser.add_argument("--wg_mem_type", type=str, default="chunked")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    wall_clock_start = time.perf_counter()
+
+    if "LOCAL_RANK" in os.environ:
+        dist.init_process_group("nccl")
+        world_size = dist.get_world_size()
+        global_rank = dist.get_rank()
+        local_rank = int(os.environ["LOCAL_RANK"])
+        device = torch.device(local_rank)
+
+        # Create the uid needed for cuGraph comms
+        if global_rank == 0:
+            cugraph_id = [cugraph_comms_create_unique_id()]
+        else:
+            cugraph_id = [None]
+        dist.broadcast_object_list(cugraph_id, src=0, device=device)
+        cugraph_id = cugraph_id[0]
+
+        init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id)
+
+        # Paths of the partitioned data, all under the dataset root
+        edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part")
+        feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part")
+        label_path = os.path.join(args.dataset_root, args.dataset + "_label_part")
+        meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json")
+
+        # We partition the data to avoid loading it in every worker, which will
+        # waste memory and can lead to an out of memory exception.
+        # cugraph_pyg.GraphStore and cugraph_pyg.WholeFeatureStore are always
+        # constructed from partitions of the edge index and features, respectively,
+        # so this works well.
+        if not args.skip_partition and global_rank == 0:
+            dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
+            split_idx = dataset.get_idx_split()
+
+            partition_data(
+                dataset,
+                split_idx,
+                meta_path=meta_path,
+                label_path=label_path,
+                feature_path=feature_path,
+                edge_path=edge_path,
+            )
+        # Ranks wait here so rank 0's partitions exist before any rank loads them.
+        dist.barrier()
+        data, split_idx, meta = load_partitioned_data(
+            rank=global_rank,
+            edge_path=edge_path,
+            feature_path=feature_path,
+            label_path=label_path,
+            meta_path=meta_path,
+            wg_mem_type=args.wg_mem_type,
+        )
+        dist.barrier()
+
+        model = torch_geometric.nn.models.GCN(
+            meta["num_features"],
+            args.hidden_channels,
+            args.num_layers,
+            meta["num_classes"],
+        ).to(device)
+        model = DistributedDataParallel(model, device_ids=[local_rank])
+        # Loader output goes under a temporary directory that is removed on exit.
+        with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir:
+            run_train(
+                global_rank,
+                data,
+                split_idx,
+                world_size,
+                device,
+                model,
+                args.epochs,
+                args.batch_size,
+                args.fan_out,
+                meta["num_classes"],
+                wall_clock_start,
+                tempdir,
+                args.num_layers,
+            )
+    else:
+        warnings.warn("This script should be run with 'torchrun`.  Exiting.")
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
index 71b0e4bb2fb..82a612622a1 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
@@ -16,7 +16,7 @@
 import tempfile
 import os
 
-from typing import Optional
+from typing import Optional, Tuple, Dict
 
 import torch
 import cupy
@@ -42,137 +42,174 @@
 
 enable_spilling()
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--hidden_channels", type=int, default=256)
-parser.add_argument("--num_layers", type=int, default=2)
-parser.add_argument("--lr", type=float, default=0.001)
-parser.add_argument("--epochs", type=int, default=4)
-parser.add_argument("--batch_size", type=int, default=1024)
-parser.add_argument("--fan_out", type=int, default=30)
-parser.add_argument("--tempdir_root", type=str, default=None)
-parser.add_argument("--dataset_root", type=str, default="dataset")
-parser.add_argument("--dataset", type=str, default="ogbn-products")
-
-args = parser.parse_args()
-
-wall_clock_start = time.perf_counter()
-device = torch.device("cuda")
-
-from ogb.nodeproppred import PygNodePropPredDataset  # noqa: E402
-
-dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
-split_idx = dataset.get_idx_split()
-data = dataset[0]
-
-graph_store = cugraph_pyg.data.GraphStore()
-graph_store[
-    ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
-] = data.edge_index
-
-feature_store = cugraph_pyg.data.TensorDictFeatureStore()
-feature_store["node", "x"] = data.x
-feature_store["node", "y"] = data.y
-
-with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir:
-    train_dir = os.path.join(samples_dir, "train")
-    os.mkdir(train_dir)
-    train_loader = NeighborLoader(
-        data=(feature_store, graph_store),
-        num_neighbors=[args.fan_out] * args.num_layers,
-        input_nodes=split_idx["train"],
-        replace=False,
-        batch_size=args.batch_size,
-        directory=train_dir,
-    )
 
-    val_dir = os.path.join(samples_dir, "val")
-    os.mkdir(val_dir)
-    val_loader = NeighborLoader(
-        data=(feature_store, graph_store),
-        num_neighbors=[args.fan_out] * args.num_layers,
-        input_nodes=split_idx["valid"],
-        replace=False,
-        batch_size=args.batch_size,
-        directory=val_dir,
+def train(epoch: int):
+    """Run one training epoch and print the mean time per post-warmup iteration."""
+    model.train()
+    start_avg_time = None  # stays None if the epoch ends during warmup
+    for i, batch in enumerate(train_loader):
+        if i == warmup_steps:
+            torch.cuda.synchronize()
+            start_avg_time = time.perf_counter()
+        batch = batch.to(device)
+        optimizer.zero_grad()
+        out = model(batch.x, batch.edge_index)[: batch.batch_size]
+        y = batch.y[: batch.batch_size].view(-1).to(torch.long)
+        loss = F.cross_entropy(out, y)
+        loss.backward()
+        optimizer.step()
+        if i % 10 == 0:
+            print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}")
+    torch.cuda.synchronize()
+    if start_avg_time is None:
+        return  # nothing was timed, so there is no average to report
+    print(
+        "Average Training Iteration Time (s/iter): "
+        f"{(time.perf_counter() - start_avg_time) / (i - warmup_steps + 1):.6f}"
     )
 
-    test_dir = os.path.join(samples_dir, "test")
-    os.mkdir(test_dir)
-    test_loader = NeighborLoader(
-        data=(feature_store, graph_store),
-        num_neighbors=[args.fan_out] * args.num_layers,
-        input_nodes=split_idx["test"],
-        replace=False,
-        batch_size=args.batch_size,
-        directory=test_dir,
+
+@torch.no_grad()
+def test(loader: NeighborLoader, val_steps: Optional[int] = None):
+    """Return the fraction of correct predictions ``model`` makes on ``loader``.
+
+    Stops after ``val_steps`` batches when given; ``None`` uses every batch.
+    """
+    model.eval()
+    hits = seen = 0
+    for step, batch in enumerate(loader):
+        if val_steps is not None and step >= val_steps:
+            break
+        batch = batch.to(device)
+        n = batch.batch_size
+        guesses = model(batch.x, batch.edge_index)[:n].argmax(dim=-1)
+        labels = batch.y[:n].view(-1).to(torch.long)
+        hits += int((guesses == labels).sum())
+        seen += labels.size(0)
+    return hits / seen
+
+
+def create_loader(
+    data, num_neighbors, input_nodes, replace, batch_size, samples_dir, stage_name
+):
+    directory = os.path.join(samples_dir, stage_name)
+    os.mkdir(directory)
+    return NeighborLoader(
+        data,
+        num_neighbors=num_neighbors,
+        input_nodes=input_nodes,
+        replace=replace,
+        batch_size=batch_size,
+        directory=directory,
     )
 
-    model = torch_geometric.nn.models.GCN(
+
+def load_data(
+    dataset, dataset_root
+) -> Tuple[
+    Tuple[torch_geometric.data.FeatureStore, torch_geometric.data.GraphStore],
+    Dict[str, torch.Tensor],
+    int,
+    int,
+]:
+    from ogb.nodeproppred import PygNodePropPredDataset
+
+    dataset = PygNodePropPredDataset(dataset, root=dataset_root)
+    split_idx = dataset.get_idx_split()
+    data = dataset[0]
+
+    graph_store = cugraph_pyg.data.GraphStore()
+    graph_store[
+        ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
+    ] = data.edge_index
+
+    feature_store = cugraph_pyg.data.TensorDictFeatureStore()
+    feature_store["node", "x"] = data.x
+    feature_store["node", "y"] = data.y
+
+    return (
+        (feature_store, graph_store),
+        split_idx,
         dataset.num_features,
-        args.hidden_channels,
-        args.num_layers,
         dataset.num_classes,
-    ).to(device)
+    )
 
-    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005)
 
-    warmup_steps = 20
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hidden_channels", type=int, default=256)
+    parser.add_argument("--num_layers", type=int, default=2)
+    parser.add_argument("--lr", type=float, default=0.001)
+    parser.add_argument("--epochs", type=int, default=4)
+    parser.add_argument("--batch_size", type=int, default=1024)
+    parser.add_argument("--fan_out", type=int, default=30)
+    parser.add_argument("--tempdir_root", type=str, default=None)
+    parser.add_argument("--dataset_root", type=str, default="dataset")
+    parser.add_argument("--dataset", type=str, default="ogbn-products")
 
-    def train(epoch: int):
-        model.train()
-        for i, batch in enumerate(train_loader):
-            if i == warmup_steps:
-                torch.cuda.synchronize()
-                start_avg_time = time.perf_counter()
-            batch = batch.to(device)
+    return parser.parse_args()
 
-            optimizer.zero_grad()
-            batch_size = batch.batch_size
-            out = model(batch.x, batch.edge_index)[:batch_size]
-            y = batch.y[:batch_size].view(-1).to(torch.long)
 
-            loss = F.cross_entropy(out, y)
-            loss.backward()
-            optimizer.step()
+if __name__ == "__main__":
+    args = parse_args()
+    wall_clock_start = time.perf_counter()
+    device = torch.device("cuda")
 
-            if i % 10 == 0:
-                print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}")
-        torch.cuda.synchronize()
-        print(
-            f"Average Training Iteration Time (s/iter): \
-                {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}"
+    data, split_idx, num_features, num_classes = load_data(
+        args.dataset, args.dataset_root
+    )
+
+    with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir:
+        loader_kwargs = {
+            "data": data,
+            "num_neighbors": [args.fan_out] * args.num_layers,
+            "replace": False,
+            "batch_size": args.batch_size,
+            "samples_dir": samples_dir,
+        }
+
+        train_loader = create_loader(
+            input_nodes=split_idx["train"],
+            stage_name="train",
+            **loader_kwargs,
+        )
+
+        val_loader = create_loader(
+            input_nodes=split_idx["valid"],
+            stage_name="val",
+            **loader_kwargs,
         )
 
-    @torch.no_grad()
-    def test(loader: NeighborLoader, val_steps: Optional[int] = None):
-        model.eval()
+        test_loader = create_loader(
+            input_nodes=split_idx["test"],
+            stage_name="test",
+            **loader_kwargs,
+        )
 
-        total_correct = total_examples = 0
-        for i, batch in enumerate(loader):
-            if val_steps is not None and i >= val_steps:
-                break
-            batch = batch.to(device)
-            batch_size = batch.batch_size
-            out = model(batch.x, batch.edge_index)[:batch_size]
-            pred = out.argmax(dim=-1)
-            y = batch.y[:batch_size].view(-1).to(torch.long)
+        model = torch_geometric.nn.models.GCN(
+            num_features,
+            args.hidden_channels,
+            args.num_layers,
+            num_classes,
+        ).to(device)
 
-            total_correct += int((pred == y).sum())
-            total_examples += y.size(0)
+        optimizer = torch.optim.Adam(
+            model.parameters(), lr=args.lr, weight_decay=0.0005
+        )
 
-        return total_correct / total_examples
+        warmup_steps = 20
 
-    torch.cuda.synchronize()
-    prep_time = round(time.perf_counter() - wall_clock_start, 2)
-    print("Total time before training begins (prep_time)=", prep_time, "seconds")
-    print("Beginning training...")
-    for epoch in range(1, 1 + args.epochs):
-        train(epoch)
-        val_acc = test(val_loader, val_steps=100)
-        print(f"Val Acc: ~{val_acc:.4f}")
-
-    test_acc = test(test_loader)
-    print(f"Test Acc: {test_acc:.4f}")
-    total_time = round(time.perf_counter() - wall_clock_start, 2)
-    print("Total Program Runtime (total_time) =", total_time, "seconds")
-    print("total_time - prep_time =", total_time - prep_time, "seconds")
+        torch.cuda.synchronize()
+        prep_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total time before training begins (prep_time)=", prep_time, "seconds")
+        print("Beginning training...")
+        for epoch in range(1, 1 + args.epochs):
+            train(epoch)
+            val_acc = test(val_loader, val_steps=100)
+            print(f"Val Acc: ~{val_acc:.4f}")
+
+        test_acc = test(test_loader)
+        print(f"Test Acc: {test_acc:.4f}")
+        total_time = round(time.perf_counter() - wall_clock_start, 2)
+        print("Total Program Runtime (total_time) =", total_time, "seconds")
+        print("total_time - prep_time =", total_time - prep_time, "seconds")
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py
new file mode 100644
index 00000000000..f1f514560c8
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph_pyg.data import TensorDictFeatureStore, WholeFeatureStore
+
+torch = import_optional("torch")
+pylibwholegraph = import_optional("pylibwholegraph")
+
+
+def run_test_wholegraph_feature_store_basic_api(rank, world_size, dtype):
+    """Per-rank worker comparing WholeFeatureStore with TensorDictFeatureStore.
+
+    Each rank stores its shard of a shared tensor; full reads must match.
+    """
+    # Unknown dtype names fail here with KeyError instead of an unbound local.
+    torch_dtype = {"float32": torch.float32, "int64": torch.int64}[dtype]
+
+    torch.cuda.set_device(rank)
+
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    pylibwholegraph.torch.initialize.init(rank, world_size, rank, world_size)
+
+    features = torch.arange(0, world_size * 2000)
+    features = features.reshape((features.numel() // 100, 100)).to(torch_dtype)
+
+    tensordict_store = TensorDictFeatureStore()
+    tensordict_store["node", "fea"] = features
+
+    whole_store = WholeFeatureStore()
+    whole_store["node", "fea"] = torch.tensor_split(features, world_size)[rank]
+
+    ix = torch.arange(features.shape[0])
+    assert (
+        whole_store["node", "fea"][ix].cpu() == tensordict_store["node", "fea"][ix]
+    ).all()
+
+    label = torch.arange(0, features.shape[0]).reshape((features.shape[0], 1))
+    tensordict_store["node", "label"] = label
+    whole_store["node", "label"] = torch.tensor_split(label, world_size)[rank]
+
+    # Check the newly stored "label" attribute (previously re-checked "fea")
+    # and confirm that adding it did not disturb the existing "fea" data.
+    for attr in ("fea", "label"):
+        assert (
+            whole_store["node", attr][ix].cpu() == tensordict_store["node", attr][ix]
+        ).all()
+
+    pylibwholegraph.torch.initialize.finalize()
+
+
+@pytest.mark.skipif(
+    isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
+)
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.parametrize("dtype", ["float32", "int64"])
+@pytest.mark.mg
+def test_wholegraph_feature_store_basic_api(dtype):
+    """Run the basic-API worker in one spawned process per CUDA device."""
+    num_workers = torch.cuda.device_count()
+    # spawn() calls the worker as fn(i, *args); the process index i becomes
+    # the worker's ``rank`` and ``args`` supplies the remaining parameters.
+    torch.multiprocessing.spawn(
+        run_test_wholegraph_feature_store_basic_api,
+        args=(num_workers, dtype),
+        nprocs=num_workers,
+    )
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index dfa522e6047..b41911b5f80 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -40,6 +40,7 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
 [project.optional-dependencies]
 test = [
     "pandas",
+    "pylibwholegraph==24.6.*",
     "pytest",
     "pytest-benchmark",
     "pytest-cov",
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index b29d6f80ff0..8f9a6214ace 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -55,6 +55,7 @@ test = [
     "networkx>=2.5.1",
     "numpy>=1.23,<2.0a0",
     "pandas",
+    "pylibwholegraph==24.6.*",
     "pytest",
     "pytest-benchmark",
     "pytest-cov",