From 4e3045fed370fa28b23faa7d10b9cfa955a62412 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 6 Jun 2024 11:15:13 -0700 Subject: [PATCH 01/47] c --- conda/recipes/cugraph-dgl/meta.yaml | 4 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- dependencies.yaml | 76 +++++++++++++++-------------- 3 files changed, 43 insertions(+), 39 deletions(-) diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 5e28e69a0d7..abdd91e21e0 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -27,8 +27,10 @@ requirements: - numba >=0.57 - numpy >=1.23,<2.0a0 - pylibcugraphops ={{ minor_version }} + - tensordict >=0.1.2,<0.3.1 - python - - pytorch + - pytorch >=2.0 + - cupy >= 12.0.0 tests: imports: diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 64091ff4782..60b7df5efa2 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,7 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2 + - tensordict >=0.1.2,<0.3.1 - pyg >=2.5,<2.6 tests: diff --git a/dependencies.yaml b/dependencies.yaml index 93cb1bf35ac..2d26fbf1a6a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -73,6 +73,7 @@ files: table: build-system includes: - common_build + - python_build_wheel - depends_on_rmm - depends_on_pylibraft - depends_on_pylibcugraph @@ -107,6 +108,7 @@ files: table: build-system includes: - common_build + - python_build_wheel - depends_on_rmm - depends_on_pylibraft - python_build_cythonize @@ -357,11 +359,11 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcudf==24.8.* - - libcugraphops==24.8.* - - libraft-headers==24.8.* - - libraft==24.8.* - - librmm==24.8.* + - libcudf==24.6.* + - libcugraphops==24.6.* + - libraft-headers==24.6.* + - libraft==24.6.* + - librmm==24.6.* - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -446,11 +448,11 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==24.8.* - - &dask_cuda dask-cuda==24.8.* + - &dask rapids-dask-dependency==24.6.* + - &dask_cuda dask-cuda==24.6.* - &numba numba>=0.57 - &numpy numpy>=1.23,<2.0a0 - - &ucx_py ucx-py==0.39.* + - &ucx_py ucx-py==0.38.* - output_types: conda packages: - aiohttp @@ -475,15 +477,17 @@ dependencies: packages: - *numba - *numpy + - &tensordict tensordict>=0.1.2,<0.3.1 - output_types: [pyproject] packages: - - &cugraph cugraph==24.8.* + - &cugraph cugraph==24.6.* python_run_cugraph_pyg: common: - output_types: [conda, pyproject] packages: - *numba - *numpy + - *tensordict - output_types: [pyproject] packages: - *cugraph @@ -505,7 +509,7 @@ dependencies: - output_types: pyproject packages: - *cugraph - - cugraph-service-client==24.8.* + - cugraph-service-client==24.6.* test_cpp: common: - output_types: conda @@ -538,9 +542,6 @@ dependencies: - *numpy - python-louvain - scikit-learn>=0.23.1 - - output_types: [conda] - packages: - - pylibwholegraph==24.8.* test_python_pylibcugraph: common: @@ -558,18 +559,19 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.8.* + - cugraph==24.6.* - pytorch>=2.0 - pytorch-cuda==11.8 + - *tensordict - dgl>=1.1.0.cu* cugraph_pyg_dev: common: - output_types: [conda] packages: - - cugraph==24.8.* + - cugraph==24.6.* - pytorch>=2.0 - pytorch-cuda==11.8 - - tensordict>=0.1.2 + - *tensordict - pyg>=2.5,<2.6 depends_on_pylibwholegraph: @@ 
-597,7 +599,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.* + - &rmm_conda rmm==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -608,17 +610,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.8.* + - rmm-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.8.* + - rmm-cu11==24.6.* - {matrix: null, packages: [*rmm_conda]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.* + - &cudf_conda cudf==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -629,17 +631,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.8.* + - cudf-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.8.* + - cudf-cu11==24.6.* - {matrix: null, packages: [*cudf_conda]} depends_on_dask_cudf: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==24.8.* + - &dask_cudf_conda dask-cudf==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -650,17 +652,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - dask-cudf-cu12==24.8.* + - dask-cudf-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - dask-cudf-cu11==24.8.* + - dask-cudf-cu11==24.6.* - {matrix: null, packages: [*dask_cudf_conda]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==24.8.* + - &pylibraft_conda pylibraft==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -671,17 +673,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibraft-cu12==24.8.* + - pylibraft-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - pylibraft-cu11==24.8.* + - pylibraft-cu11==24.6.* - {matrix: null, packages: [*pylibraft_conda]} depends_on_raft_dask: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==24.8.* + - &raft_dask_conda raft-dask==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -692,17 +694,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - raft-dask-cu12==24.8.* + - raft-dask-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - raft-dask-cu11==24.8.* + - raft-dask-cu11==24.6.* - {matrix: null, packages: [*raft_dask_conda]} depends_on_pylibcugraph: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==24.8.* + - &pylibcugraph_conda pylibcugraph==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -713,17 +715,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibcugraph-cu12==24.8.* + - pylibcugraph-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - pylibcugraph-cu11==24.8.* + - pylibcugraph-cu11==24.6.* - {matrix: null, packages: [*pylibcugraph_conda]} depends_on_pylibcugraphops: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==24.8.* + - &pylibcugraphops_conda pylibcugraphops==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -734,10 +736,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibcugraphops-cu12==24.8.* + - pylibcugraphops-cu12==24.6.* - matrix: {cuda: "11.*"} 
packages: - - pylibcugraphops-cu11==24.8.* + - pylibcugraphops-cu11==24.6.* - {matrix: null, packages: [*pylibcugraphops_conda]} depends_on_cupy: From f243351c419c194b613925a97c90b235ee22893c Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 6 Jun 2024 11:20:18 -0700 Subject: [PATCH 02/47] pull in dependency fixes --- ci/test_python.sh | 5 ++--- ci/test_wheel_cugraph-pyg.sh | 1 - conda/recipes/cugraph-dgl/meta.yaml | 2 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index fdcf88d692a..90cdf48c46c 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -214,9 +214,8 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then # Install pyg dependencies (which requires pip) pip install \ - ogb \ - tensordict - + ogb + pip install \ pyg_lib \ torch_scatter \ diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index 1004063cc38..c55ae033344 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -42,7 +42,6 @@ rapids-retry python -m pip install \ pyg_lib \ torch_scatter \ torch_sparse \ - tensordict \ -f ${PYG_URL} rapids-logger "pytest cugraph-pyg (single GPU)" diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index abdd91e21e0..8d4f5327092 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -27,7 +27,7 @@ requirements: - numba >=0.57 - numpy >=1.23,<2.0a0 - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2,<0.3.1 + - tensordict >=0.1.2,<0.3.1a0 - python - pytorch >=2.0 - cupy >= 12.0.0 diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 60b7df5efa2..eef02994da9 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,7 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2,<0.3.1 + - tensordict >=0.1.2,<0.3.1a0 - pyg >=2.5,<2.6 tests: From 4c29329957dce8f0fe2c5225151051cf93634dd0 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 7 Jun 2024 11:03:26 -0700 Subject: [PATCH 03/47] w --- python/cugraph-dgl/cugraph_dgl/features.py | 24 +++++ python/cugraph-dgl/cugraph_dgl/graph.py | 95 ++++++++++++++++++- python/cugraph-dgl/cugraph_dgl/typing.py | 16 ++++ .../utils/cugraph_conversion_utils.py | 15 ++- 4 files changed, 147 insertions(+), 3 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/features.py create mode 100644 python/cugraph-dgl/cugraph_dgl/typing.py diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py new file mode 100644 index 00000000000..1e96a5ecc4a --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/features.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dgl = import_optional('dgl') + +class FeatureStore(dgl.FeatureStorage): + """ + Interface for feature storage. 
+ """ + + def requires_ddp(self) -> bool: + return False + + def fetch \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 0b2f366f26d..726ceaa5405 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -11,7 +11,98 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cupy + +from cugraph_dgl.typing import TensorType +from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor + +from typing import Union, Optional, Dict, Tuple + +# Have to use import_optional even though these are required +# dependencies in order to build properly. +dgl = import_optional("dgl") +torch = import_optional("torch") +tensordict = import_optional("tensordict") + +HOMOGENEOUS_EDGE_TYPE = ('n','e','n') + class Graph: """ - Duck-typed version of dgl.DGLGraph. - """ \ No newline at end of file + cuGraph-backed duck-typed version of dgl.DGLGraph that distributes + the graph across workers. This object uses lazy graph creation. + Users can repeatedly call add_edges, and the tensors won't + be converted into a cuGraph graph until one is needed + (i.e. when creating a loader). Supports + single-node/single-GPU, single-node/multi-GPU, and + multi-node/multi-GPU graph storage. + + Each worker should have a slice of the graph locally, and + call put_edge_index with its slice. + """ + + def __init__(self, is_multi_gpu: bool=False): + """ + Parameters + ---------- + is_multi_gpu: bool (optional, default=False) + Specifies whether this graph is distributed across GPUs. + """ + + self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) + self.__sizes = {} + self.__graph = None + self.__vertex_offsets = None + self.__handle = None + self.__is_multi_gpu = is_multi_gpu + + def to_canonical_etype(self, etype: Union[str, Tuple[str, str, str]]) -> Tuple[str, str, str]: + if etype is None: + if len(self.__edge_indices.keys(leaves_only=True,include_nested=True)) > 1: + raise ValueError("Edge type is required for heterogeneous graphs.") + return HOMOGENEOUS_EDGE_TYPE + + if isinstance(etype, Tuple[str, str, str]): + return etype + + for src_type, rel_type, dst_type in self.__edge_indices.keys(leaves_only=True,include_nested=True): + if etype == rel_type: + return (src_type, rel_type, dst_type) + + raise ValueError( + "Unknown relation type " + etype + ) + + def add_edges(self, u: TensorType, v: TensorType, data:Optional[Dict[str, TensorType]]=None, etype:Optional[Union[str, Tuple[str, str, str]]]=None) -> None: + """ + Adds edges to this graph. + + Parameters + ---------- + u: TensorType + 1d tensor of source vertex ids. + v: TensorType + 1d tensor of destination vertex ids. + data: Dict[str, TensorType] (optional, default=None) + Dictionary containing edge features for the new edges. + etype: Union[str, Tuple[str, str, str]] + The edge type of the edges being inserted. Not required + for homogeneous graphs, which have only one edge type. 
+ """ + + dgl_can_edge_type = self.to_canonical_etype(etype) + + new_edges = torch.stack([ + _cast_to_torch_tensor(u), + _cast_to_torch_tensor(v), + ]) + + if dgl_can_edge_type in self.__edge_indices.keys(leaves_only=True, include_nested=True): + self.__edge_indices[dgl_can_edge_type] = torch.concat([ + self.__edge_indices[dgl_can_edge_type], + new_edges, + ], dim=1) + else: + self.__edge_indices[dgl_can_edge_type] = new_edges + + if data is not None: + \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/typing.py b/python/cugraph-dgl/cugraph_dgl/typing.py new file mode 100644 index 00000000000..7a16a1b3dfd --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/typing.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +TensorType = Union["torch.Tensor", "cupy.ndarray", "numpy.ndarray", "cudf.Series", "pandas.Series", List[int]] diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py index 647dbd38a64..7ae1cba0263 100644 --- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py +++ b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py @@ -13,7 +13,9 @@ # Utils to convert b/w dgl heterograph to cugraph GraphStore from __future__ import annotations -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, List + +from cugraph_dgl.typing import TensorType import cudf import pandas as pd @@ -21,6 +23,7 @@ import dask_cudf from dask.distributed import get_client import cupy as cp +import numpy as np from cugraph.utilities.utils import import_optional from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import src_n, dst_n @@ -115,3 +118,13 @@ def add_edata_from_dgl_HeteroGraph(gs, g): gs.edata_storage.add_data( feat_name=feat_name, type_name=etype, feat_obj=feat_t ) + + +def _cast_to_torch_tensor(t: TensorType) -> "torch.Tensor": + if isinstance(t, torch.Tensor): + return t + elif isinstance(t, (cp.ndarray, cudf.Series)): + return torch.as_tensor(t, device='cuda') + elif isinstance(t, pd.Series, np.ndarray): + return torch.as_tensor(t, device='cpu') + return torch.as_tensor(t) \ No newline at end of file From 265f5467d935b499d1fdae8e771ccea8871adfe0 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 7 Jun 2024 14:37:08 -0700 Subject: [PATCH 04/47] basic graph/fs --- python/cugraph-dgl/cugraph_dgl/features.py | 105 ++++++++- python/cugraph-dgl/cugraph_dgl/graph.py | 244 ++++++++++++++++++--- 2 files changed, 316 insertions(+), 33 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py index 1e96a5ecc4a..8037bd90454 100644 --- a/python/cugraph-dgl/cugraph_dgl/features.py +++ b/python/cugraph-dgl/cugraph_dgl/features.py @@ -11,14 +11,111 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-dgl = import_optional('dgl')
+import warnings
+
+from cugraph.utilities.utils import import_optional, MissingModule
+
+torch = import_optional("torch")
+dgl = import_optional("dgl")
+wgth = import_optional("pylibwholegraph.torch")
+
 
-class FeatureStore(dgl.FeatureStorage):
+class WholeFeatureStore(
+    object if isinstance(dgl, MissingModule) else dgl.storages.base.FeatureStorage
+):
     """
     Interface for feature storage.
     """
 
+    def __init__(
+        self,
+        tensor: "torch.Tensor",
+        memory_type: str = "distributed",
+        location: str = "cpu",
+    ):
+        """
+        Constructs a new WholeFeatureStore object that wraps a WholeGraph wholememory
+        distributed tensor.
+
+        Parameters
+        ----------
+        tensor: torch.Tensor
+            The local slice of the tensor being distributed. These should be in order
+            by rank (i.e. rank 0 contains elements 0-9, rank 1 contains elements 10-19,
+            rank 2 contains elements 20-29, etc.) The sizes do not need to be equal.
+        memory_type: str (optional, default='distributed')
+            The memory type of this store. Options are
+            'distributed', 'chunked', and 'continuous'.
+            For more information consult the WholeGraph
+            documentation.
+        location: str (optional, default='cpu')
+            The location ('cpu' or 'cuda') where data is stored.
+        """
+        self.__wg_comm = wgth.get_local_node_communicator()
+
+        if len(tensor.shape) > 2:
+            raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")
+
+        rank = torch.distributed.get_rank()
+        world_size = torch.distributed.get_world_size()
+
+        ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64)
+        sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64)
+        torch.distributed.all_gather_into_tensor(sizes, ld)
+
+        sizes = sizes.cpu()
+        ld = sizes.sum()
+
+        self.__td = -1 if len(tensor.shape) == 1 else tensor.shape[1]
+        global_shape = [
+            int(ld),
+            self.__td if self.__td > 0 else 1,
+        ]
+
+        if self.__td < 0:
+            tensor = tensor.reshape((tensor.shape[0], 1))
+
+        wg_tensor = wgth.create_wholememory_tensor(
+            self.__wg_comm,
+            memory_type,
+            location,
+            global_shape,
+            tensor.dtype,
+            [global_shape[1], 1],
+        )
+
+        offset = sizes[:rank].sum() if rank > 0 else 0
+
+        wg_tensor.scatter(
+            tensor.clone(memory_format=torch.contiguous_format).cuda(),
+            torch.arange(
+                offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda"
+            ).contiguous(),
+        )
+
+        self.__wg_comm.barrier()
+
+        self.__wg_tensor = wg_tensor
+
     def requires_ddp(self) -> bool:
-        return False
+        return True
+
+    def fetch(
+        self,
+        indices: "torch.Tensor",
+        device: "torch.cuda.Device",
+        pin_memory=False,
+        **kwargs,
+    ):
+        if pin_memory:
+            warnings.warn("pin_memory has no effect for WholeFeatureStorage.")
+
+        t = self.__wg_tensor.gather(
+            indices.cuda(),
+            force_dtype=self.__wg_tensor.dtype,
+        )
+
+        if self.__td < 0:
+            t = t.reshape((t.shape[0],))
 
-    def fetch
\ No newline at end of file
+        return t
diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py
index 726ceaa5405..731aeab0b3f 100644
--- a/python/cugraph-dgl/cugraph_dgl/graph.py
+++ b/python/cugraph-dgl/cugraph_dgl/graph.py
@@ -11,7 +11,98 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import cupy +from cugraph.utilities.utils import import_optional from cugraph_dgl.typing import TensorType from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor +from cugraph_dgl.features import WholeFeatureStore from typing import Union, Optional, Dict, Tuple @@ -24,7 +25,9 @@ torch = import_optional("torch") tensordict = import_optional("tensordict") -HOMOGENEOUS_EDGE_TYPE = ('n','e','n') +HOMOGENEOUS_NODE_TYPE = "n" +HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE) + class Graph: """ @@ -40,48 +43,208 @@ class Graph: call put_edge_index with its slice. """ - def __init__(self, is_multi_gpu: bool=False): + def __init__( + self, + is_multi_gpu: bool = False, + ndata_storage="torch", + edata_storage="torch", + **kwargs, + ): """ Parameters ---------- is_multi_gpu: bool (optional, default=False) Specifies whether this graph is distributed across GPUs. + ndata_storage: str (optional, default='torch') + Specifies where node data should be stored + (options are 'torch' and 'wholegraph'). + If using PyTorch tensors for storage ('torch') + then data will be replicated across workers and data + for all nodes should be provided when calling add_nodes. + If using WholeGraph wholememory tensors for storage, + then data will be distributed across workers and only + the local slice of the data should be provided when + calling add_nodes. + edata_storage: str (optional, default='torch') + If using PyTorch tensors for storage ('torch') + then data will be replicated across workers and data + for all nodes should be provided when calling add_edge. + If using WholeGraph wholememory tensors for storage, + then data will be distributed across workers and only + the local slice of the data should be provided when + calling add_edges. + kwargs: + Optional kwargs for WholeGraph feature storage. 
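+
+        Examples
+        --------
+        A sketch of the two storage modes; the multi-GPU variant assumes
+        torch.distributed and WholeGraph have already been initialized:
+
+        >>> from cugraph_dgl.graph import Graph
+        >>> g = Graph()  # replicated torch-tensor storage
+        >>> mg = Graph(is_multi_gpu=True, ndata_storage="wholegraph")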
""" + if ndata_storage not in ("torch", "wholegraph"): + raise ValueError( + "Invalid node storage type (valid types are 'torch' and 'wholegraph')" + ) + if edata_storage not in ("torch", "wholegraph"): + raise ValueError( + "Invalid edge storage type (valid types are 'torch' and 'wholegraph')" + ) + + self.__num_nodes_dict = {} self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) + self.__sizes = {} self.__graph = None self.__vertex_offsets = None self.__handle = None self.__is_multi_gpu = is_multi_gpu - def to_canonical_etype(self, etype: Union[str, Tuple[str, str, str]]) -> Tuple[str, str, str]: + self.__ndata_storage_type = ( + WholeFeatureStore + if ndata_storage == "wholegraph" + else dgl.storages.pytorch_tensor.PyTorchTensorStorage + ) + self.__edata_storage_type = ( + WholeFeatureStore + if edata_storage == "wholegraph" + else dgl.storages.pytorch_tensor.PyTorchTensorStorage + ) + self.__ndata_storage = {} + self.__edata_storage = {} + self.__wg_kwargs = kwargs + + @property + def is_multi_gpu(self): + return self.__is_multi_gpu + + def to_canonical_etype( + self, etype: Union[str, Tuple[str, str, str]] + ) -> Tuple[str, str, str]: if etype is None: - if len(self.__edge_indices.keys(leaves_only=True,include_nested=True)) > 1: + if len(self.__edge_indices.keys(leaves_only=True, include_nested=True)) > 1: raise ValueError("Edge type is required for heterogeneous graphs.") return HOMOGENEOUS_EDGE_TYPE if isinstance(etype, Tuple[str, str, str]): return etype - - for src_type, rel_type, dst_type in self.__edge_indices.keys(leaves_only=True,include_nested=True): + + for src_type, rel_type, dst_type in self.__edge_indices.keys( + leaves_only=True, include_nested=True + ): if etype == rel_type: return (src_type, rel_type, dst_type) - - raise ValueError( - "Unknown relation type " + etype - ) - - def add_edges(self, u: TensorType, v: TensorType, data:Optional[Dict[str, TensorType]]=None, etype:Optional[Union[str, Tuple[str, str, str]]]=None) -> None: + + raise ValueError("Unknown relation type " + etype) + + def add_nodes( + self, + global_num_nodes: int, + data: Optional[Dict[str, TensorType]] = None, + ntype: Optional[str] = None, + ): + """ + Adds the given number of nodes to this graph. Can only be called once + per node type. The number of nodes specified here refers to the total + number of nodes across workers. If the backing feature store is + distributed (i.e. wholegraph), then only local features should + be passed to the data argument. If the backing feature store is + replicated, then features for all nodes should be passed to the + data argument, including those for nodes not on the local worker. + + Parameters + ---------- + global_num_nodes: int + The total number of nodes of the given type in this graph. + The same number should be passed to every worker. + data: Dict[str, TensorType] (optional, default=None) + Node feature tensors. + ntype: str (optional, default=None) + The node type being modified. Required for heterogeneous graphs. 
+ """ + if ntype is None: + if len(self.__num_nodes_dict.keys()) > 1: + raise ValueError("Node type is required for heterogeneous graphs.") + ntype = HOMOGENEOUS_NODE_TYPE + + if ntype in self.__num_nodes_dict: + raise ValueError( + "Calling add_nodes multiple types for the same " + "node type is not allowed in cuGraph-DGL" + ) + + if self.is_multi_gpu: + # Ensure all nodes got the same number of nodes passed + world_size = torch.distributed.get_world_size() + local_size = torch.tensor( + [global_num_nodes], device="cuda", dtype=torch.int64 + ) + ns = torch.empty((world_size,), device="cuda", dtype=torch.int64) + torch.distributed.all_gather_into_tensor(ns, local_size) + if not (ns == global_num_nodes).all(): + raise ValueError("The global number of nodes must match on all workers") + + # Ensure the sum of the feature shapes equals the global number of nodes. + for feature_name, feature_tensor in data.items(): + features_size = torch.tensor( + [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64 + ) + torch.distributed.all_reduce( + features_size, op=torch.distributed.ReduceOp.SUM + ) + if features_size != global_num_nodes: + raise ValueError( + "The total length of the feature vector across workers must" + " match the global number of nodes but it does not match for " + f"{feature_name}." + ) + + self.__num_nodes_dict[ntype] = global_num_nodes + + for feature_name, feature_tensor in data.items(): + self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( + feature_tensor, **self.__wg_kwargs + ) + + def __check_node_ids(self, ntype: str, ids: TensorType): + """ + Ensures all node ids in the provided id tensor are valid. + Raises a ValueError if any are invalid. + + Parameters + ---------- + ntype: str + The node type being validated against. + ids: + The tensor of ids being validated. """ - Adds edges to this graph. + if ntype in self.__num_nodes_dict: + if ids.max() + 1 > self.__num_nodes(ntype): + raise ValueError( + f"input tensor contains invalid node ids for type {ntype}" + ) + else: + raise ValueError( + f"add_nodes() must be called for type {ntype} before calling num_edges." + ) + + def add_edges( + self, + u: TensorType, + v: TensorType, + data: Optional[Dict[str, TensorType]] = None, + etype: Optional[Union[str, Tuple[str, str, str]]] = None, + ) -> None: + """ + Adds edges to this graph. Must be called after add_nodes + is called for the src/dst node type. If the backing feature + store is distributed (i.e. wholegraph), then only local + features should be passed to the data argument. If the + backing feature store is replicated, then features for + all edges should be passed to the data argument, + including those for edges not on the local worker. Parameters ---------- u: TensorType - 1d tensor of source vertex ids. + 1d tensor of source node ids (local slice of the distributed edgelist). v: TensorType - 1d tensor of destination vertex ids. + 1d tensor of destination node ids (local slice of the distributed edgelist). data: Dict[str, TensorType] (optional, default=None) Dictionary containing edge features for the new edges. etype: Union[str, Tuple[str, str, str]] @@ -89,20 +252,43 @@ def add_edges(self, u: TensorType, v: TensorType, data:Optional[Dict[str, Tensor for homogeneous graphs, which have only one edge type. """ + # Validate all inputs before proceeding + # The number of nodes for the src/dst type needs to be known and there cannot + # be any edges of this type in the graph. 
+        dgl_can_edge_type = self.to_canonical_etype(etype)
+        src_type, _, dst_type = dgl_can_edge_type
+        if dgl_can_edge_type in self.__edge_indices.keys(
+            leaves_only=True, include_nested=True
+        ):
+            raise ValueError(
+                "This cuGraph-DGL graph already contains edges of type"
+                f" {dgl_can_edge_type}. Calling add_edges multiple times"
+                " for the same edge type is not supported."
+            )
+        self.__check_node_ids(src_type, u)
+        self.__check_node_ids(dst_type, v)
+
+        self.__edge_indices[dgl_can_edge_type] = torch.stack(
+            [
+                _cast_to_torch_tensor(u),
+                _cast_to_torch_tensor(v),
+            ]
+        )
+
+        if data is not None:
+            for attr_name, attr_tensor in data.items():
+                self.__edata_storage[
+                    dgl_can_edge_type, attr_name
+                ] = self.__edata_storage_type(attr_tensor, **self.__wg_kwargs)
+
+    def num_nodes(self, ntype: str = None):
+        """
+        Returns the number of nodes of ntype, or if ntype is not provided,
+        the total number of nodes in the graph.
+        """
+        if ntype is None:
+            if len(self.__num_nodes_dict.keys()) > 1:
+                raise ValueError("ntype is required for heterogeneous graphs")
+            return self.__num_nodes_dict[HOMOGENEOUS_NODE_TYPE]
+
+        return self.__num_nodes_dict[ntype]

From b51eda4ba9127315b273581fb8a41c025382f9be Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Mon, 10 Jun 2024 12:47:00 -0700
Subject: [PATCH 05/47] dist sampling

---
 python/cugraph-dgl/cugraph_dgl/graph.py | 93 +++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 5 deletions(-)

diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py
index 731aeab0b3f..343ba4c8670 100644
--- a/python/cugraph-dgl/cugraph_dgl/graph.py
+++ b/python/cugraph-dgl/cugraph_dgl/graph.py
@@ -11,13 +11,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import cupy
+
+import pylibcugraph
+
+from typing import Union, Optional, Dict, Tuple
+
 from cugraph.utilities.utils import import_optional
 
 from cugraph_dgl.typing import TensorType
 from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor
 from cugraph_dgl.features import WholeFeatureStore
-from typing import Union, Optional, Dict, Tuple
+
 
 # Have to use import_optional even though these are required
 # dependencies in order to build properly.
@@ -87,6 +93,7 @@ def __init__( ) self.__num_nodes_dict = {} + self.__num_edges_dict = {} self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) self.__sizes = {} @@ -200,6 +207,9 @@ def add_nodes( self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( feature_tensor, **self.__wg_kwargs ) + + self.__graph = None + self.__vertex_offsets = None def __check_node_ids(self, ntype: str, ids: TensorType): """ @@ -281,14 +291,87 @@ def add_edges( dgl_can_edge_type, attr_name ] = self.__edata_storage_type(attr_tensor, **self.__wg_kwargs) - def num_nodes(self, ntype: str = None): + num_edges = self.__edge_indices[dgl_can_edge_type].shape[1] + if self.is_multi_gpu: + num_edges = torch.tensor([num_edges], device='cuda', dtype=torch.int64) + torch.distributed.all_reduce(num_edges, op=torch.distributed.ReduceOp.SUM) + + self.__num_edges_dict[dgl_can_edge_type] = int(num_edges) + + self.__graph = None + self.__vertex_offsets = None + + def num_nodes(self, ntype: str = None) -> int: """ Returns the number of nodes of ntype, or if ntype is not provided, the total number of nodes in the graph. """ if ntype is None: - if len(self.__num_nodes_dict.keys()) > 1: - raise ValueError("ntype is required for heterogeneous graphs") - return self.__num_nodes_dict[HOMOGENEOUS_NODE_TYPE] + return sum(self.__num_nodes_dict.values()) return self.__num_nodes_dict[ntype] + + def number_of_nodes(self, ntype: str = None) -> int: + """ + Alias for num_nodes. + """ + return self.num_nodes(ntype=ntype) + + def num_edges(self, etype: Union[str, Tuple[str, str, str]]=None) -> int: + """ + Returns the number of edges of etype, or if etype is not provided, + the total number of edges in the graph. + """ + if etype is None: + return sum(self.__num_edges_dict.values()) + + etype = self.to_canonical_etype(etype) + return self.__num_edges_dict[etype] + + def number_of_edges(self, etype: Union[str, Tuple[str, str, str]]=None) -> int: + """ + Alias for num_edges. 
+ """ + return self.num_edges(etype=etype) + + @property + def is_homogeneous(self): + return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <=1 + + @property + def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + if self.__graph is None: + edgelist_dict = self.__get_edgelist() + + if self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + vertices_array = cupy.arange( + sum(self._num_vertices().values()), dtype="int64" + ) + vertices_array = cupy.array_split(vertices_array, world_size)[rank] + + self.__graph = pylibcugraph.MGGraph( + self._resource_handle, + graph_properties, + [cupy.asarray(edgelist_dict["src"]).astype("int64")], + [cupy.asarray(edgelist_dict["dst"]).astype("int64")], + vertices_array=[vertices_array], + edge_id_array=[cupy.asarray(edgelist_dict["eid"])], + edge_type_array=[cupy.asarray(edgelist_dict["etp"])], + ) + else: + self.__graph = pylibcugraph.SGGraph( + self._resource_handle, + graph_properties, + cupy.asarray(edgelist_dict["src"]).astype("int64"), + cupy.asarray(edgelist_dict["dst"]).astype("int64"), + vertices_array=cupy.arange( + sum(self._num_vertices().values()), dtype="int64" + ), + edge_id_array=cupy.asarray(edgelist_dict["eid"]), + edge_type_array=cupy.asarray(edgelist_dict["etp"]), + ) + + return self.__graph \ No newline at end of file From 99432600df962efdf098cfa9a19bdc99926bde6a Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 12 Jun 2024 13:41:59 -0700 Subject: [PATCH 06/47] graph data views --- batch=00000.00000000-00000.00000002.parquet | Bin 0 -> 4768 bytes python/cugraph-dgl/cugraph_dgl/__init__.py | 1 + python/cugraph-dgl/cugraph_dgl/features.py | 2 +- python/cugraph-dgl/cugraph_dgl/graph.py | 300 ++++++++++++++++++-- python/cugraph-dgl/cugraph_dgl/view.py | 101 +++++++ 5 files changed, 378 insertions(+), 26 deletions(-) create mode 100644 batch=00000.00000000-00000.00000002.parquet create mode 100644 python/cugraph-dgl/cugraph_dgl/view.py diff --git a/batch=00000.00000000-00000.00000002.parquet b/batch=00000.00000000-00000.00000002.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a9962589e3be43900ec86c90d6eec24949d444cb GIT binary patch literal 4768 zcmb`LYfO_@7{}kXP+l(xMc&d%(OTsN6)QKZD0sh4L_nDr&{F7IsP+PF0Rb6;c$q_F z&grIG#&n5^x-IS!qh^WIEnBuF+e{V{jf)?)eb}UHQAUsO0>_+r5)7-&rp$`mIVV z>;CYcjhlb}xT-JV9KV0n)uF-c(fYk-t1eb&etY5ImyHvuZ|n(P`TEw6XS#m4k#^2B zI;FC0$%KCEnK>!Wrjq+VEln!vy?_0$4s{$m&7h98QsIbNexVgO@d?la_Jj0dnXumk z=>^xot_98DDzE~a2kr;ULCSzZ*b+Di8~`bk4!{n8d%@ixWtts!*7XMrohO0WZL z1t)_`!DAp*R2l4(AXV5w*bBkU;6`vC*aI#G4}))m)nEy@6Kn&!KsQL0*9)PISK>T`uK$_S~5p>3+pTQSr)`KZb>#R+U9neXEtxB4mI>l zu+P|$Q784~?3^=i{Pe71e$)Kj6~Ts{c8{@?jZK=nBg3#}oIZQyUd@42+w3x3N8+lb zxqWp~Z>M&NyvVHM@^+?c%O zZF{^$U7IQvH!ZL&OM^eQn**@{B{v z`^J9Id^fPH?B`nj^5UNTHM`3*?-XXgH+jQ#ZS(gP$%oU55?yk9neR^$SPT2+lHU)BW%8V<%%WJ!%&0SzsPWkHef+3YAeV zlSa6tyqJfjYe2SioG@8Wpxe?M;$0s_{CSl;0A)j_v+{-S2V*r4ZKYJdQfaV{h~~J=vcckbEuPIw@dh z%ZG<}dG9vK-Ow1~^aC2B$=EM)CLzw48K*VUsRVgQe@)Apcq~rjO*Sr@C!H{mpm91K zXUt)07bmFY%9FuyBfQbI6nBqO^^a1;M`_}B+OWTq2-|b`Q!A$%4vgZlMqeZTkuH9f zF8d41k0Jc?|IQ@`RPpd2UjHCo{M9TDnM}5as%)M_@Hq~}LdLO>J^3bz+gl0xTM6PZ zOR?>QI-93_ebT3iMU1`oaKvP1(T}M(Jgd^5Rf+ee729@evU$qY=WrM^3lyKN`d0D8 zjAGk9=8;nPSsr=izDOR$N4`E^9Ly=Uu_QMRUr2vBhvkpRGcrvznhe?|htKH<8ZPmj z7PrqS?X=hXJuO~e&@#_ly*0;dQA~SyOp53Yo1GS)!z;1r7Nu@1Y>~??d7Sp>eq}Se zTtM;yw=ZNXl;^W@Xljq{_DUg#(-Cs8xxN;Urz^)iycy~;dbbe+FM7N1xudT4SQF&4 zL~mJ;AM=Jv_p@%uN+J4o=#1VbRv=eXe4o9|2;qvgVZ`tY+&W4hqqmPaAo7raCPicU`~{XqRiTsofC> z_}k8 
PF&o#Gz;Wg5|EB%}$$;L! literal 0 HcmV?d00001 diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py index 03ff50896a4..61b4142a871 100644 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/__init__.py @@ -15,6 +15,7 @@ # to prevent rapids context being created when importing cugraph_dgl os.environ["RAPIDS_NO_INITIALIZE"] = "1" +from cugraph_dgl.graph import Graph from cugraph_dgl.cugraph_storage import CuGraphStorage from cugraph_dgl.convert import cugraph_storage_from_heterograph import cugraph_dgl.dataloading diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py index 8037bd90454..80885cf01aa 100644 --- a/python/cugraph-dgl/cugraph_dgl/features.py +++ b/python/cugraph-dgl/cugraph_dgl/features.py @@ -118,4 +118,4 @@ def fetch( if self.__td < 0: t = t.reshape((t.shape[0],)) - return t + return t.to(torch.device(device)) diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 343ba4c8670..c6978af877b 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -11,13 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cupy - -import pylibcugraph - -from typing import Union, Optional, Dict, Tuple +from typing import Union, Optional, Dict, Tuple, List from cugraph.utilities.utils import import_optional +from cugraph.gnn import cugraph_comms_get_raft_handle + +import cupy +import pylibcugraph from cugraph_dgl.typing import TensorType from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor @@ -34,7 +34,6 @@ HOMOGENEOUS_NODE_TYPE = "n" HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE) - class Graph: """ cuGraph-backed duck-typed version of dgl.DGLGraph that distributes @@ -96,7 +95,6 @@ def __init__( self.__num_edges_dict = {} self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) - self.__sizes = {} self.__graph = None self.__vertex_offsets = None self.__handle = None @@ -148,11 +146,11 @@ def add_nodes( """ Adds the given number of nodes to this graph. Can only be called once per node type. The number of nodes specified here refers to the total - number of nodes across workers. If the backing feature store is - distributed (i.e. wholegraph), then only local features should - be passed to the data argument. If the backing feature store is - replicated, then features for all nodes should be passed to the - data argument, including those for nodes not on the local worker. + number of nodes across all workers (the entire graph). If the backing + feature store is distributed (i.e. wholegraph), then only local features + should be passed to the data argument. If the backing feature store is + replicated, then features for all nodes in the graph should be passed to + the data argument, including those for nodes not on the local worker. Parameters ---------- @@ -334,13 +332,192 @@ def number_of_edges(self, etype: Union[str, Tuple[str, str, str]]=None) -> int: """ return self.num_edges(etype=etype) + @property + def ntypes(self) -> List[str]: + """ + Returns the node type names in this graph. + """ + return list(self.__num_nodes_dict.keys()) + + @property + def etypes(self) -> List[str]: + """ + Returns the edge type names in this graph + (the second element of the canonical edge + type tuple). 
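+
+        A short sketch (the type names are illustrative):
+
+        >>> g.canonical_etypes
+        [('paper', 'cites', 'paper')]
+        >>> g.etypes
+        ['cites']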
+ """ + return [ + et[1] + for et in self.__num_edges_dict.keys() + ] + + @property + def canonical_etypes(self) -> List[str]: + """ + Returns the canonical edge type names in this + graph. + """ + return list(self.__num_edges_dict.keys()) + + @property + def _vertex_offsets(self) -> Dict[str, int]: + if self.__vertex_offsets is None: + ordered_keys = sorted(list(self.ntypes)) + self.__vertex_offsets = {} + offset = 0 + for vtype in ordered_keys: + self.__vertex_offsets[vtype] = offset + offset += self.num_nodes(vtype) + + return dict(self.__vertex_offsets) + + def __get_edgelist(self) -> Dict[str, "torch.Tensor"]: + """ + This function always returns src/dst labels with respect + to the out direction. + + Returns + ------- + Dict[str, torch.Tensor] with the following keys: + src: source vertices (int64) + Note that src is the 1st element of the DGL edge index. + dst: destination vertices (int64) + Note that dst is the 2nd element of the DGL edge index. + eid: edge ids for each edge (int64) + Note that these start from 0 for each edge type. + etp: edge types for each edge (int32) + Note that these are in lexicographic order. + """ + sorted_keys = sorted( + list(self.__edge_indices.keys(leaves_only=True, include_nested=True)) + ) + + # note that this still follows the DGL convention of (src, rel, dst) + # i.e. (author, writes, paper): [[0,1,2],[2,0,1]] is referring to a + # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), + # and (paper 1) -> (author 0) + edge_index = torch.concat( + [ + torch.stack( + [ + self.__edge_indices[src_type, rel_type, dst_type][0] + + self._vertex_offsets[src_type], + self.__edge_indices[src_type, rel_type, dst_type][1] + + self._vertex_offsets[dst_type], + ] + ) + for (src_type, rel_type, dst_type) in sorted_keys + ], + axis=1, + ).cuda() + + edge_type_array = torch.arange( + len(sorted_keys), dtype=torch.int32, device="cuda" + ).repeat_interleave( + torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], + device="cuda", + dtype=torch.int32, + ) + ) + + if self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + num_edges_t = torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" + ) + num_edges_all_t = torch.empty( + world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" + ) + torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) + + if rank > 0: + start_offsets = num_edges_all_t[:rank].T.sum(axis=1) + edge_id_array = torch.concat( + [ + torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_all_t[rank][i], + dtype=torch.int64, + device="cuda", + ) + for i in range(len(sorted_keys)) + ] + ) + else: + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) + + else: + # single GPU + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) + + return { + "src": edge_index[0], + "dst": edge_index[1], + "etp": edge_type_array, + "eid": edge_id_array, + } + @property def is_homogeneous(self): return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <=1 - + + @property + def idtype(self): + return torch.int64 + @property - def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + def _resource_handle(self): + if self.__handle is None: + if self.is_multi_gpu: + self.__handle 
= pylibcugraph.ResourceHandle( + cugraph_comms_get_raft_handle().getHandle() + ) + else: + self.__handle = pylibcugraph.ResourceHandle() + return self.__handle + + def _graph(self, direction:str) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + """ + Gets the pylibcugraph Graph object with edges pointing in the given direction + (i.e. 'out' is standard, 'in' is reverse). + """ + + if direction not in ['out','in']: + raise ValueError(f"Invalid direction {direction} (expected 'in' or 'out').") + + graph_properties = pylibcugraph.GraphProperties( + is_multigraph=True, is_symmetric=False + ) + + if self.__graph[1] != direction: + self.__graph = None + if self.__graph is None: + src_col, dst_col = ( + ('src','dst') if direction == 'out' + else ('dst','src') + ) edgelist_dict = self.__get_edgelist() if self.is_multi_gpu: @@ -348,30 +525,103 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: world_size = torch.distributed.get_world_size() vertices_array = cupy.arange( - sum(self._num_vertices().values()), dtype="int64" + self.num_nodes(), dtype="int64" ) vertices_array = cupy.array_split(vertices_array, world_size)[rank] - self.__graph = pylibcugraph.MGGraph( + self.__graph = (pylibcugraph.MGGraph( self._resource_handle, graph_properties, - [cupy.asarray(edgelist_dict["src"]).astype("int64")], - [cupy.asarray(edgelist_dict["dst"]).astype("int64")], + [cupy.asarray(edgelist_dict[src_col]).astype("int64")], + [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], vertices_array=[vertices_array], edge_id_array=[cupy.asarray(edgelist_dict["eid"])], edge_type_array=[cupy.asarray(edgelist_dict["etp"])], - ) + ), direction) else: - self.__graph = pylibcugraph.SGGraph( + self.__graph = (pylibcugraph.SGGraph( self._resource_handle, graph_properties, - cupy.asarray(edgelist_dict["src"]).astype("int64"), - cupy.asarray(edgelist_dict["dst"]).astype("int64"), + cupy.asarray(edgelist_dict[src_col]).astype("int64"), + cupy.asarray(edgelist_dict[dst_col]).astype("int64"), vertices_array=cupy.arange( - sum(self._num_vertices().values()), dtype="int64" + self.num_nodes(), dtype="int64" ), edge_id_array=cupy.asarray(edgelist_dict["eid"]), edge_type_array=cupy.asarray(edgelist_dict["etp"]), - ) + ), direction) + + return self.__graph[0] + + def _get_n_emb(self, ntype:str, emb_name: str, u:Union[str, TensorType]): + """ + Gets the embedding of a single node type. + Unlike DGL, this function takes the string node + type name instead of an integer id. + + Parameters + ---------- + ntype: str + The node type to get the embedding of. + emb_name: str + The embedding name of the embedding to get. + u: Union[str, TensorType] + Nodes to get the representation of, or ALL + to get the representation of all nodes of + the given type. + """ + + if dgl.base.is_all(u): + u = torch.arange(self.num_nodes(ntype), dtype=torch.int64) + + return self.__ndata_storage[ntype, emb_name].fetch( + _cast_to_torch_tensor(u), + 'cuda' + ) + + def _set_n_emb(self, ntype:str, u:Union[str, TensorType], kv: Dict[str, TensorType]): + """ + Stores or updates the embedding(s) of a single node type. + Unlike DGL, this function takes the string node type name + instead of an integer id. + + The semantics of this function match those of add_nodes + with respect to whether or not the backing feature store + is distributed. + + Parameters + ---------- + ntype: str + The node type to store an embedding of. + u: Union[str, TensorType] + The indices to update, if updating the embedding. 
+            Currently, updating a slice of an embedding is
+            unsupported, so this should be ALL.
+        kv: Dict[str, TensorType]
+            A mapping of embedding names to embedding tensors.
+        """
+
+        if not dgl.base.is_all(u):
+            raise NotImplementedError(
+                "Updating a slice of an embedding is "
+                "currently unimplemented in cuGraph-DGL."
+            )
+
+        for k, v in kv.items():
+            self.__ndata_storage[ntype, k] = self.__ndata_storage_type(
+                v, **self.__wg_kwargs
+            )
+
+    def _pop_n_emb(self, ntype: str, key: str):
+        return self.__ndata_storage[ntype, key].pop(key)
+
+    def _get_n_emb_keys(self, ntype: str):
+        return [
+            k
+            for (t, k) in self.__ndata_storage
+            if ntype == t
+        ]
+
+    @property
+    def ndata(self):
+        
\ No newline at end of file
diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py
new file mode 100644
index 00000000000..39ce44bae0c
--- /dev/null
+++ b/python/cugraph-dgl/cugraph_dgl/view.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Union, Dict, List
+
+from cugraph_dgl.typing import TensorType
+
+
+class HeteroNodeDataView:
+    """
+    Duck-typed version of DGL's HeteroNodeDataView.
+    Used for accessing and modifying node features.
+    """
+
+    def __init__(self, graph: "cugraph_dgl.Graph", ntype: Union[str, List[str]], nodes: TensorType):
+        self.__graph = graph
+        self.__ntype = ntype
+        self.__nodes = nodes
+
+    @property
+    def _ntype(self) -> str:
+        return self.__ntype
+
+    @property
+    def _graph(self) -> "cugraph_dgl.Graph":
+        return self.__graph
+
+    @property
+    def _nodes(self) -> TensorType:
+        return self.__nodes
+
+    def __getitem__(self, key: str):
+        if isinstance(self._ntype, list):
+            return {
+                t: self._graph._get_n_emb(t, key, self._nodes)
+                for t in self._ntype
+            }
+        else:
+            return self._graph._get_n_emb(self._ntype, key, self._nodes)
+
+    def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]):
+        if isinstance(self._ntype, list):
+            if not isinstance(val, dict):
+                raise ValueError(
+                    "There are multiple node types in this view. "
+                    "Expected a dictionary of values."
+                )
+            for t, v in val.items():
+                if t not in self._ntype:
+                    raise ValueError("Attempted to modify a type out of view.")
+                self._graph._set_n_emb(t, self._nodes, {key: v})
+        else:
+            if isinstance(val, dict):
+                raise ValueError(
+                    "There is only one node type in this view. "
+                    "Expected a single value tensor."
+                )
+            self._graph._set_n_emb(self._ntype, self._nodes, {key: val})
+
+    def __delitem__(self, key: str):
+        if isinstance(self._ntype, list):
+            for t in self._ntype:
+                self._graph._pop_n_emb(t, key)
+        else:
+            self._graph._pop_n_emb(self._ntype, key)
+
+    def _transpose(self, fetch_vals=True):
+        if isinstance(self._ntype, list):
+            tr = defaultdict(dict)
+            for ntype in self._ntype:
+                for key in self._graph._get_n_emb_keys(ntype):
+                    tr[key][ntype] = self._graph._get_n_emb(ntype, key, self._nodes) if fetch_vals else []
+        else:
+            tr = {}
+            for key in self._graph._get_n_emb_keys(self._ntype):
+                tr[key] = self._graph._get_n_emb(self._ntype, key, self._nodes) if fetch_vals else []
+
+        return tr
+
+    def __len__(self):
+        return len(self._transpose(fetch_vals=False))
+
+    def __iter__(self):
+        return iter(self._transpose())
+
+    def keys(self):
+        return self._transpose(fetch_vals=False).keys()
+
+    def values(self):
+        return self._transpose().values()
+
+    def __repr__(self):
+        return repr(self._transpose(fetch_vals=False))
\ No newline at end of file

From 055db0a782c5427d617d1b8fd688b86adfa3bf0d Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Wed, 12 Jun 2024 13:42:30 -0700
Subject: [PATCH 07/47] remove unwanted file

---
 batch=00000.00000000-00000.00000002.parquet | Bin 4768 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 batch=00000.00000000-00000.00000002.parquet

diff --git a/batch=00000.00000000-00000.00000002.parquet b/batch=00000.00000000-00000.00000002.parquet
deleted file mode 100644
index a9962589e3be43900ec86c90d6eec24949d444cb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4768
zcmb`LYfO_@7{}kXP+l(xMc&d%(OTsN6)QKZD0sh4L_nDr&{F7IsP+PF0Rb6;c$q_F
z&grIG#&n5^x-IS!qh^WIEnBuF+e{V{jf)?)eb}UHQAUsO0>_+r5)7-&rp$`mIVV
z>;CYcjhlb}xT-JV9KV0n)uF-c(fYk-t1eb&etY5ImyHvuZ|n(P`TEw6XS#m4k#^2B
zI;FC0$%KCEnK>!Wrjq+VEln!vy?_0$4s{$m&7h98QsIbNexVgO@d?la_Jj0dnXumk
z=>^xot_98DDzE~a2kr;ULCSzZ*b+Di8~`bk4!{n8d%@ixWtts!*7XMrohO0WZL
z1t)_`!DAp*R2l4(AXV5w*bBkU;6`vC*aI#G4}))m)nEy@6Kn&!KsQL0*9)PISK>T`uK$_S~5p>3+pTQSr)`KZb>#R+U9neXEtxB4mI>l
zu+P|$Q784~?3^=i{Pe71e$)Kj6~Ts{c8{@?jZK=nBg3#}oIZQyUd@42+w3x3N8+lb
zxqWp~Z>M&NyvVHM@^+?c%O
zZF{^$U7IQvH!ZL&OM^eQn**@{B{v
z`^J9Id^fPH?B`nj^5UNTHM`3*?-XXgH+jQ#ZS(gP$%oU55?yk9neR^$SPT2+lHU)BW%8V<%%WJ!%&0SzsPWkHef+3YAeV
zlSa6tyqJfjYe2SioG@8Wpxe?M;$0s_{CSl;0A)j_v+{-S2V*r4ZKYJdQfaV{h~~J=vcckbEuPIw@dh
z%ZG<}dG9vK-Ow1~^aC2B$=EM)CLzw48K*VUsRVgQe@)Apcq~rjO*Sr@C!H{mpm91K
zXUt)07bmFY%9FuyBfQbI6nBqO^^a1;M`_}B+OWTq2-|b`Q!A$%4vgZlMqeZTkuH9f
zF8d41k0Jc?|IQ@`RPpd2UjHCo{M9TDnM}5as%)M_@Hq~}LdLO>J^3bz+gl0xTM6PZ
zOR?>QI-93_ebT3iMU1`oaKvP1(T}M(Jgd^5Rf+ee729@evU$qY=WrM^3lyKN`d0D8
zjAGk9=8;nPSsr=izDOR$N4`E^9Ly=Uu_QMRUr2vBhvkpRGcrvznhe?|htKH<8ZPmj
z7PrqS?X=hXJuO~e&@#_ly*0;dQA~SyOp53Yo1GS)!z;1r7Nu@1Y>~??d7Sp>eq}Se
zTtM;yw=ZNXl;^W@Xljq{_DUg#(-Cs8xxN;Urz^)iycy~;dbbe+FM7N1xudT4SQF&4
zL~mJ;AM=Jv_p@%uN+J4o=#1VbRv=eXe4o9|2;qvgVZ`tY+&W4hqqmPaAo7raCPicU`~{XqRiTsofC>
z_}k8
PF&o#Gz;Wg5|EB%}$$;L!
From 1f76898f6eb060e7a9df7c5526952a12c8ca5124 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 13 Jun 2024 14:30:11 -0700 Subject: [PATCH 08/47] revert devcontainer change --- .devcontainer/cuda11.8-pip/devcontainer.json | 1 - .devcontainer/cuda12.2-pip/devcontainer.json | 1 - python/cugraph-dgl/cugraph_dgl/graph.py | 348 +++++++++++++++---- python/cugraph-dgl/cugraph_dgl/view.py | 231 +++++++++++- 4 files changed, 499 insertions(+), 82 deletions(-) diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 523933c34fb..f044aa8fbbc 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -25,7 +25,6 @@ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 9b90398a29c..4a4bea7bbb0 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -25,7 +25,6 @@ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index c6978af877b..2bfa75e091c 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -22,7 +22,12 @@ from cugraph_dgl.typing import TensorType from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor from cugraph_dgl.features import WholeFeatureStore - +from cugraph_dgl.view import ( + HeteroNodeView, + HeteroNodeDataView, + HeteroEdgeView, + HeteroEdgeDataView, +) # Have to use import_optional even though these are required @@ -34,6 +39,7 @@ HOMOGENEOUS_NODE_TYPE = "n" HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE) + class Graph: """ cuGraph-backed duck-typed version of dgl.DGLGraph that distributes @@ -205,7 +211,7 @@ def add_nodes( self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( feature_tensor, **self.__wg_kwargs ) - + self.__graph = None self.__vertex_offsets = None @@ -291,11 +297,11 @@ def add_edges( num_edges = self.__edge_indices[dgl_can_edge_type].shape[1] if self.is_multi_gpu: - num_edges = torch.tensor([num_edges], device='cuda', dtype=torch.int64) + num_edges = torch.tensor([num_edges], device="cuda", dtype=torch.int64) torch.distributed.all_reduce(num_edges, op=torch.distributed.ReduceOp.SUM) - + self.__num_edges_dict[dgl_can_edge_type] = int(num_edges) - + self.__graph = None self.__vertex_offsets = None @@ -314,19 +320,19 @@ def number_of_nodes(self, ntype: str = None) -> int: Alias for num_nodes. """ return self.num_nodes(ntype=ntype) - - def num_edges(self, etype: Union[str, Tuple[str, str, str]]=None) -> int: + + def num_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: """ Returns the number of edges of etype, or if etype is not provided, the total number of edges in the graph. 
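+
+        A short sketch (the count reflects whatever edges were added
+        earlier; the values here are illustrative):
+
+        >>> g.num_edges(('n', 'e', 'n'))
+        3
+        >>> g.num_edges()
+        3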
""" if etype is None: return sum(self.__num_edges_dict.values()) - + etype = self.to_canonical_etype(etype) return self.__num_edges_dict[etype] - - def number_of_edges(self, etype: Union[str, Tuple[str, str, str]]=None) -> int: + + def number_of_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: """ Alias for num_edges. """ @@ -346,10 +352,7 @@ def etypes(self) -> List[str]: (the second element of the canonical edge type tuple). """ - return [ - et[1] - for et in self.__num_edges_dict.keys() - ] + return [et[1] for et in self.__num_edges_dict.keys()] @property def canonical_etypes(self) -> List[str]: @@ -480,12 +483,12 @@ def __get_edgelist(self) -> Dict[str, "torch.Tensor"]: @property def is_homogeneous(self): - return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <=1 - + return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <= 1 + @property def idtype(self): return torch.int64 - + @property def _resource_handle(self): if self.__handle is None: @@ -497,13 +500,15 @@ def _resource_handle(self): self.__handle = pylibcugraph.ResourceHandle() return self.__handle - def _graph(self, direction:str) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + def _graph( + self, direction: str + ) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: """ Gets the pylibcugraph Graph object with edges pointing in the given direction (i.e. 'out' is standard, 'in' is reverse). """ - if direction not in ['out','in']: + if direction not in ["out", "in"]: raise ValueError(f"Invalid direction {direction} (expected 'in' or 'out').") graph_properties = pylibcugraph.GraphProperties( @@ -514,46 +519,47 @@ def _graph(self, direction:str) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGr self.__graph = None if self.__graph is None: - src_col, dst_col = ( - ('src','dst') if direction == 'out' - else ('dst','src') - ) + src_col, dst_col = ("src", "dst") if direction == "out" else ("dst", "src") edgelist_dict = self.__get_edgelist() if self.is_multi_gpu: rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - vertices_array = cupy.arange( - self.num_nodes(), dtype="int64" - ) + vertices_array = cupy.arange(self.num_nodes(), dtype="int64") vertices_array = cupy.array_split(vertices_array, world_size)[rank] - self.__graph = (pylibcugraph.MGGraph( - self._resource_handle, - graph_properties, - [cupy.asarray(edgelist_dict[src_col]).astype("int64")], - [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], - vertices_array=[vertices_array], - edge_id_array=[cupy.asarray(edgelist_dict["eid"])], - edge_type_array=[cupy.asarray(edgelist_dict["etp"])], - ), direction) + self.__graph = ( + pylibcugraph.MGGraph( + self._resource_handle, + graph_properties, + [cupy.asarray(edgelist_dict[src_col]).astype("int64")], + [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], + vertices_array=[vertices_array], + edge_id_array=[cupy.asarray(edgelist_dict["eid"])], + edge_type_array=[cupy.asarray(edgelist_dict["etp"])], + ), + direction, + ) else: - self.__graph = (pylibcugraph.SGGraph( - self._resource_handle, - graph_properties, - cupy.asarray(edgelist_dict[src_col]).astype("int64"), - cupy.asarray(edgelist_dict[dst_col]).astype("int64"), - vertices_array=cupy.arange( - self.num_nodes(), dtype="int64" + self.__graph = ( + pylibcugraph.SGGraph( + self._resource_handle, + graph_properties, + cupy.asarray(edgelist_dict[src_col]).astype("int64"), + cupy.asarray(edgelist_dict[dst_col]).astype("int64"), + vertices_array=cupy.arange(self.num_nodes(), 
dtype="int64"), + edge_id_array=cupy.asarray(edgelist_dict["eid"]), + edge_type_array=cupy.asarray(edgelist_dict["etp"]), ), - edge_id_array=cupy.asarray(edgelist_dict["eid"]), - edge_type_array=cupy.asarray(edgelist_dict["etp"]), - ), direction) + direction, + ) return self.__graph[0] - def _get_n_emb(self, ntype:str, emb_name: str, u:Union[str, TensorType]): + def _get_n_emb( + self, ntype: str, emb_name: str, u: Union[str, TensorType] + ) -> "torch.Tensor": """ Gets the embedding of a single node type. Unlike DGL, this function takes the string node @@ -569,17 +575,55 @@ def _get_n_emb(self, ntype:str, emb_name: str, u:Union[str, TensorType]): Nodes to get the representation of, or ALL to get the representation of all nodes of the given type. + + Returns + ------- + torch.Tensor + The embedding of the given edge type with the given embedding name. """ if dgl.base.is_all(u): - u = torch.arange(self.num_nodes(ntype), dtype=torch.int64) - + u = torch.arange(self.num_nodes(ntype), dtype=self.idtype) + return self.__ndata_storage[ntype, emb_name].fetch( - _cast_to_torch_tensor(u), - 'cuda' + _cast_to_torch_tensor(u), "cuda" + ) + + def _get_e_emb( + self, etype: Tuple[str, str, str], emb_name: str, u: Union[str, TensorType] + ) -> "torch.Tensor": + """ + Gets the embedding of a single edge type. + Unlike DGL, this function takes the canonical edge type + instead of an integer id. + + Parameters + ---------- + etype: str + The edge type to get the embedding of. + emb_name: str + The embedding name of the embedding to get. + u: Union[str, TensorType] + Edges to get the representation of, or ALL to + get the representation of all nodes of the + given type. + + Returns + ------- + torch.Tensor + The embedding of the given edge type with the given embedding name. + """ + + if dgl.base.is_all(u): + u = torch.arange(self.num_edges(etype), dtype=self.idtype) + + return self.__edata_storage[etype, emb_name].fetch( + _cast_to_torch_tensor(u), "cuda" ) - def _set_n_emb(self, ntype:str, u:Union[str, TensorType], kv: Dict[str, TensorType]): + def _set_n_emb( + self, ntype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] + ) -> None: """ Stores or updates the embedding(s) of a single node type. Unlike DGL, this function takes the string node type name @@ -606,22 +650,200 @@ def _set_n_emb(self, ntype:str, u:Union[str, TensorType], kv: Dict[str, TensorTy "Updating a slice of an embedding is " "currently unimplemented in cuGraph-DGL." ) - + for k, v in kv: self.__ndata_storage[ntype, k] = self.__ndata_storage_type( - v, **self.__wg_kwargs + v, + **self.__wg_kwargs, ) - - def _pop_n_emb(self, ntype:str, key: str): + + def _set_e_emb( + self, etype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] + ) -> None: + """ + Stores or updates the embedding(s) of a single edge type. + Unlike DGL, this function takes the canonical edge type name + instead of an integer id. + + The semantics of this function match those of add_edges + with respect to whether or not the backing feature store + is distributed. + + Parameters + ---------- + etype: str + The edge type to store an embedding of. + u: Union[str, TensorType] + The indices to update, if updating the embedding. + Currently, updating a slice of an embedding is + unsupported, so this should be ALL. + kv: Dict[str, TensorType] + A mapping of embedding names to embedding tensors. + """ + + if not dgl.base.is_all(u): + raise NotImplementedError( + "Updating a slice of an embedding is " + "currently unimplemented in cuGraph-DGL." 
+ ) + + for k, v in kv: + self.__edata_storage[etype, k] = self.__edata_storage_type( + v, + **self.__wg_kwargs, + ) + + def _pop_n_emb(self, ntype: str, key: str) -> "torch.Tensor": + """ + Removes and returns the embedding of the given node + type with the given name. + + Parameters + ---------- + ntype:str + The node type. + key:str + The embedding name. + + Returns + ------- + The removed embedding. + """ return self.__ndata_storage[ntype, key].pop(key) - def _get_n_emb_keys(self, ntype:str): - return [ - k - for (t, k) in self.__ndata_storage - if ntype == t - ] + def _pop_e_emb(self, etype: str, key: str) -> "torch.Tensor": + """ + Removes and returns the embedding of the given edge + type with the given name. + + Parameters + ---------- + etype:str + The node type. + key:str + The embedding name. + + Returns + ------- + torch.Tensor + The removed embedding. + """ + return self.__edata_storage[etype, key].pop(key) + + def _get_n_emb_keys(self, ntype: str) -> List[str]: + """ + Gets a list of the embedding names for a given node + type. + + Parameters + ---------- + ntype: str + The node type to get embedding names for. + + Returns + ------- + List[str] + The list of embedding names for the given node type. + """ + return [k for (t, k) in self.__ndata_storage if ntype == t] + + def _get_e_emb_keys(self, etype: str) -> List[str]: + """ + Gets a list of the embedding names for a given edge + type. + + Parameters + ---------- + etype: str + The edge type to get embedding names for. + + Returns + ------- + List[str] + The list of embedding names for the given edge type. + """ + return [k for (t, k) in self.__ndata_storage if etype == t] + + def all_edges( + self, + form="uv", + order="eid", + etype: Union[str, Tuple[str, str, str]] = None, + device: Union[str, int, "torch.device"] = "cpu", + ): + """ + Returns all edges with the specified edge type. + cuGraph-DGL currently only supports 'eid' format and + 'eid' order. + + Parameters + ---------- + form: str (optional, default='uv') + The format to return ('uv', 'eid', 'all'). + cuGraph-DGL currently only supports 'eid'. + order: str (optional, default='eid') + The order to return edges in ('eid', 'srcdst') + cuGraph-DGL currently only supports 'eid'. + etype: Union[str, Tuple[str, str, str]] (optional, default=None) + The edge type to get. Not required if this is + a homogeneous graph. Can be the relation type if the + relation type is unique, or the canonical edge type. + device: Union[str, int, torch.device] (optional, default='cpu') + The device where returned edges should be stored + ('cpu', 'cuda', or device id). + """ + + if form != "eid": + raise NotImplementedError("cuGraph-DGL only supports eid format.") + + if order != "eid": + raise NotImplementedError("cugraph-DGL only supports eid order.") + + if etype is None and len(self.canonical_etypes) > 1: + raise ValueError("Edge type is required for heterogeneous graphs.") + + etype = self.to_canonical_etype(etype) + return torch.arange( + 0, + self.__num_edges_dict[etype], + dtype=self.idtype, + device=device, + ) + + @property + def ndata(self) -> HeteroNodeDataView: + """ + Returns a view of the node data in this graph which can be used to + access or modify node features. 
+ """ + + if len(self.ntypes) == 1: + ntype = self.ntypes[0] + return HeteroNodeDataView(self, ntype, dgl.base.ALL) + + return HeteroNodeDataView(self, self.ntypes, dgl.base.ALL) + + @property + def edata(self) -> HeteroEdgeDataView: + """ + Returns a view of the edge data in this graph which can be used to + access or modify edge features. + """ + if len(self.canonical_etypes) == 1: + return HeteroEdgeDataView(self, None, dgl.base.ALL) + + return HeteroEdgeDataView(self, self.canonical_etypes, dgl.base.ALL) + + @property + def nodes(self) -> HeteroNodeView: + """ + Returns a view of the nodes in this graph. + """ + return HeteroNodeView(self) @property - def ndata(self): - \ No newline at end of file + def edges(self) -> HeteroEdgeView: + """ + Returns a view of the edges in this graph. + """ + return HeteroEdgeView(self) diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index 39ce44bae0c..49d8d2f69b9 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -12,17 +12,127 @@ # limitations under the License. from collections import defaultdict -from typing import Union, Dict, List +from collections.abc import MutableMapping +from typing import Union, Dict, List, Tuple +from cugraph.utilities.utils import import_optional + +import cugraph_dgl from cugraph_dgl.typing import TensorType -class HeteroNodeDataView: +torch = import_optional("torch") +dgl = import_optional("dgl") + + +class HeteroEdgeDataView(MutableMapping): + """ + Duck-typed version of DGL's HeteroEdgeDataView. + Used for accessing and modifying edge features. + """ + + def __init__( + self, + graph: "cugraph_dgl.Graph", + etype: Union[Tuple[str, str, str], List[Tuple[str, str, str]]], + edges: TensorType, + ): + self.__graph = graph + self.__etype = etype + self.__edges = edges + + @property + def _etype(self) -> Tuple[str, str, str]: + return self.__etype + + @property + def _graph(self) -> "cugraph_dgl.Graph": + return self.__graph + + @property + def _edges(self) -> TensorType: + return self.__edges + + def __getitem__(self, key: str): + if isinstance(self._etype, list): + return {t: self._graph._get_e_emb(t, key, self._nodes) for t in self._etype} + + return self._graph._get_e_emb(self._etype, key, self._nodes) + + def __setitem__(self, key: str, val: Union[TensorType, Dict[TensorType]]): + if isinstance(self._etype, list): + if not isinstance(val, dict): + raise ValueError( + "There are multiple edge types in this view. " + "Expected a dictionary of values." + ) + for t, v in val.items(): + if t not in self._etype: + raise ValueError("Attempted to modify a type out of view.") + self._graph.set_e_emb(t, self._edges, {key: v}) + else: + if isinstance(val, dict): + raise ValueError( + "There is only one edge type in this view. " + "Expected a single tensor." 
+ ) + self._graph.set_e_emb(self._etype, self._edges, {key: v}) + + def __delitem__(self, key: str): + if isinstance(self._etype, list): + for t in self._etype: + self._graph.pop_e_emb(t, key) + else: + self._graph.pop_e_emb(self._etype, key) + + def _transpose(self, fetch_vals=True): + if isinstance(self._etype, list): + tr = defaultdict(dict) + for etype in self._etype: + for key in self._graph._get_e_emb_keys(etype): + tr[key][etype] = ( + self._graph._get_e_emb(etype, key, self._edges) + if fetch_vals + else [] + ) + else: + tr = {} + for key in self._graph._get_e_emb_keys(self._etype): + tr[key] = ( + self._graph._get_e_emb(self._etype, key, self._edges) + if fetch_vals + else [] + ) + + return tr + + def __len__(self): + return len(self._transpose(fetch_vals=False)) + + def __iter__(self): + return iter(self._transpose()) + + def keys(self): + return self._transpose(fetch_vals=False).keys() + + def values(self): + return self._transpose().values() + + def __repr__(self): + return repr(self.__transpose(fetch_vals=False)) + + +class HeteroNodeDataView(MutableMapping): """ Duck-typed version of DGL's HeteroNodeDataView. Used for accessing and modifying node features. """ - def __init__(self, graph: "cugraph_dgl.Graph", ntype: Union[str, List[str]], nodes: TensorType): + def __init__( + self, + graph: "cugraph_dgl.Graph", + ntype: Union[str, List[str]], + nodes: TensorType, + ): self.__graph = graph self.__ntype = ntype self.__nodes = nodes @@ -30,24 +140,21 @@ def __init__(self, graph: "cugraph_dgl.Graph", ntype: Union[str, List[str]], nod @property def _ntype(self) -> str: return self.__ntype - + @property def _graph(self) -> "cugraph_dgl.Graph": return self.__graph - + @property def _nodes(self) -> TensorType: return self.__nodes def __getitem__(self, key: str): if isinstance(self._ntype, list): - return { - t: self._graph._get_n_emb(t, key, self._nodes) - for t in self._ntype - } + return {t: self._graph._get_n_emb(t, key, self._nodes) for t in self._ntype} else: return self._graph._get_n_emb(self._ntype, key, self._nodes) - + def __setitem__(self, key: str, val: Union[TensorType, Dict[TensorType]]): if isinstance(self._ntype, list): if not isinstance(val, dict): @@ -58,7 +165,7 @@ def __setitem__(self, key: str, val: Union[TensorType, Dict[TensorType]]): for t, v in val.items(): if t not in self._ntype: raise ValueError("Attempted to modify a type out of view.") - self._graph._set_n_emb(self._ntype, self._nodes, {key: v}) + self._graph._set_n_emb(t, self._nodes, {key: v}) else: if isinstance(val, dict): raise ValueError( @@ -71,18 +178,28 @@ def __delitem__(self, key: str): if isinstance(self._ntype, list): for t in self._ntype: self._graph._pop_n_emb(t, key) + else: + self._graph.pop_n_emb(self._ntype, key) def _transpose(self, fetch_vals=True): if isinstance(self._ntype, list): tr = defaultdict(dict) for ntype in self._ntype: for key in self._graph._get_n_emb_keys(ntype): - tr[key][ntype] = self._graph._get_n_emb(ntype, key, self._nodes) if fetch_vals else [] + tr[key][ntype] = ( + self._graph._get_n_emb(ntype, key, self._nodes) + if fetch_vals + else [] + ) else: tr = {} for key in self._graph._get_n_emb_keys(self._ntype): - tr[key] = self._graph._get_n_emb(ntype, key, self._nodes) if fetch_vals else [] - + tr[key] = ( + self._graph._get_n_emb(self._ntype, key, self._nodes) + if fetch_vals + else [] + ) + return tr def __len__(self): @@ -90,12 +207,92 @@ def __len__(self): def __iter__(self): return iter(self._transpose()) - + def keys(self): return 
self._transpose(fetch_vals=False).keys()

     def values(self):
         return self._transpose().values()

     def __repr__(self):
-        return repr(self.__transpose(fetch_vals=False))
\ No newline at end of file
+        return repr(self.__transpose(fetch_vals=False))
+
+
+class HeteroEdgeView:
+    """
+    Duck-typed version of DGL's HeteroEdgeView.
+    """
+
+    def __init__(self, graph):
+        self.__graph = graph
+
+    @property
+    def _graph(self) -> "cugraph_dgl.Graph":
+        return self.__graph
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            if not (key.start is None and key.stop is None and key.step is None):
+                raise ValueError("Only full slices are supported in DGL.")
+            edges = dgl.base.ALL
+            etype = None
+        elif key is None:
+            edges = dgl.base.ALL
+            etype = None
+        elif isinstance(key, tuple):
+            if len(key) == 3:
+                edges = dgl.base.ALL
+                etype = key
+            else:
+                edges = key
+                etype = None
+        elif isinstance(key, str):
+            edges = dgl.base.ALL
+            etype = key
+        else:
+            edges = key
+            etype = None
+
+        return HeteroEdgeDataView(
+            graph=self.__graph,
+            etype=etype,
+            edges=edges,
+        )
+
+    def __call__(self, *args, **kwargs):
+        return self.__graph.all_edges(*args, **kwargs)
+
+
+class HeteroNodeView:
+    """
+    Duck-typed version of DGL's HeteroNodeView.
+    """
+
+    def __init__(self, graph: "cugraph_dgl.Graph"):
+        self.__graph = graph
+
+    @property
+    def _graph(self) -> "cugraph_dgl.Graph":
+        return self.__graph
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            if not (key.start is None and key.stop is None and key.step is None):
+                raise ValueError("Only full slices are supported in DGL.")
+            nodes = dgl.base.ALL
+            ntype = None
+        elif isinstance(key, tuple):
+            nodes, ntype = key
+        elif key is None or isinstance(key, str):
+            nodes = dgl.base.ALL
+            ntype = key
+        else:
+            nodes = key
+            ntype = None
+
+        return HeteroNodeDataView(graph=self.__graph, ntype=ntype, nodes=nodes)
+
+    def __call__(self, ntype=None):
+        return torch.arange(
+            0, self.__graph.num_nodes(ntype), dtype=self.__graph.idtype, device="cuda"
+        )

From 927ee0908680aad90c887d1f69f5410e7c9a31ae Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Fri, 14 Jun 2024 12:43:43 -0700
Subject: [PATCH 09/47] tests, bugfixes, resolve indexing problem (sort of)

---
 python/cugraph-dgl/cugraph_dgl/graph.py       | 26 ++++--
 .../{ => cugraph_dgl}/tests/__init__.py       |  0
 .../{ => cugraph_dgl}/tests/conftest.py       |  0
 .../tests/mg/test_dataloader.py               |  0
 .../tests/nn/test_gatconv.py                  |  0
 .../tests/nn/test_gatv2conv.py                |  0
 .../tests/nn/test_relgraphconv.py             |  0
 .../tests/nn/test_sageconv.py                 |  0
 .../tests/nn/test_sparsegraph.py              |  0
 .../tests/nn/test_transformerconv.py          |  0
 .../tests/test_cugraph_storage.py             |  0
 .../tests/test_dataloader.py                  |  0
 .../{ => cugraph_dgl}/tests/test_dataset.py   |  0
 .../tests/test_from_dgl_heterograph.py        |  0
 .../cugraph_dgl/tests/test_graph.py           | 91 +++++++++++++++++++
 .../{ => cugraph_dgl}/tests/test_utils.py     |  0
 .../{ => cugraph_dgl}/tests/utils.py          |  0
 .../utils/cugraph_conversion_utils.py         | 12 +--
 python/cugraph-dgl/cugraph_dgl/view.py        | 12 +--
 19 files changed, 121 insertions(+), 20 deletions(-)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/__init__.py (100%)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/conftest.py (100%)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/mg/test_dataloader.py (100%)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_gatconv.py (100%)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_gatv2conv.py (100%)
 rename python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_relgraphconv.py (100%)
 rename
python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_sageconv.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_sparsegraph.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/nn/test_transformerconv.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/test_cugraph_storage.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/test_dataloader.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/test_dataset.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/test_from_dgl_heterograph.py (100%) create mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_graph.py rename python/cugraph-dgl/{ => cugraph_dgl}/tests/test_utils.py (100%) rename python/cugraph-dgl/{ => cugraph_dgl}/tests/utils.py (100%) diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 2bfa75e091c..142bf483cc3 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -128,11 +128,11 @@ def to_canonical_etype( self, etype: Union[str, Tuple[str, str, str]] ) -> Tuple[str, str, str]: if etype is None: - if len(self.__edge_indices.keys(leaves_only=True, include_nested=True)) > 1: + if len(self.canonical_etypes) > 1: raise ValueError("Edge type is required for heterogeneous graphs.") return HOMOGENEOUS_EDGE_TYPE - if isinstance(etype, Tuple[str, str, str]): + if isinstance(etype, tuple) and len(etype) == 3: return etype for src_type, rel_type, dst_type in self.__edge_indices.keys( @@ -209,7 +209,7 @@ def add_nodes( for feature_name, feature_tensor in data.items(): self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( - feature_tensor, **self.__wg_kwargs + _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs ) self.__graph = None @@ -228,7 +228,7 @@ def __check_node_ids(self, ntype: str, ids: TensorType): The tensor of ids being validated. """ if ntype in self.__num_nodes_dict: - if ids.max() + 1 > self.__num_nodes(ntype): + if ids.max() + 1 > self.num_nodes(ntype): raise ValueError( f"input tensor contains invalid node ids for type {ntype}" ) @@ -293,7 +293,9 @@ def add_edges( for attr_name, attr_tensor in data.items(): self.__edata_storage[ dgl_can_edge_type, attr_name - ] = self.__edata_storage_type(attr_tensor, **self.__wg_kwargs) + ] = self.__edata_storage_type( + _cast_to_torch_tensor(attr_tensor), **self.__wg_kwargs + ) num_edges = self.__edge_indices[dgl_can_edge_type].shape[1] if self.is_multi_gpu: @@ -515,7 +517,7 @@ def _graph( is_multigraph=True, is_symmetric=False ) - if self.__graph[1] != direction: + if self.__graph is not None and self.__graph[1] != direction: self.__graph = None if self.__graph is None: @@ -582,8 +584,14 @@ def _get_n_emb( The embedding of the given edge type with the given embedding name. """ + if ntype is None: + if len(self.ntypes) == 1: + ntype = HOMOGENEOUS_NODE_TYPE + else: + raise ValueError("Must provide the node type for a heterogeneous graph") + if dgl.base.is_all(u): - u = torch.arange(self.num_nodes(ntype), dtype=self.idtype) + u = torch.arange(self.num_nodes(ntype), dtype=self.idtype, device="cpu") return self.__ndata_storage[ntype, emb_name].fetch( _cast_to_torch_tensor(u), "cuda" @@ -614,8 +622,10 @@ def _get_e_emb( The embedding of the given edge type with the given embedding name. 
""" + etype = self.to_canonical_etype(etype) + if dgl.base.is_all(u): - u = torch.arange(self.num_edges(etype), dtype=self.idtype) + u = torch.arange(self.num_edges(etype), dtype=self.idtype, device="cpu") return self.__edata_storage[etype, emb_name].fetch( _cast_to_torch_tensor(u), "cuda" diff --git a/python/cugraph-dgl/tests/__init__.py b/python/cugraph-dgl/cugraph_dgl/tests/__init__.py similarity index 100% rename from python/cugraph-dgl/tests/__init__.py rename to python/cugraph-dgl/cugraph_dgl/tests/__init__.py diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py similarity index 100% rename from python/cugraph-dgl/tests/conftest.py rename to python/cugraph-dgl/cugraph_dgl/tests/conftest.py diff --git a/python/cugraph-dgl/tests/mg/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/mg/test_dataloader.py similarity index 100% rename from python/cugraph-dgl/tests/mg/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/mg/test_dataloader.py diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_gatconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_gatv2conv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_relgraphconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_sageconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_sparsegraph.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_transformerconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py diff --git a/python/cugraph-dgl/tests/test_cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py similarity index 100% rename from python/cugraph-dgl/tests/test_cugraph_storage.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py diff --git a/python/cugraph-dgl/tests/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/test_dataloader.py similarity index 100% rename from python/cugraph-dgl/tests/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_dataloader.py diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/cugraph_dgl/tests/test_dataset.py similarity index 100% rename from python/cugraph-dgl/tests/test_dataset.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_dataset.py diff --git a/python/cugraph-dgl/tests/test_from_dgl_heterograph.py 
b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py similarity index 100% rename from python/cugraph-dgl/tests/test_from_dgl_heterograph.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py new file mode 100644 index 00000000000..966d51d1d66 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import cugraph_dgl +import pylibcugraph +import cupy +import numpy as np + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional("torch") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_homogeneous_graph(direction): + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + wgt = np.random.random((len(df),)) + + graph = cugraph_dgl.Graph() + num_nodes = max(df.src.max(), df.dst.max()) + 1 + node_x = np.random.random((num_nodes,)) + + graph.add_nodes( + num_nodes, data={"num": torch.arange(num_nodes, dtype=torch.int64), "x": node_x} + ) + graph.add_edges(df.src, df.dst, {"weight": wgt}) + plc_dgl_graph = graph._graph(direction=direction) + + assert graph.num_nodes() == num_nodes + assert graph.num_edges() == len(df) + assert graph.is_homogeneous + assert not graph.is_multi_gpu + + assert ( + graph.nodes() == torch.arange(num_nodes, dtype=torch.int64, device="cuda") + ).all() + assert (graph.nodes[None]["x"] == torch.as_tensor(node_x, device="cuda")).all() + assert ( + graph.nodes[None]["num"] + == torch.arange(num_nodes, dtype=torch.int64, device="cuda") + ).all() + + assert ( + graph.edges("eid", device="cuda") + == torch.arange(len(df), dtype=torch.int64, device="cuda") + ).all() + assert (graph.edges[None]["weight"] == torch.as_tensor(wgt, device="cuda")).all() + + plc_expected_graph = pylibcugraph.SGGraph( + pylibcugraph.ResourceHandle(), + pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), + df.src if direction == "out" else df.dst, + df.dst if direction == "out" else df.src, + vertices_array=cupy.arange(num_nodes, dtype="int64"), + ) + + # Do the expensive check to make sure this test fails if an invalid + # graph is constructed. 
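+    # pylibcugraph.degrees returns a (vertices, in_degrees, out_degrees)
+    # triple. Comparing the triple computed on the graph built through
+    # cugraph_dgl against the one computed on the directly-constructed
+    # SGGraph above verifies that both hold exactly the same edges.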
+ v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( + pylibcugraph.ResourceHandle(), + plc_dgl_graph, + source_vertices=cupy.arange(num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( + pylibcugraph.ResourceHandle(), + plc_expected_graph, + source_vertices=cupy.arange(num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + assert (v_actual == v_exp).all() + assert (d_in_actual == d_in_exp).all() + assert (d_out_actual == d_out_exp).all() diff --git a/python/cugraph-dgl/tests/test_utils.py b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py similarity index 100% rename from python/cugraph-dgl/tests/test_utils.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_utils.py diff --git a/python/cugraph-dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py similarity index 100% rename from python/cugraph-dgl/tests/utils.py rename to python/cugraph-dgl/cugraph_dgl/tests/utils.py diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py index 7ae1cba0263..2ba04bd916f 100644 --- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py +++ b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,7 +13,7 @@ # Utils to convert b/w dgl heterograph to cugraph GraphStore from __future__ import annotations -from typing import Dict, Tuple, Union, List +from typing import Dict, Tuple, Union from cugraph_dgl.typing import TensorType @@ -124,7 +124,7 @@ def _cast_to_torch_tensor(t: TensorType) -> "torch.Tensor": if isinstance(t, torch.Tensor): return t elif isinstance(t, (cp.ndarray, cudf.Series)): - return torch.as_tensor(t, device='cuda') - elif isinstance(t, pd.Series, np.ndarray): - return torch.as_tensor(t, device='cpu') - return torch.as_tensor(t) \ No newline at end of file + return torch.as_tensor(t, device="cuda") + elif isinstance(t, (pd.Series, np.ndarray)): + return torch.as_tensor(t, device="cpu") + return torch.as_tensor(t) diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index 49d8d2f69b9..2bd4c1f2540 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -54,11 +54,11 @@ def _edges(self) -> TensorType: def __getitem__(self, key: str): if isinstance(self._etype, list): - return {t: self._graph._get_e_emb(t, key, self._nodes) for t in self._etype} + return {t: self._graph._get_e_emb(t, key, self._edges) for t in self._etype} - return self._graph._get_e_emb(self._etype, key, self._nodes) + return self._graph._get_e_emb(self._etype, key, self._edges) - def __setitem__(self, key: str, val: Union[TensorType, Dict[TensorType]]): + def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): if isinstance(self._etype, list): if not isinstance(val, dict): raise ValueError( @@ -118,7 +118,7 @@ def values(self): return self._transpose().values() def __repr__(self): - return repr(self.__transpose(fetch_vals=False)) + return repr(self._transpose(fetch_vals=False)) class HeteroNodeDataView(MutableMapping): @@ -155,7 +155,7 @@ def __getitem__(self, key: str): else: return self._graph._get_n_emb(self._ntype, key, 
self._nodes)

-    def __setitem__(self, key: str, val: Union[TensorType, Dict[TensorType]]):
+    def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]):
         if isinstance(self._ntype, list):
             if not isinstance(val, dict):
                 raise ValueError(
@@ -215,7 +215,7 @@ def values(self):
         return self._transpose().values()

     def __repr__(self):
-        return repr(self.__transpose(fetch_vals=False))
+        return repr(self._transpose(fetch_vals=False))


 class HeteroEdgeView:

From 68129d999b063ccc12ee7d5374e8d31dd608cf51 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Tue, 25 Jun 2024 13:26:01 -0700
Subject: [PATCH 10/47] add heterogeneous tests

---
 python/cugraph-dgl/cugraph_dgl/graph.py       |  46 ++++++--
 ...st_dataloader.py => test_dataloader_mg.py} |   0
 .../cugraph_dgl/tests/test_graph.py           | 102 ++++++++++++++++++
 .../cugraph_dgl/tests/test_graph_mg.py        |   0
 python/cugraph-dgl/cugraph_dgl/view.py        |   5 +-
 5 files changed, 141 insertions(+), 12 deletions(-)
 rename python/cugraph-dgl/cugraph_dgl/tests/{mg/test_dataloader.py => test_dataloader_mg.py} (100%)
 create mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py

diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py
index 142bf483cc3..02abc9bffe0 100644
--- a/python/cugraph-dgl/cugraph_dgl/graph.py
+++ b/python/cugraph-dgl/cugraph_dgl/graph.py
@@ -287,7 +287,7 @@ def add_edges(
                 _cast_to_torch_tensor(u),
                 _cast_to_torch_tensor(v),
             ]
-        )
+        ).to(self.idtype)

         if data is not None:
             for attr_name, attr_tensor in data.items():
@@ -790,7 +790,7 @@ def all_edges(
         ----------
         form: str (optional, default='uv')
             The format to return ('uv', 'eid', 'all').
-            cuGraph-DGL currently only supports 'eid'.
+
         order: str (optional, default='eid')
             The order to return edges in ('eid', 'srcdst')
             cuGraph-DGL currently only supports 'eid'.
         etype: Union[str, Tuple[str, str, str]] (optional, default=None)
             The edge type to get. Not required if this is
             a homogeneous graph. Can be the relation type if the
             relation type is unique, or the canonical edge type.
         device: Union[str, int, torch.device] (optional, default='cpu')
             The device where returned edges should be stored
             ('cpu', 'cuda', or device id).
""" - if form != "eid": - raise NotImplementedError("cuGraph-DGL only supports eid format.") - if order != "eid": raise NotImplementedError("cugraph-DGL only supports eid order.") @@ -813,12 +810,39 @@ def all_edges( raise ValueError("Edge type is required for heterogeneous graphs.") etype = self.to_canonical_etype(etype) - return torch.arange( - 0, - self.__num_edges_dict[etype], - dtype=self.idtype, - device=device, - ) + + if form == 'eid': + return torch.arange( + 0, + self.__num_edges_dict[etype], + dtype=self.idtype, + device=device, + ) + else: + if self.is_multi_gpu: + src = torch.empty((self.__num_edges_dict[etype], ), dtype=self.idtype, device='cuda') + dst = torch.empty((self.__num_edges_dict[etype], ), dtype=self.idtype, device='cuda') + + h1 = torch.distributed.all_gather_into_tensor(src, self.__edge_indices[etype][0].cuda(), async_op=True) + h2 = torch.distributed.all_gather_into_tensor(dst, self.__edge_indices[etype][1].cuda(), async_op=True) + + h1.wait() + h2.wait() + if form == 'uv': + return src.to(device), dst.to(device) + elif form == 'all': + return src.to(device), dst.to(device), torch.arange(self.__num_edges_dict[etype], dtype=self.idtype,device=device) + else: + raise ValueError(f"Invalid form {form}") + + else: + eix = self.__edge_indices[etype].to(device) + if form == 'uv': + return eix[0], eix[1] + elif form == 'all': + return eix[0], eix[1], torch.arange(self.__num_edges_dict[etype], dtype=self.idtype,device=device) + else: + raise ValueError(f"Invalid form {form}") @property def ndata(self) -> HeteroNodeDataView: diff --git a/python/cugraph-dgl/cugraph_dgl/tests/mg/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/test_dataloader_mg.py similarity index 100% rename from python/cugraph-dgl/cugraph_dgl/tests/mg/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_dataloader_mg.py diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py index 966d51d1d66..89a74ff073c 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py @@ -89,3 +89,105 @@ def test_graph_make_homogeneous_graph(direction): assert (v_actual == v_exp).all() assert (d_in_actual == d_in_exp).all() assert (d_out_actual == d_out_exp).all() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_heterogeneous_graph(direction): + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + wgt = np.random.random((len(df),)) + + graph = cugraph_dgl.Graph() + total_num_nodes = max(df.src.max(), df.dst.max()) + 1 + + num_nodes_group_1 = total_num_nodes // 2 + num_nodes_group_2 = total_num_nodes - num_nodes_group_1 + + node_x_1 = np.random.random((num_nodes_group_1,)) + node_x_2 = np.random.random((num_nodes_group_2,)) + + graph.add_nodes(num_nodes_group_1, {'x':node_x_1}, 'type1') + graph.add_nodes(num_nodes_group_2, {'x':node_x_2}, 'type2') + + edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + + edges_12.dst -= num_nodes_group_1 + edges_21.src -= num_nodes_group_1 + edges_22.dst -= num_nodes_group_1 + edges_22.src -= num_nodes_group_1 + + 
graph.add_edges(edges_11.src, edges_11.dst, etype=('type1', 'e1', 'type1')) + graph.add_edges(edges_12.src, edges_12.dst, etype=('type1', 'e2', 'type2')) + graph.add_edges(edges_21.src, edges_21.dst, etype=('type2', 'e3', 'type1')) + graph.add_edges(edges_22.src, edges_22.dst, etype=('type2', 'e4', 'type2')) + + assert not graph.is_homogeneous + assert not graph.is_multi_gpu + + # Verify graph.nodes() + assert ( + graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes('type1') == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes('type2') == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") + ).all() + + # Verify graph.edges() + assert((graph.edges('eid',etype=('type1','e1','type1')) == torch.arange(len(edges_11), dtype=torch.int64, device='cuda')).all()) + assert((graph.edges('eid',etype=('type1','e2','type2')) == torch.arange(len(edges_12), dtype=torch.int64, device='cuda')).all()) + assert((graph.edges('eid',etype=('type2','e3','type1')) == torch.arange(len(edges_21), dtype=torch.int64, device='cuda')).all()) + assert((graph.edges('eid',etype=('type2','e4','type2')) == torch.arange(len(edges_22), dtype=torch.int64, device='cuda')).all()) + + # Use sampling call to check graph creation + # This isn't a test of cuGraph sampling with DGL; the options are + # set to verify the graph only. + plc_graph = graph._graph(direction) + sampling_output = pylibcugraph.uniform_neighbor_sample( + pylibcugraph.ResourceHandle(), + plc_graph, + start_list=cupy.arange(total_num_nodes, dtype='int64'), + h_fan_out=np.array([1, 1], dtype='int32'), + with_replacement=False, + do_expensive_check=True, + with_edge_properties=True, + prior_sources_behavior='exclude', + return_dict=True, + ) + + expected_etypes = { + 0: 'e1', + 1: 'e2', + 2: 'e3', + 3: 'e4', + } + expected_offsets = { + 0: (0, 0), + 1: (0, num_nodes_group_1), + 2: (num_nodes_group_1, 0), + 3: (num_nodes_group_1, num_nodes_group_1), + } + if direction == 'in': + src_col = 'minors' + dst_col = 'majors' + else: + src_col = 'majors' + dst_col = 'minors' + + # Looping over the output verifies that all edges are valid + # (and therefore, the graph is valid) + for i, etype in enumerate(sampling_output['edge_type'].tolist()): + eid = int(sampling_output['edge_id'][i]) + + srcs, dsts, eids = graph.edges('all', etype=expected_etypes[etype], device='cpu') + + assert eids[eid] == eid + assert srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0] + assert dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1] \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index 2bd4c1f2540..e2bf7c20a29 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -260,7 +260,10 @@ def __getitem__(self, key): ) def __call__(self, *args, **kwargs): - return self.__graph.all_edges(*args, **kwargs) + if 'device' in kwargs: + return self.__graph.all_edges(*args, **kwargs) + + return self.__graph.all_edges(*args, **kwargs, device='cuda') class HeteroNodeView: From 20450a37427881f641bd1b231284f4968be14246 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 26 Jun 2024 13:19:36 -0700 Subject: [PATCH 11/47] testing, fixing graph 
API --- python/cugraph-dgl/cugraph_dgl/features.py | 2 +- python/cugraph-dgl/cugraph_dgl/graph.py | 35 +- .../cugraph_dgl/tests/test_graph_mg.py | 340 ++++++++++++++++++ 3 files changed, 358 insertions(+), 19 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py index 80885cf01aa..b4ff0049494 100644 --- a/python/cugraph-dgl/cugraph_dgl/features.py +++ b/python/cugraph-dgl/cugraph_dgl/features.py @@ -76,7 +76,7 @@ def __init__( tensor = tensor.reshape((tensor.shape[0], 1)) wg_tensor = wgth.create_wholememory_tensor( - self.__wg_commm, + self.__wg_comm, memory_type, location, global_shape, diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 02abc9bffe0..00fa9a66be7 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -811,7 +811,7 @@ def all_edges( etype = self.to_canonical_etype(etype) - if form == 'eid': + if form == "eid": return torch.arange( 0, self.__num_edges_dict[etype], @@ -820,27 +820,26 @@ def all_edges( ) else: if self.is_multi_gpu: - src = torch.empty((self.__num_edges_dict[etype], ), dtype=self.idtype, device='cuda') - dst = torch.empty((self.__num_edges_dict[etype], ), dtype=self.idtype, device='cuda') - - h1 = torch.distributed.all_gather_into_tensor(src, self.__edge_indices[etype][0].cuda(), async_op=True) - h2 = torch.distributed.all_gather_into_tensor(dst, self.__edge_indices[etype][1].cuda(), async_op=True) - - h1.wait() - h2.wait() - if form == 'uv': - return src.to(device), dst.to(device) - elif form == 'all': - return src.to(device), dst.to(device), torch.arange(self.__num_edges_dict[etype], dtype=self.idtype,device=device) - else: - raise ValueError(f"Invalid form {form}") + # This can't be done because it requires collective communication. + raise ValueError( + "Calling all_edges in a distributed graph with" + " form 'uv' or 'all' is unsupported." + ) else: eix = self.__edge_indices[etype].to(device) - if form == 'uv': + if form == "uv": return eix[0], eix[1] - elif form == 'all': - return eix[0], eix[1], torch.arange(self.__num_edges_dict[etype], dtype=self.idtype,device=device) + elif form == "all": + return ( + eix[0], + eix[1], + torch.arange( + self.__num_edges_dict[etype], + dtype=self.idtype, + device=device, + ), + ) else: raise ValueError(f"Invalid form {form}") diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py index e69de29bb2d..0dfde6b9715 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -0,0 +1,340 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
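+
+# Multi-GPU (MG) counterparts of the tests in test_graph.py. Each test
+# spawns one process per visible GPU, initializes NCCL, WholeGraph, and
+# the cugraph comms (see init_pytorch_worker below), and then builds the
+# same karate-club graphs from per-rank edgelist partitions.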
+ +import os + +import pytest + +import cugraph_dgl +import pylibcugraph +import cupy +import numpy as np + +import cudf + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, + cugraph_comms_get_raft_handle, +) + +pylibwholegraph = import_optional("pylibwholegraph") +torch = import_optional("torch") + + +def init_pytorch_worker(rank, world_size, cugraph_id): + import rmm + + rmm.reinitialize( + devices=rank, + ) + + import cupy + + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + torch.cuda.set_device(rank) + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + pylibwholegraph.torch.initialize.init( + rank, + world_size, + rank, + world_size, + ) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) + + +def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): + init_pytorch_worker(rank, world_size, uid) + + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + wgt = np.random.random((len(df),)) + + graph = cugraph_dgl.Graph( + is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" + ) + + # The number of nodes is set globally but features can have + # any distribution across workers as long as they are in order. + global_num_nodes = max(df.src.max(), df.dst.max()) + 1 + node_x = np.array_split(np.arange(global_num_nodes, dtype="int64"), world_size)[ + rank + ] + + # Each worker gets a shuffled, permuted version of the edgelist + df = df.sample(frac=1.0) + df.src = (df.src + rank) % global_num_nodes + df.dst = (df.dst + rank + 1) % global_num_nodes + + graph.add_nodes(global_num_nodes, data={"x": node_x}) + graph.add_edges(df.src, df.dst, {"weight": wgt}) + plc_dgl_graph = graph._graph(direction=direction) + + assert graph.num_nodes() == global_num_nodes + assert graph.num_edges() == len(df) * world_size + assert graph.is_homogeneous + assert graph.is_multi_gpu + + assert ( + graph.nodes() + == torch.arange(global_num_nodes, dtype=torch.int64, device="cuda") + ).all() + ix = torch.arange(len(node_x) * rank, len(node_x) * (rank + 1), dtype=torch.int64) + assert (graph.nodes[ix]["x"] == torch.as_tensor(node_x, device="cuda")).all() + + assert ( + graph.edges("eid", device="cuda") + == torch.arange(world_size * len(df), dtype=torch.int64, device="cuda") + ).all() + ix = torch.arange(len(df) * rank, len(df) * (rank + 1), dtype=torch.int64) + assert (graph.edges[ix]["weight"] == torch.as_tensor(wgt, device="cuda")).all() + + plc_handle = pylibcugraph.ResourceHandle( + cugraph_comms_get_raft_handle().getHandle() + ) + + plc_expected_graph = pylibcugraph.MGGraph( + plc_handle, + pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), + [df.src] if direction == "out" else [df.dst], + [df.dst] if direction == "out" else [df.src], + vertices_array=[ + cupy.array_split(cupy.arange(global_num_nodes, dtype="int64"), world_size)[ + rank + ] + ], + ) + + # Do the expensive check to make sure this test fails if an invalid + # graph is constructed. 
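+    # Each rank passed only its local edge partition to MGGraph, but the
+    # degree counts below are computed across all partitions, so they
+    # describe the full distributed graph. do_expensive_check=True forces
+    # the extra validation pass; it is too slow for production use but is
+    # what makes this test meaningful.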
+ v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( + plc_handle, + plc_dgl_graph, + source_vertices=cupy.arange(global_num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( + plc_handle, + plc_expected_graph, + source_vertices=cupy.arange(global_num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + assert (v_actual == v_exp).all() + assert (d_in_actual == d_in_exp).all() + assert (d_out_actual == d_out_exp).all() + + cugraph_comms_shutdown() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_homogeneous_graph_mg(direction): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_graph_make_homogeneous_graph_mg, + args=( + uid, + world_size, + direction, + ), + nprocs=world_size, + ) + + +def run_test_graph_make_heterogeneous_graph_mg(rank, uid, world_size, direction): + init_pytorch_worker(rank, world_size, uid) + + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + + graph = cugraph_dgl.Graph(is_multi_gpu=True) + total_num_nodes = max(df.src.max(), df.dst.max()) + 1 + + # Each worker gets a shuffled, permuted version of the edgelist + df = df.sample(frac=1.0) + df.src = (df.src + rank) % total_num_nodes + df.dst = (df.dst + rank + 1) % total_num_nodes + + num_nodes_group_1 = total_num_nodes // 2 + num_nodes_group_2 = total_num_nodes - num_nodes_group_1 + + node_x_1 = np.array_split(np.random.random((num_nodes_group_1,)), world_size)[rank] + node_x_2 = np.array_split(np.random.random((num_nodes_group_2,)), world_size)[rank] + + graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1") + graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2") + + edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + + edges_12.dst -= num_nodes_group_1 + edges_21.src -= num_nodes_group_1 + edges_22.dst -= num_nodes_group_1 + edges_22.src -= num_nodes_group_1 + + total_edges_11 = torch.tensor(len(edges_11), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_11, torch.distributed.ReduceOp.SUM) + total_edges_12 = torch.tensor(len(edges_12), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_12, torch.distributed.ReduceOp.SUM) + total_edges_21 = torch.tensor(len(edges_21), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_21, torch.distributed.ReduceOp.SUM) + total_edges_22 = torch.tensor(len(edges_22), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_22, torch.distributed.ReduceOp.SUM) + + graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) + graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2")) + graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) + graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) + + assert not graph.is_homogeneous + assert graph.is_multi_gpu + + # Verify graph.nodes() + assert ( + graph.nodes() == torch.arange(total_num_nodes, 
dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type1") + == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type2") + == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") + ).all() + + # Verify graph.edges() + assert ( + graph.edges("eid", etype=("type1", "e1", "type1")) + == torch.arange(total_edges_11, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type1", "e2", "type2")) + == torch.arange(total_edges_12, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e3", "type1")) + == torch.arange(total_edges_21, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e4", "type2")) + == torch.arange(total_edges_22, dtype=torch.int64, device="cuda") + ).all() + + # Use sampling call to check graph creation + # This isn't a test of cuGraph sampling with DGL; the options are + # set to verify the graph only. + plc_graph = graph._graph(direction) + assert isinstance(plc_graph, pylibcugraph.MGGraph) + sampling_output = pylibcugraph.uniform_neighbor_sample( + graph._resource_handle, + plc_graph, + start_list=cupy.arange(total_num_nodes, dtype="int64"), + batch_id_list=cupy.full(total_num_nodes, rank, dtype="int32"), + label_list=cupy.arange(world_size, dtype="int32"), + label_to_output_comm_rank=cupy.arange(world_size, dtype="int32"), + h_fan_out=np.array([-1], dtype="int32"), + with_replacement=False, + do_expensive_check=True, + with_edge_properties=True, + prior_sources_behavior="exclude", + return_dict=True, + ) + + sdf = cudf.DataFrame( + { + "majors": sampling_output["majors"], + "minors": sampling_output["minors"], + "edge_id": sampling_output["edge_id"], + "edge_type": sampling_output["edge_type"], + } + ) + + expected_offsets = { + 0: (0, 0), + 1: (0, num_nodes_group_1), + 2: (num_nodes_group_1, 0), + 3: (num_nodes_group_1, num_nodes_group_1), + } + if direction == "in": + src_col = "minors" + dst_col = "majors" + else: + src_col = "majors" + dst_col = "minors" + + edges_11["etype"] = 0 + edges_12["etype"] = 1 + edges_21["etype"] = 2 + edges_22["etype"] = 3 + + cdf = cudf.concat([edges_11, edges_12, edges_21, edges_22]) + for i in range(len(cdf)): + row = cdf.iloc[i] + etype = row["etype"] + src = row["src"] + expected_offsets[etype][0] + dst = row["dst"] + expected_offsets[etype][1] + + f = sdf[ + (sdf[src_col] == src) & (sdf[dst_col] == dst) & (sdf["edge_type"] == etype) + ] + assert len(f) > 0 # may be multiple, some could be on other GPU + + cugraph_comms_shutdown() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_heterogeneous_graph_mg(direction): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_graph_make_heterogeneous_graph_mg, + args=( + uid, + world_size, + direction, + ), + nprocs=world_size, + ) From 557d9aa03442c9f1e2d82eeda4d6c67db62533d0 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 27 Jun 2024 15:15:42 -0700 Subject: [PATCH 12/47] Loaders --- .../cugraph_dgl/dataloading/__init__.py | 11 +- .../dataloading/dask_dataloader.py | 321 +++++++++++++++ .../cugraph_dgl/dataloading/dataloader.py | 384 +++++------------- .../dataloading/neighbor_sampler.py | 123 +++++- 
.../cugraph_dgl/dataloading/sampler.py | 154 +++++++ .../dataloading/utils/sampling_helpers.py | 104 ++++- 6 files changed, 806 insertions(+), 291 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py create mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py index 2fd7d29bd49..5a775f0e88c 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,9 +11,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from cugraph_dgl.dataloading.dataset import ( HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset, ) from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler -from cugraph_dgl.dataloading.dataloader import DataLoader +from cugraph_dgl.dataloading.dask_dataloader import DaskDataLoader + + +def DataLoader(*args, **kwargs): + warnings.warn("DataLoader has been renamed to DaskDataLoader", FutureWarning) + return DaskDataLoader(*args, **kwargs) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py new file mode 100644 index 00000000000..7cd94a1be84 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py @@ -0,0 +1,321 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations +import os +import shutil +import cugraph_dgl +import cupy as cp +import cudf +from cugraph.utilities.utils import import_optional +from cugraph.gnn import BulkSampler +from dask.distributed import default_client, Event +from cugraph_dgl.dataloading import ( + HomogenousBulkSamplerDataset, + HeterogenousBulkSamplerDataset, +) +from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( + create_cugraph_graph_from_edges_dict, +) + +dgl = import_optional("dgl") +torch = import_optional("torch") + + +class DaskDataLoader(torch.utils.data.DataLoader): + """ + Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a + :class:`~cugraph_dgl.dataloading.NeighborSampler` into + an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends + PyTorch's ``DataLoader`` by handling creation and + transmission of graph samples. 
+ """ + + def __init__( + self, + graph: cugraph_dgl.CuGraphStorage, + indices: torch.Tensor, + graph_sampler: cugraph_dgl.dataloading.NeighborSampler, + sampling_output_dir: str, + batches_per_partition: int = 50, + seeds_per_call: int = 200_000, + device: torch.device = None, + use_ddp: bool = False, + ddp_seed: int = 0, + batch_size: int = 1024, + drop_last: bool = False, + shuffle: bool = False, + sparse_format: str = "coo", + **kwargs, + ): + """ + Constructor for DaskDataLoader: + ------------------------------- + graph : CuGraphStorage + The graph. + indices : Tensor or dict[ntype, Tensor] + The set of indices. It can either be a tensor of + integer indices or a dictionary of types and indices. + The actual meaning of the indices is defined by the :meth:`sample` method of + :attr:`graph_sampler`. + graph_sampler : cugraph_dgl.dataloading.NeighborSampler + The subgraph sampler. + sampling_output_dir: str + Output directory to share sampling results in + batches_per_partition: int + The number of batches of sampling results to write/read + seeds_per_call: int + The number of seeds to sample at once + device : device context, optional + The device of the generated MFGs in each iteration, which should be a + PyTorch device object (e.g., ``torch.device``). + By default this returns the tenors on device with the current + cuda context + use_ddp : boolean, optional + If True, tells the DataLoader to split the training set for each + participating process appropriately using + :class:`torch.utils.data.distributed.DistributedSampler`. + Overrides the :attr:`sampler` argument of + :class:`torch.utils.data.DataLoader`. + ddp_seed : int, optional + The seed for shuffling the dataset in + :class:`torch.utils.data.distributed.DistributedSampler`. + Only effective when :attr:`use_ddp` is True. + batch_size: int + Batch size. + sparse_format: str, default = "coo" + The sparse format of the emitted sampled graphs. Choose between "csc" + and "coo". When using "csc", the graphs are of type + cugraph_dgl.nn.SparseGraph. + kwargs : dict + Key-word arguments to be passed to the parent PyTorch + :py:class:`torch.utils.data.DataLoader` class. Common arguments are: + - ``batch_size`` (int): The number of indices in each batch. + - ``drop_last`` (bool): Whether to drop the last incomplete + batch. + - ``shuffle`` (bool): Whether to randomly shuffle the + indices at each epoch + Examples + -------- + To train a 3-layer GNN for node classification on a set of nodes + ``train_nid`` on a homogeneous graph where each node takes messages + from 15 neighbors on the first layer, 10 neighbors on the second, and + 5 neighbors on the third: + >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) + >>> dataloader = cugraph_dgl.dataloading.DataLoader( + ... g, train_nid, sampler, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) + >>> for input_nodes, output_nodes, blocks in dataloader: + ... train_on(input_nodes, output_nodes, blocks) + **Using with Distributed Data Parallel** + If you are using PyTorch's distributed training (e.g. when using + :mod:`torch.nn.parallel.DistributedDataParallel`), + you can train the model by turning + on the `use_ddp` option: + >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) + >>> dataloader = cugraph_dgl.dataloading.DataLoader( + ... g, train_nid, sampler, use_ddp=True, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) + >>> for epoch in range(start_epoch, n_epochs): + ... 
for input_nodes, output_nodes, blocks in dataloader: + ... + """ + if sparse_format not in ["coo", "csc"]: + raise ValueError( + f"sparse_format must be one of 'coo', 'csc', " + f"but got {sparse_format}." + ) + self.sparse_format = sparse_format + + self.ddp_seed = ddp_seed + self.use_ddp = use_ddp + self.shuffle = shuffle + self.drop_last = drop_last + self.graph_sampler = graph_sampler + worker_init_fn = dgl.dataloading.WorkerInitWrapper( + kwargs.get("worker_init_fn", None) + ) + self.other_storages = {} + self.epoch_number = 0 + self._batch_size = batch_size + self._sampling_output_dir = sampling_output_dir + self._batches_per_partition = batches_per_partition + self._seeds_per_call = seeds_per_call + self._rank = None + + indices = _dgl_idx_to_cugraph_idx(indices, graph) + + self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset( + indices, + batch_size, + drop_last, + use_ddp, + ddp_seed, + shuffle, + kwargs.get("persistent_workers", False), + ) + + if len(graph.ntypes) <= 1: + self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( + total_number_of_nodes=graph.total_number_of_nodes, + edge_dir=self.graph_sampler.edge_dir, + sparse_format=sparse_format, + ) + else: + etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} + + self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( + num_nodes_dict=graph.num_nodes_dict, + etype_id_dict=etype_id_to_etype_str_dict, + etype_offset_dict=graph._etype_offset_d, + ntype_offset_dict=graph._ntype_offset_d, + edge_dir=self.graph_sampler.edge_dir, + ) + + if use_ddp: + rank = torch.distributed.get_rank() + client = default_client() + self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event") + if rank == 0: + G = create_cugraph_graph_from_edges_dict( + edges_dict=graph._edges_dict, + etype_id_dict=graph._etype_id_dict, + edge_dir=graph_sampler.edge_dir, + ) + client.publish_dataset(cugraph_dgl_mg_graph_ds=G) + self._graph_creation_event.set() + else: + if self._graph_creation_event.wait(timeout=1000): + G = client.get_dataset("cugraph_dgl_mg_graph_ds") + else: + raise RuntimeError( + f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}", + "from worker_id 0 failed", + ) + else: + rank = 0 + G = create_cugraph_graph_from_edges_dict( + edges_dict=graph._edges_dict, + etype_id_dict=graph._etype_id_dict, + edge_dir=graph_sampler.edge_dir, + ) + + self._rank = rank + self._cugraph_graph = G + super().__init__( + self.cugraph_dgl_dataset, + batch_size=None, + worker_init_fn=worker_init_fn, + collate_fn=lambda x: x, # Hack to prevent collating + **kwargs, + ) + + def __iter__(self): + output_dir = os.path.join( + self._sampling_output_dir, "epoch_" + str(self.epoch_number) + ) + kwargs = {} + if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): + kwargs["deduplicate_sources"] = True + kwargs["prior_sources_behavior"] = "carryover" + kwargs["renumber"] = True + + if self.sparse_format == "csc": + kwargs["compression"] = "CSR" + kwargs["compress_per_hop"] = True + # The following kwargs will be deprecated in uniform sampler. 
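+                # Note (assumed semantics of these BulkSampler flags): with
+                # legacy names disabled, the CSC output is keyed by
+                # "major_offsets"/"minors" (plus "map",
+                # "renumber_map_offsets", and "label_hop_offsets"), and hop
+                # membership is read from "label_hop_offsets" instead of a
+                # per-edge hop column; this matches what
+                # _process_sampled_df_csc() consumes downstream.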
+ kwargs["use_legacy_names"] = False + kwargs["include_hop_column"] = False + + else: + kwargs["deduplicate_sources"] = False + kwargs["prior_sources_behavior"] = None + kwargs["renumber"] = False + + bs = BulkSampler( + output_path=output_dir, + batch_size=self._batch_size, + graph=self._cugraph_graph, + batches_per_partition=self._batches_per_partition, + seeds_per_call=self._seeds_per_call, + fanout_vals=self.graph_sampler._reversed_fanout_vals, + with_replacement=self.graph_sampler.replace, + **kwargs, + ) + + if self.shuffle: + self.tensorized_indices_ds.shuffle() + + batch_df = create_batch_df(self.tensorized_indices_ds) + bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id") + bs.flush() + self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir) + self.epoch_number = self.epoch_number + 1 + return super().__iter__() + + def __del__(self): + if self.use_ddp: + torch.distributed.barrier() + if self._rank == 0: + if self.use_ddp: + client = default_client() + client.unpublish_dataset("cugraph_dgl_mg_graph_ds") + self._graph_creation_event.clear() + _clean_directory(self._sampling_output_dir) + + +def get_batch_id_series(n_output_rows: int, batch_size: int): + num_batches = (n_output_rows + batch_size - 1) // batch_size + print(f"Number of batches = {num_batches}".format(num_batches)) + batch_ar = cp.arange(0, num_batches).repeat(batch_size) + batch_ar = batch_ar[0:n_output_rows].astype(cp.int32) + return cudf.Series(batch_ar) + + +def create_batch_df(dataset: torch.Tensor): + batch_id_ls = [] + indices_ls = [] + for batch_id, b_indices in enumerate(dataset): + if isinstance(b_indices, dict): + b_indices = torch.cat(list(b_indices.values())) + batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32) + batch_id_ls.append(batch_id_ar) + indices_ls.append(b_indices) + + batch_id_ar = cp.concatenate(batch_id_ls) + indices_ar = cp.asarray(torch.concat(indices_ls)) + batches_df = cudf.DataFrame( + { + "start": indices_ar, + "batch_id": batch_id_ar, + } + ) + return batches_df + + +def _dgl_idx_to_cugraph_idx(idx, cugraph_gs): + if not isinstance(idx, dict): + if len(cugraph_gs.ntypes) > 1: + raise dgl.DGLError( + "Must specify node type when the graph is not homogeneous." + ) + return idx + else: + return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()} + + +def _clean_directory(path): + """param could either be relative or absolute.""" + if os.path.isfile(path): + os.remove(path) # remove the file + elif os.path.isdir(path): + shutil.rmtree(path) # remove dir and all contains diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 11139910931..73130e2dfb0 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,151 +10,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import annotations -import os -import shutil -import cugraph_dgl -import cupy as cp -import cudf + +import warnings + +from typing import Union, Optional, Dict + from cugraph.utilities.utils import import_optional -from cugraph.gnn import BulkSampler -from dask.distributed import default_client, Event -from cugraph_dgl.dataloading import ( - HomogenousBulkSamplerDataset, - HeterogenousBulkSamplerDataset, -) -from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( - create_cugraph_graph_from_edges_dict, -) + +import cugraph_dgl +from cugraph_dgl.typing import TensorType +from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor dgl = import_optional("dgl") torch = import_optional("torch") -class DataLoader(torch.utils.data.DataLoader): +class DataLoader: """ - Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a - :class:`~cugraph_dgl.dataloading.NeighborSampler` into - an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends - PyTorch's ``DataLoader`` by handling creation and - transmission of graph samples. + Duck-typed version of dgl.dataloading.DataLoader """ def __init__( self, - graph: cugraph_dgl.CuGraphStorage, - indices: torch.Tensor, - graph_sampler: cugraph_dgl.dataloading.NeighborSampler, - sampling_output_dir: str, - batches_per_partition: int = 50, - seeds_per_call: int = 200_000, - device: torch.device = None, + graph: "cugraph_dgl.Graph", + indices: TensorType, + graph_sampler: "cugraph_dgl.dataloading.Sampler", + device: Union[int, str, "torch.device"] = None, use_ddp: bool = False, ddp_seed: int = 0, - batch_size: int = 1024, + batch_size=1, drop_last: bool = False, shuffle: bool = False, - sparse_format: str = "coo", + use_prefetch_thread: Optional[bool] = None, + use_alternate_streams: Optional[bool] = None, + pin_prefetcher: Optional[bool] = None, + use_uva=False, + gpu_cache: Dict[str, Dict[str, int]] = None, + output_format: str = "dgl.Block", **kwargs, ): """ - Constructor for CuGraphStorage: - ------------------------------- - graph : CuGraphStorage - The graph. - indices : Tensor or dict[ntype, Tensor] - The set of indices. It can either be a tensor of - integer indices or a dictionary of types and indices. - The actual meaning of the indices is defined by the :meth:`sample` method of - :attr:`graph_sampler`. - graph_sampler : cugraph_dgl.dataloading.NeighborSampler - The subgraph sampler. - sampling_output_dir: str - Output directory to share sampling results in - batches_per_partition: int - The number of batches of sampling results to write/read - seeds_per_call: int - The number of seeds to sample at once - device : device context, optional - The device of the generated MFGs in each iteration, which should be a - PyTorch device object (e.g., ``torch.device``). - By default this returns the tenors on device with the current - cuda context - use_ddp : boolean, optional - If True, tells the DataLoader to split the training set for each - participating process appropriately using - :class:`torch.utils.data.distributed.DistributedSampler`. - Overrides the :attr:`sampler` argument of - :class:`torch.utils.data.DataLoader`. - ddp_seed : int, optional - The seed for shuffling the dataset in - :class:`torch.utils.data.distributed.DistributedSampler`. - Only effective when :attr:`use_ddp` is True. - batch_size: int - Batch size. - sparse_format: str, default = "coo" - The sparse format of the emitted sampled graphs. Choose between "csc" - and "coo". 
When using "csc", the graphs are of type - cugraph_dgl.nn.SparseGraph. - kwargs : dict - Key-word arguments to be passed to the parent PyTorch - :py:class:`torch.utils.data.DataLoader` class. Common arguments are: - - ``batch_size`` (int): The number of indices in each batch. - - ``drop_last`` (bool): Whether to drop the last incomplete - batch. - - ``shuffle`` (bool): Whether to randomly shuffle the - indices at each epoch - Examples - -------- - To train a 3-layer GNN for node classification on a set of nodes - ``train_nid`` on a homogeneous graph where each node takes messages - from 15 neighbors on the first layer, 10 neighbors on the second, and - 5 neighbors on the third: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for input_nodes, output_nodes, blocks in dataloader: - ... train_on(input_nodes, output_nodes, blocks) - **Using with Distributed Data Parallel** - If you are using PyTorch's distributed training (e.g. when using - :mod:`torch.nn.parallel.DistributedDataParallel`), - you can train the model by turning - on the `use_ddp` option: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, use_ddp=True, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for epoch in range(start_epoch, n_epochs): - ... for input_nodes, output_nodes, blocks in dataloader: - ... + Parameters + ---------- + graph: cugraph_dgl.Graph + The graph being sampled. Can be a single-GPU or multi-GPU graph. + indices: TensorType + The seed nodes for sampling. If use_ddp=True, then all seed + nodes should be provided. If use_ddp=False, then only the seed + nodes assigned to this worker should be provided. + graph_sampler: cugraph_dgl.dataloading.Sampler + The sampler responsible for sampling the graph and producing + output minibatches. + device: Union[int, str, torch.device] + Optional. + The device assigned to this loader ('cpu', 'cuda' or device id). + Defaults to the current device. + use_ddp: bool + Optional (default=False). + If true, this argument will assume the entire list of input seed + nodes is being passed to each worker, and will appropriately + split and shuffle the list. + It false, then it is assumed that the list of input seed nodes + is comprised of the union of the lists provided to each worker. + ddp_seed: int + Optional (default=0). + The seed used for dividing and shuffling data if use_ddp=True. + Has no effect if use_ddp=False. + use_uva: bool + Optional (default=False). + Whether to use pinned memory and unified virtual addressing + to perform sampling. + This argument is ignored by cuGraph-DGL. + use_prefetch_thread: bool + Optional (default=False). + Whether to spawn a new thread for feature fetching. + This argument is ignored by cuGraph-DGL. + use_alternate_streams: bool + Optional (default=False). + Whether to perform feature fetching on a separate stream. + This argument is ignored by cuGraph-DGL. + pin_prefetcher: bool + Optional (default=False). + Whether to pin the feature tensors. + This argument is currently ignored by cuGraph-DGL. + gpu_cache: Dict[str, Dict[str, int]] + List of features to cache using HugeCTR. + This argument is not supported by cuGraph-DGL and + will result in an error. + output_format: str + Optional (default="dgl.Block"). + The output format for blocks. 
+ Can be either "dgl.Block" or "cugraph_dgl.nn.SparseGraph". """ - if sparse_format not in ["coo", "csc"]: + + if use_uva: + warnings.warn("The 'use_uva' argument is ignored by cuGraph-DGL.") + if use_prefetch_thread: + warnings.warn( + "The 'use_prefetch_thread' argument is ignored by cuGraph-DGL." + ) + if use_alternate_streams: + warnings.warn( + "The 'use_alternate_streams' argument is ignored by cuGraph-DGL." + ) + if pin_prefetcher: + warnings.warn("The 'pin_prefetcher' argument is ignored by cuGraph-DGL.") + if gpu_cache: raise ValueError( - f"sparse_format must be one of 'coo', 'csc', " - f"but got {sparse_format}." + "HugeCTR is not supported by cuGraph-DGL. " + "Consider using WholeGraph for feature storage" + " in cugraph_dgl.Graph instead." ) - self.sparse_format = sparse_format - self.ddp_seed = ddp_seed - self.use_ddp = use_ddp - self.shuffle = shuffle - self.drop_last = drop_last - self.graph_sampler = graph_sampler - worker_init_fn = dgl.dataloading.WorkerInitWrapper( - kwargs.get("worker_init_fn", None) - ) - self.other_storages = {} - self.epoch_number = 0 - self._batch_size = batch_size - self._sampling_output_dir = sampling_output_dir - self._batches_per_partition = batches_per_partition - self._seeds_per_call = seeds_per_call - self._rank = None - - indices = _dgl_idx_to_cugraph_idx(indices, graph) + indices = _cast_to_torch_tensor(indices) - self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset( + self.__dataset = dgl.dataloading.create_tensorized_dataset( indices, batch_size, drop_last, @@ -164,158 +134,24 @@ def __init__( kwargs.get("persistent_workers", False), ) - if len(graph.ntypes) <= 1: - self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( - total_number_of_nodes=graph.total_number_of_nodes, - edge_dir=self.graph_sampler.edge_dir, - sparse_format=sparse_format, - ) - else: - etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} - - self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( - num_nodes_dict=graph.num_nodes_dict, - etype_id_dict=etype_id_to_etype_str_dict, - etype_offset_dict=graph._etype_offset_d, - ntype_offset_dict=graph._ntype_offset_d, - edge_dir=self.graph_sampler.edge_dir, - ) + self.__output_format = output_format + self.__sampler = graph_sampler + self.__batch_size = batch_size + self.__graph = graph + self.__device = device - if use_ddp: - rank = torch.distributed.get_rank() - client = default_client() - self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event") - if rank == 0: - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - client.publish_dataset(cugraph_dgl_mg_graph_ds=G) - self._graph_creation_event.set() - else: - if self._graph_creation_event.wait(timeout=1000): - G = client.get_dataset("cugraph_dgl_mg_graph_ds") - else: - raise RuntimeError( - f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}", - "from worker_id 0 failed", - ) - else: - rank = 0 - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - - self._rank = rank - self._cugraph_graph = G - super().__init__( - self.cugraph_dgl_dataset, - batch_size=None, - worker_init_fn=worker_init_fn, - collate_fn=lambda x: x, # Hack to prevent collating - **kwargs, - ) + @property + def dataset( + self, + ) -> Union[ + "dgl.dataloading.dataloader.TensorizedDataset", + "dgl.dataloading.dataloader.DDPTensorizedDataset", + ]: + 
return self.__dataset def __iter__(self): - output_dir = os.path.join( - self._sampling_output_dir, "epoch_" + str(self.epoch_number) + return self.__sampler.sample( + self.__graph, + self.__dataset, + self.__batch_size, ) - kwargs = {} - if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): - kwargs["deduplicate_sources"] = True - kwargs["prior_sources_behavior"] = "carryover" - kwargs["renumber"] = True - - if self.sparse_format == "csc": - kwargs["compression"] = "CSR" - kwargs["compress_per_hop"] = True - # The following kwargs will be deprecated in uniform sampler. - kwargs["use_legacy_names"] = False - kwargs["include_hop_column"] = False - - else: - kwargs["deduplicate_sources"] = False - kwargs["prior_sources_behavior"] = None - kwargs["renumber"] = False - - bs = BulkSampler( - output_path=output_dir, - batch_size=self._batch_size, - graph=self._cugraph_graph, - batches_per_partition=self._batches_per_partition, - seeds_per_call=self._seeds_per_call, - fanout_vals=self.graph_sampler._reversed_fanout_vals, - with_replacement=self.graph_sampler.replace, - **kwargs, - ) - - if self.shuffle: - self.tensorized_indices_ds.shuffle() - - batch_df = create_batch_df(self.tensorized_indices_ds) - bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id") - bs.flush() - self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir) - self.epoch_number = self.epoch_number + 1 - return super().__iter__() - - def __del__(self): - if self.use_ddp: - torch.distributed.barrier() - if self._rank == 0: - if self.use_ddp: - client = default_client() - client.unpublish_dataset("cugraph_dgl_mg_graph_ds") - self._graph_creation_event.clear() - _clean_directory(self._sampling_output_dir) - - -def get_batch_id_series(n_output_rows: int, batch_size: int): - num_batches = (n_output_rows + batch_size - 1) // batch_size - print(f"Number of batches = {num_batches}".format(num_batches)) - batch_ar = cp.arange(0, num_batches).repeat(batch_size) - batch_ar = batch_ar[0:n_output_rows].astype(cp.int32) - return cudf.Series(batch_ar) - - -def create_batch_df(dataset: torch.Tensor): - batch_id_ls = [] - indices_ls = [] - for batch_id, b_indices in enumerate(dataset): - if isinstance(b_indices, dict): - b_indices = torch.cat(list(b_indices.values())) - batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32) - batch_id_ls.append(batch_id_ar) - indices_ls.append(b_indices) - - batch_id_ar = cp.concatenate(batch_id_ls) - indices_ar = cp.asarray(torch.concat(indices_ls)) - batches_df = cudf.DataFrame( - { - "start": indices_ar, - "batch_id": batch_id_ar, - } - ) - return batches_df - - -def _dgl_idx_to_cugraph_idx(idx, cugraph_gs): - if not isinstance(idx, dict): - if len(cugraph_gs.ntypes) > 1: - raise dgl.DGLError( - "Must specify node type when the graph is not homogeneous." 
-            )
-        return idx
-    else:
-        return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()}
-
-
-def _clean_directory(path):
-    """param could either be relative or absolute."""
-    if os.path.isfile(path):
-        os.remove(path)  # remove the file
-    elif os.path.isdir(path):
-        shutil.rmtree(path)  # remove dir and all contains
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
index b61f05f6379..f64eb0d8866 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,8 +10,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from __future__ import annotations
-from typing import Sequence
+
+import warnings
+
+from typing import Sequence, Optional, Union, List, Tuple, Iterator
+
+from cugraph.gnn import UniformNeighborSampler, DistSampleWriter
+from cugraph.utilities.utils import import_optional
+
+
+import cugraph_dgl
+from cugraph_dgl.nn import SparseGraph
+from cugraph_dgl.typing import TensorType
+from cugraph_dgl.dataloading.sampler import HomogeneousSampleReader
+
+torch = import_optional("torch")
 
 
 class NeighborSampler:
@@ -50,7 +65,78 @@ def __init__(
         fanouts_per_layer: Sequence[int],
         edge_dir: str = "in",
         replace: bool = False,
+        prob: Optional[str] = None,
+        mask: Optional[str] = None,
+        prefetch_node_feats: Optional[Union[List[str], dict[str, List[str]]]] = None,
+        prefetch_edge_feats: Optional[
+            Union[List[str], dict[Tuple[str, str, str], List[str]]]
+        ] = None,
+        prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None,
+        output_device: Optional[Union["torch.device", int, str]] = None,
+        fused: bool = True,
+        **kwargs,
     ):
+        """
+        Parameters
+        ----------
+        fanouts_per_layer: Sequence[int]
+            The number of neighbors to sample per layer.
+        edge_dir: str
+            Optional (default='in').
+            The direction to traverse edges.
+        replace: bool
+            Optional (default=False).
+            Whether to sample with replacement.
+        prob: str
+            Optional.
+            If provided, the probability of each neighbor being
+            sampled is proportional to the edge feature
+            with the given name. Mutually exclusive with mask.
+            Currently unsupported.
+        mask: str
+            Optional.
+            If provided, only neighbors where the edge mask
+            with the given name is True can be selected.
+            Mutually exclusive with prob.
+            Currently unsupported.
+        prefetch_node_feats: Union[List[str], dict[str, List[str]]]
+            Optional.
+            Currently ignored by cuGraph-DGL.
+        prefetch_edge_feats: Union[List[str], dict[Tuple[str, str, str], List[str]]]
+            Optional.
+            Currently ignored by cuGraph-DGL.
+        prefetch_labels: Union[List[str], dict[str, List[str]]]
+            Optional.
+            Currently ignored by cuGraph-DGL.
+        output_device: Union[torch.device, int, str]
+            Optional.
+            Output device for samples. Defaults to the current device.
+        fused: bool
+            Optional (default=True).
+            This argument is ignored by cuGraph-DGL.
+        **kwargs
+            Keyword arguments for the underlying cuGraph distributed sampler
+            and writer (directory, batches_per_partition, format,
+            local_seeds_per_call).
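+
+        Example
+        -------
+        An illustrative sketch, not a tested recipe: it assumes a
+        homogeneous ``cugraph_dgl.Graph`` ``g`` and a seed-node tensor
+        ``train_nid``; the ``directory`` kwarg shown is optional.
+        >>> sampler = cugraph_dgl.dataloading.NeighborSampler(
+        ...     [15, 10, 5], directory="/tmp/samples")
+        >>> for in_nodes, out_nodes, blocks in sampler.sample(
+        ...         g, train_nid, batch_size=1024):
+        ...     ...  # one (input nodes, output nodes, MFGs) triple per batch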
+ """ + + if mask: + raise NotImplementedError( + "Edge masking is currently unsupported by cuGraph-DGL" + ) + if prob: + raise NotImplementedError( + "Edge masking is currently unsupported by cuGraph-DGL" + ) + if prefetch_edge_feats: + warnings.warn("'prefetch_edge_feats' is ignored by cuGraph-DGL") + if prefetch_node_feats: + warnings.warn("'prefetch_node_feats' is ignored by cuGraph-DGL") + if prefetch_labels: + warnings.warn("'prefetch_labels' is ignored by cuGraph-DGL") + if fused: + warnings.warn("'fused' is ignored by cuGraph-DGL") + self.fanouts = fanouts_per_layer reverse_fanouts = fanouts_per_layer.copy() reverse_fanouts.reverse() @@ -58,3 +144,36 @@ def __init__( self.edge_dir = edge_dir self.replace = replace + self.__kwargs = kwargs + + def sample( + self, g: "cugraph_dgl.Graph", indices: TensorType, batch_size: int = 1 + ) -> Iterator[Tuple["torch.Tensor", "torch.Tensor", List[SparseGraph]]]: + kwargs = dict(**self.__kwargs) + + writer = DistSampleWriter( + direction=kwargs.pop("directory", None), + batches_per_partition=kwargs.pop("batches_per_partition", 256), + format=kwargs.pop("format", "parquet"), + ) + + ds = UniformNeighborSampler( + g._graph(self.edge_dir), + writer, + compression="CSC", + fanout=self._reversed_fanout_vals, + prior_sources_behavior="carryover", + deduplicate_sources=True, + compress_per_hop=True, + with_replacement=self.replace, + **kwargs, + ) + + if g.is_homogeneous: + ds.sample_from_nodes(indices, batch_size=batch_size) + return HomogeneousSampleReader(ds.get_reader()) + + raise ValueError( + "Sampling heterogeneous graphs is currently" + " unsupported in the non-dask API" + ) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py new file mode 100644 index 00000000000..252ae2c36a0 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterator, Dict, Tuple, List + +import cugraph_dgl +from cugraph_dgl.nn import SparseGraph +from cugraph_dgl.typing import TensorType +from cugraph_dgl.dataloading.utils.sampling_helpers import ( + create_homogeneous_sampled_graphs_from_tensors_csc, +) + +from cugraph.gnn import DistSampleReader + +from cugraph.utilities.utils import import_optional + +torch = import_optional("torch") + + +class SampleReader: + """ + Iterator that processes results from the cuGraph distributed sampler. + """ + + def __init__(self, base_reader: DistSampleReader): + """ + Constructs a new SampleReader. + + Parameters + ---------- + base_reader: DistSampleReader + The reader responsible for loading saved samples produced by + the cuGraph distributed sampler. 
+ """ + self.__base_reader = base_reader + self.__num_samples_remaining = 0 + self.__index = 0 + + def __next__(self): + if self._num_samples_remaining == 0: + # raw_sample_data is already a dict of tensors + self.__raw_sample_data, start_inclusive, end_inclusive = next( + self.__base_reader + ) + + self.__decoded_samples = self._decode_all(self.__raw_sample_data) + self.__num_samples_remaining = end_inclusive - start_inclusive + 1 + self.__index = 0 + + out = self.__decoded_samples[self.__index] + self.__index += 1 + self.__num_samples_remaining -= 1 + return out + + def _decode_all(self): + raise NotImplementedError("Must be implemented by subclass") + + def __iter__(self): + return self + + +class HomogeneousSampleReader(SampleReader): + """ + Subclass of SampleReader that reads DGL homogeneous output samples + produced by the cuGraph distributed sampler. + """ + + def __init__(self, base_reader: DistSampleReader): + """ + Constructs a new HomogeneousSampleReader + + Parameters + ---------- + base_reader: DistSampleReader + The reader responsible for loading saved samples produced by + the cuGraph distributed sampler. + """ + super().__init__(base_reader) + + def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"]): + create_homogeneous_sampled_graphs_from_tensors_csc( + raw_sample_data, + ) + + def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"]): + raise NotImplementedError( + "COO format is currently unsupported in the non-dask API" + ) + + def _decode_all(self, raw_sample_data: Dict[str, "torch.Tensor"]): + if "major_offsets" in raw_sample_data: + return self.__decode_csc(raw_sample_data) + else: + return self.__decode_coo(raw_sample_data) + + +class Sampler: + """ + Base sampler class for all cugraph-DGL samplers. + """ + + def __init__(self, sparse_format: str = "csc"): + """ + Parameters + ---------- + sparse_format: str + Optional (default = "coo"). + The sparse format of the emitted sampled graphs. + Currently, only "csc" is supported. + """ + + if sparse_format != "csc": + raise ValueError("Only CSC format is supported at this time") + + self.__sparse_format = sparse_format + + def sample( + self, g: cugraph_dgl.Graph, indices: TensorType, batch_size: int = 1 + ) -> Iterator[Tuple["torch.Tensor", "torch.Tensor", List[SparseGraph]]]: + """ + Samples the graph. + + Parameters + ---------- + g: cugraph_dgl.Graph + The graph being sampled. + indices: TensorType + The node ids of seed nodes where sampling will initiate from. + batch_size: int + The number of seed nodes per batch. + + Returns + ------- + Iterator[Tuple[torch.Tensor, torch.Tensor, List[cugraph_dgl.nn.SparseGraph]]] + Iterator over batches. Returns batches in the sparse + graph format, which can be converted upstream to DGL blocks + if needed. The returned tuples are in standard + DGL format: (input nodes, output nodes, blocks) where input + nodes are the renumbered input nodes, output nodes are + the renumbered output nodes, and blocks are the output graphs + for each hop. 
+ """ + + raise NotImplementedError("Must be implemented by subclass") diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 10d851ebade..1286ddfe0dc 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -139,6 +139,28 @@ def _get_tensor_d_from_sampled_df(df): return result_tensor_d +def create_homogeneous_sampled_graphs_from_tensors_dist_coo( + tensors: Dict[str, "torch.Tensor"], return_type: str = "dgl.Block" +): + """ + Creates DGL MFGs for homogeneous graphs from output + tensors from a cuGraph DistSampler. + + Parameters + ---------- + tensors: Dict[str, torch.Tensor] + The dictionary of output tensors from the bulk sampler. + return_type: str + Optional (default="dgl.Block"). + The return type for the MFGs (either "dgl.Block" or + "cugraph_dgl.nn.SparseGraph") + """ + if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: + raise ValueError( + "return_type must be either dgl.Block or cugraph_dgl.nn.SparseGraph" + ) + + def create_homogeneous_sampled_graphs_from_dataframe( sampled_df: cudf.DataFrame, edge_dir: str = "in", @@ -404,21 +426,21 @@ def create_heterogenous_dgl_block_from_tensors_dict( return block -def _process_sampled_df_csc( - df: cudf.DataFrame, +def _process_sampled_tensors_csc( + tensors: Dict["torch.Tensor"], reverse_hop_id: bool = True, ) -> Tuple[ - Dict[int, Dict[int, Dict[str, torch.Tensor]]], - List[torch.Tensor], + Dict[int, Dict[int, Dict[str, "torch.Tensor"]]], + List["torch.Tensor"], List[List[int, int]], ]: """ - Convert a dataframe generated by BulkSampler to a dictionary of tensors, to + Convert tensors generated by BulkSampler to a dictionary of tensors, to facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. Parameters ---------- - df: cudf.DataFrame + tensors: Dict[torch.Tensor] The output from BulkSampler compressed in CSC format. The dataframe should be generated with `compression="CSR"` in BulkSampler, since the sampling routine treats seed nodes as sources. @@ -442,12 +464,12 @@ def _process_sampled_df_csc( k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and destinations, respectively. """ - # dropna - major_offsets = cast_to_tensor(df.major_offsets.dropna()) - label_hop_offsets = cast_to_tensor(df.label_hop_offsets.dropna()) - renumber_map_offsets = cast_to_tensor(df.renumber_map_offsets.dropna()) - renumber_map = cast_to_tensor(df["map"].dropna()) - minors = cast_to_tensor(df.minors.dropna()) + + major_offsets = tensors["major_offsets"] + minors = tensors["minors"] + label_hop_offsets = tensors["label_hop_offsets"] + renumber_map = tensors["map"] + renumber_map_offsets = tensors["renumber_map_offsets"] n_batches = len(renumber_map_offsets) - 1 n_hops = int((len(label_hop_offsets) - 1) / n_batches) @@ -511,10 +533,58 @@ def _process_sampled_df_csc( return tensors_dict, renumber_map_list, mfg_sizes.tolist() +def _process_sampled_df_csc( + df: cudf.DataFrame, + reverse_hop_id: bool = True, +): + """ + Convert a dataframe generated by BulkSampler to a dictionary of tensors, to + facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. + + Parameters + ---------- + df: cudf.DataFrame + The output from BulkSampler compressed in CSC format. The dataframe + should be generated with `compression="CSR"` in BulkSampler, + since the sampling routine treats seed nodes as sources. 
+ + reverse_hop_id: bool (default=True) + Reverse hop id. + + Returns + ------- + tensors_dict: dict + A nested dictionary keyed by batch id and hop id. + `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets" + values for CSC MFGs. + + renumber_map_list: list + List of renumbering maps for looking up global indices of nodes. One + map for each batch. + + mfg_sizes: list + List of the number of nodes in each message passing layer. For the + k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and + destinations, respectively. + """ + + return _process_sampled_tensors_csc( + { + "major_offsets": cast_to_tensor(df.major_offsets.dropna()), + "label_hop_offsets": cast_to_tensor(df.label_hop_offsets.dropna()), + "renumber_map_offsets": cast_to_tensor(df.renumber_map_offsets.dropna()), + "map": cast_to_tensor(df["map"].dropna()), + "minors": cast_to_tensor(df.minors.dropna()), + }, + reverse_hop_id=reverse_hop_id, + ) + + def _create_homogeneous_sparse_graphs_from_csc( tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], renumber_map_list: List[torch.Tensor], mfg_sizes: List[int, int], + output_format: str = "dgl.Block", ) -> List[List[torch.Tensor, torch.Tensor, List[SparseGraph]]]: """Create mini-batches of MFGs. The input arguments are the outputs of the function `_process_sampled_df_csc`. @@ -553,5 +623,13 @@ def create_homogeneous_sampled_graphs_from_dataframe_csc(sampled_df: cudf.DataFr """Public API to create mini-batches of MFGs using a dataframe output by BulkSampler, where the sampled graph is compressed in CSC format.""" return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_df_csc(sampled_df)) + *(_process_sampled_df_csc(sampled_df)), + ) + + +def create_homogeneous_sampled_graphs_from_tensors_csc(tensors: Dict["torch.Tensor"]): + """Public API to create mini-batches of MFGs using a dataframe output by + BulkSampler, where the sampled graph is compressed in CSC format.""" + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_tensors_csc(tensors)), ) From a8c0848b60b8f1f84c283b536ec6d2f8d5d99398 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 27 Jun 2024 15:16:21 -0700 Subject: [PATCH 13/47] add todo --- python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 73130e2dfb0..924ab696199 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -22,7 +22,7 @@ from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor dgl = import_optional("dgl") -torch = import_optional("torch") +torch = import_optional('torch') class DataLoader: @@ -150,6 +150,7 @@ def dataset( return self.__dataset def __iter__(self): + # TODO convert to the right output format and device return self.__sampler.sample( self.__graph, self.__dataset, From 913b8cd372adb5498a6d439a3c57f30bb7b4207b Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 28 Jun 2024 12:35:17 -0700 Subject: [PATCH 14/47] fix block issue, typing --- .../cugraph_dgl/dataloading/dataloader.py | 4 +- .../dataloading/neighbor_sampler.py | 24 +++-- .../cugraph_dgl/dataloading/sampler.py | 70 ++++++++++----- .../dataloading/utils/sampling_helpers.py | 88 ++++++++++++------- python/cugraph-dgl/cugraph_dgl/typing.py | 28 +++++- 5 files changed, 153 insertions(+), 61 deletions(-) diff 
--git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 924ab696199..862f58af45d 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -22,7 +22,7 @@ from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor dgl = import_optional("dgl") -torch = import_optional('torch') +torch = import_optional("torch") class DataLoader: @@ -150,7 +150,7 @@ def dataset( return self.__dataset def __iter__(self): - # TODO convert to the right output format and device + # TODO move to the correct device return self.__sampler.sample( self.__graph, self.__dataset, diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index f64eb0d8866..398a44fb3b5 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -22,8 +22,7 @@ import cugraph_dgl -from cugraph_dgl.nn import SparseGraph -from cugraph_dgl.typing import TensorType +from cugraph_dgl.typing import TensorType, DGLSamplerOutput from cugraph_dgl.dataloading.sampler import HomogeneousSampleReader torch = import_optional("torch") @@ -74,6 +73,8 @@ def __init__( prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None, output_device: Optional[Union["torch.device", int, str]] = None, fused: bool = True, + sparse_format="csc", + output_format="dgl.Block", **kwargs, ): """ @@ -114,6 +115,14 @@ def __init__( fused: bool Optional (default=True). This argument is ignored by cuGraph-DGL. + sparse_format: str + Optional (default = "coo"). + The sparse format of the emitted sampled graphs. + Currently, only "csc" is supported. + output_format: str + Optional (default = "dgl.Block") + The output format of the emitted sampled graphs. + Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". **kwargs Keyword arguments for the underlying cuGraph distributed sampler and writer (directory, batches_per_partition, format, @@ -146,9 +155,14 @@ def __init__( self.replace = replace self.__kwargs = kwargs + super( + sparse_format=sparse_format, + output_format=output_format, + ) + def sample( self, g: "cugraph_dgl.Graph", indices: TensorType, batch_size: int = 1 - ) -> Iterator[Tuple["torch.Tensor", "torch.Tensor", List[SparseGraph]]]: + ) -> Iterator[DGLSamplerOutput]: kwargs = dict(**self.__kwargs) writer = DistSampleWriter( @@ -160,7 +174,7 @@ def sample( ds = UniformNeighborSampler( g._graph(self.edge_dir), writer, - compression="CSC", + compression=self.sparse_format.upper(), fanout=self._reversed_fanout_vals, prior_sources_behavior="carryover", deduplicate_sources=True, @@ -171,7 +185,7 @@ def sample( if g.is_homogeneous: ds.sample_from_nodes(indices, batch_size=batch_size) - return HomogeneousSampleReader(ds.get_reader()) + return HomogeneousSampleReader(ds.get_reader(), self.output_format) raise ValueError( "Sampling heterogeneous graphs is currently" diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py index 252ae2c36a0..1ee76882bdd 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -11,11 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterator, Dict, Tuple, List +from typing import Iterator, Dict, Tuple, List, Union import cugraph_dgl from cugraph_dgl.nn import SparseGraph -from cugraph_dgl.typing import TensorType +from cugraph_dgl.typing import TensorType, DGLSamplerOutput from cugraph_dgl.dataloading.utils.sampling_helpers import ( create_homogeneous_sampled_graphs_from_tensors_csc, ) @@ -25,6 +25,7 @@ from cugraph.utilities.utils import import_optional torch = import_optional("torch") +dgl = import_optional("dgl") class SampleReader: @@ -32,7 +33,7 @@ class SampleReader: Iterator that processes results from the cuGraph distributed sampler. """ - def __init__(self, base_reader: DistSampleReader): + def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Block"): """ Constructs a new SampleReader. @@ -42,11 +43,16 @@ def __init__(self, base_reader: DistSampleReader): The reader responsible for loading saved samples produced by the cuGraph distributed sampler. """ + self.__output_format = output_format self.__base_reader = base_reader self.__num_samples_remaining = 0 self.__index = 0 - def __next__(self): + @property + def output_format(self) -> str: + return self.__output_format + + def __next__(self) -> DGLSamplerOutput: if self._num_samples_remaining == 0: # raw_sample_data is already a dict of tensors self.__raw_sample_data, start_inclusive, end_inclusive = next( @@ -62,10 +68,10 @@ def __next__(self): self.__num_samples_remaining -= 1 return out - def _decode_all(self): + def _decode_all(self) -> List[DGLSamplerOutput]: raise NotImplementedError("Must be implemented by subclass") - def __iter__(self): + def __iter__(self) -> DGLSamplerOutput: return self @@ -75,7 +81,7 @@ class HomogeneousSampleReader(SampleReader): produced by the cuGraph distributed sampler. """ - def __init__(self, base_reader: DistSampleReader): + def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Block"): """ Constructs a new HomogeneousSampleReader @@ -84,20 +90,29 @@ def __init__(self, base_reader: DistSampleReader): base_reader: DistSampleReader The reader responsible for loading saved samples produced by the cuGraph distributed sampler. + output_format: str + The output format for blocks (either "dgl.Block" or + "cugraph_dgl.nn.SparseGraph"). """ - super().__init__(base_reader) + super().__init__(base_reader, output_format=output_format) - def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"]): + def __decode_csc( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: create_homogeneous_sampled_graphs_from_tensors_csc( - raw_sample_data, + raw_sample_data, output_format=self.output_format ) - def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"]): + def __decode_coo( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: raise NotImplementedError( "COO format is currently unsupported in the non-dask API" ) - def _decode_all(self, raw_sample_data: Dict[str, "torch.Tensor"]): + def _decode_all( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: if "major_offsets" in raw_sample_data: return self.__decode_csc(raw_sample_data) else: @@ -109,24 +124,39 @@ class Sampler: Base sampler class for all cugraph-DGL samplers. """ - def __init__(self, sparse_format: str = "csc"): + def __init__(self, sparse_format: str = "csc", output_format="dgl.Block"): """ Parameters ---------- sparse_format: str - Optional (default = "coo"). - The sparse format of the emitted sampled graphs. 
- Currently, only "csc" is supported. + Optional (default = "coo"). + The sparse format of the emitted sampled graphs. + Currently, only "csc" is supported. + output_format: str + Optional (default = "dgl.Block") + The output format of the emitted sampled graphs. + Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". """ if sparse_format != "csc": raise ValueError("Only CSC format is supported at this time") self.__sparse_format = sparse_format + self.__output_format = output_format + + @property + def output_format(self): + return self.__output_format + + @property + def sparse_format(self): + return self.__sparse_format def sample( self, g: cugraph_dgl.Graph, indices: TensorType, batch_size: int = 1 - ) -> Iterator[Tuple["torch.Tensor", "torch.Tensor", List[SparseGraph]]]: + ) -> Iterator[ + Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]] + ]: """ Samples the graph. @@ -141,10 +171,8 @@ def sample( Returns ------- - Iterator[Tuple[torch.Tensor, torch.Tensor, List[cugraph_dgl.nn.SparseGraph]]] - Iterator over batches. Returns batches in the sparse - graph format, which can be converted upstream to DGL blocks - if needed. The returned tuples are in standard + Iterator[DGLSamplerOutput] + Iterator over batches. The returned tuples are in standard DGL format: (input nodes, output nodes, blocks) where input nodes are the renumbered input nodes, output nodes are the renumbered output nodes, and blocks are the output graphs diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 1286ddfe0dc..f98909450cd 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -139,28 +139,6 @@ def _get_tensor_d_from_sampled_df(df): return result_tensor_d -def create_homogeneous_sampled_graphs_from_tensors_dist_coo( - tensors: Dict[str, "torch.Tensor"], return_type: str = "dgl.Block" -): - """ - Creates DGL MFGs for homogeneous graphs from output - tensors from a cuGraph DistSampler. - - Parameters - ---------- - tensors: Dict[str, torch.Tensor] - The dictionary of output tensors from the bulk sampler. - return_type: str - Optional (default="dgl.Block"). - The return type for the MFGs (either "dgl.Block" or - "cugraph_dgl.nn.SparseGraph") - """ - if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: - raise ValueError( - "return_type must be either dgl.Block or cugraph_dgl.nn.SparseGraph" - ) - - def create_homogeneous_sampled_graphs_from_dataframe( sampled_df: cudf.DataFrame, edge_dir: str = "in", @@ -580,11 +558,41 @@ def _process_sampled_df_csc( ) +def _create_homogeneous_blocks_from_csc( + tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], + renumber_map_list: List[torch.Tensor], + mfg_sizes: List[int, int], +): + """Create mini-batches of MFGs in the dgl.Block format. + The input arguments are the outputs of + the function `_process_sampled_df_csc`. + + Returns + ------- + output: list + A list of mini-batches. Each mini-batch is a list that consists of + `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. 
+ """ + n_batches = len(mfg_sizes) + output = [] + for b_id in range(n_batches): + output_batch = [] + output_batch.append(renumber_map_list[b_id]) + output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) + + mfgs = _create_homogeneous_sampled_graphs_from_tensors_perhop( + tensors_batch_d=tensors_dict[b_id], edge_dir="in", return_type="dgl.Block" + )[2] + + output_batch.append(mfgs) + + output.append(output_batch) + + def _create_homogeneous_sparse_graphs_from_csc( tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], renumber_map_list: List[torch.Tensor], mfg_sizes: List[int, int], - output_format: str = "dgl.Block", ) -> List[List[torch.Tensor, torch.Tensor, List[SparseGraph]]]: """Create mini-batches of MFGs. The input arguments are the outputs of the function `_process_sampled_df_csc`. @@ -619,17 +627,35 @@ def _create_homogeneous_sparse_graphs_from_csc( return output -def create_homogeneous_sampled_graphs_from_dataframe_csc(sampled_df: cudf.DataFrame): +def create_homogeneous_sampled_graphs_from_dataframe_csc( + sampled_df: cudf.DataFrame, output_format: str = "cugraph_dgl.nn.SparseGraph" +): """Public API to create mini-batches of MFGs using a dataframe output by BulkSampler, where the sampled graph is compressed in CSC format.""" - return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_df_csc(sampled_df)), - ) + if output_format == "cugraph_dgl.nn.SparseGraph": + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_df_csc(sampled_df)), + ) + elif output_format == "dgl.Block": + return _create_homogeneous_blocks_from_csc( + *(_process_sampled_df_csc(sampled_df)), + ) + else: + raise ValueError(f"Invalid output format {output_format}") -def create_homogeneous_sampled_graphs_from_tensors_csc(tensors: Dict["torch.Tensor"]): +def create_homogeneous_sampled_graphs_from_tensors_csc( + tensors: Dict["torch.Tensor"], output_format: str = "cugraph_dgl.nn.SparseGraph" +): """Public API to create mini-batches of MFGs using a dataframe output by BulkSampler, where the sampled graph is compressed in CSC format.""" - return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_tensors_csc(tensors)), - ) + if output_format == "cugraph_dgl.nn.SparseGraph": + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_tensors_csc(tensors)), + ) + elif output_format == "dgl.Block": + return _create_homogeneous_blocks_from_csc( + *(_process_sampled_tensors_csc(tensors)), + ) + else: + raise ValueError(f"Invalid output format {output_format}") diff --git a/python/cugraph-dgl/cugraph_dgl/typing.py b/python/cugraph-dgl/cugraph_dgl/typing.py index 7a16a1b3dfd..a68463c3fd9 100644 --- a/python/cugraph-dgl/cugraph_dgl/typing.py +++ b/python/cugraph-dgl/cugraph_dgl/typing.py @@ -11,6 +11,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Union +from typing import List, Union, Tuple +from cugraph.utilities.utils import import_optional -TensorType = Union["torch.Tensor", "cupy.ndarray", "numpy.ndarray", "cudf.Series", "pandas.Series", List[int]] +from cugraph_dgl.nn import SparseGraph + +import pandas +import numpy +import cupy +import cudf + +torch = import_optional("torch") +dgl = import_optional("dgl") + +TensorType = Union[ + "torch.Tensor", + "cupy.ndarray", + "numpy.ndarray", + "cudf.Series", + "pandas.Series", + List[int], +] + +DGLSamplerOutput = Tuple[ + "torch.Tensor", + "torch.Tensor", + List[Union["dgl.Block", SparseGraph]], +] From 79c8f7870adf734196c85f3d8b3f14ed1aa7adef Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 1 Jul 2024 10:12:37 -0700 Subject: [PATCH 15/47] reorganize tests --- python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py | 1 - .../{test_dataloader.py => dataloading/test_dask_dataloader.py} | 0 .../test_dask_dataloader_mg.py} | 0 .../cugraph_dgl/tests/{ => dataloading}/test_dataset.py | 0 4 files changed, 1 deletion(-) rename python/cugraph-dgl/cugraph_dgl/tests/{test_dataloader.py => dataloading/test_dask_dataloader.py} (100%) rename python/cugraph-dgl/cugraph_dgl/tests/{test_dataloader_mg.py => dataloading/test_dask_dataloader_mg.py} (100%) rename python/cugraph-dgl/cugraph_dgl/tests/{ => dataloading}/test_dataset.py (100%) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index 398a44fb3b5..7c98bd3c301 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -20,7 +20,6 @@ from cugraph.gnn import UniformNeighborSampler, DistSampleWriter from cugraph.utilities.utils import import_optional - import cugraph_dgl from cugraph_dgl.typing import TensorType, DGLSamplerOutput from cugraph_dgl.dataloading.sampler import HomogeneousSampleReader diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py similarity index 100% rename from python/cugraph-dgl/cugraph_dgl/tests/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py similarity index 100% rename from python/cugraph-dgl/cugraph_dgl/tests/test_dataloader_mg.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_dataset.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py similarity index 100% rename from python/cugraph-dgl/cugraph_dgl/tests/test_dataset.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py From a56b56d4e27938dd7650bc7bd8ba4c036cf1f7cb Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 2 Jul 2024 11:22:31 -0700 Subject: [PATCH 16/47] sampling --- .../cugraph_dgl/dataloading/__init__.py | 10 +++- .../cugraph_dgl/dataloading/dataloader.py | 8 +++ .../dataloading/neighbor_sampler.py | 22 ++++++--- .../cugraph_dgl/dataloading/sampler.py | 6 +-- .../dataloading/utils/sampling_helpers.py | 34 +++++++++++-- .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 25 +++++++++- .../tests/dataloading/test_dask_dataloader.py | 4 +- .../dataloading/test_dask_dataloader_mg.py | 4 +- 
.../tests/dataloading/test_dataloader.py | 49 +++++++++++++++++++ .../cugraph_dgl/tests/test_graph.py | 3 ++ .../cugraph_dgl/tests/test_graph_mg.py | 4 +- 11 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py index 5a775f0e88c..9ee5e6a970f 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py @@ -17,10 +17,18 @@ HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset, ) + +from cugraph_dgl.dataloading.sampler import Sampler from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler + from cugraph_dgl.dataloading.dask_dataloader import DaskDataLoader def DataLoader(*args, **kwargs): - warnings.warn("DataLoader has been renamed to DaskDataLoader", FutureWarning) + warnings.warn( + "DataLoader has been renamed to DaskDataLoader. " + "In Release 24.10, cugraph_dgl.dataloading.dataloader.DataLoader " + "will take over the DataLoader name.", + FutureWarning + ) return DaskDataLoader(*args, **kwargs) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 862f58af45d..b3af6fc3bc9 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -156,3 +156,11 @@ def __iter__(self): self.__dataset, self.__batch_size, ) + + """ + start, end, blocks = out + + start = start.to(self.__device) + end = end.to(self.__device) + blocks = [b.to(self.__device) for b in blocks] + """ \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index 7c98bd3c301..f77b00bbac8 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -14,6 +14,7 @@ from __future__ import annotations import warnings +import tempfile from typing import Sequence, Optional, Union, List, Tuple, Iterator @@ -22,12 +23,12 @@ import cugraph_dgl from cugraph_dgl.typing import TensorType, DGLSamplerOutput -from cugraph_dgl.dataloading.sampler import HomogeneousSampleReader +from cugraph_dgl.dataloading.sampler import Sampler, HomogeneousSampleReader torch = import_optional("torch") -class NeighborSampler: +class NeighborSampler(Sampler): """Sampler that builds computational dependency of node representations via neighbor sampling for multilayer GNN. This sampler will make every node gather messages from a fixed number of neighbors @@ -71,7 +72,7 @@ def __init__( ] = None, prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None, output_device: Optional[Union["torch.device", int, str]] = None, - fused: bool = True, + fused: Optional[bool] = None, sparse_format="csc", output_format="dgl.Block", **kwargs, @@ -112,7 +113,7 @@ def __init__( Optional. Output device for samples. Defaults to the current device. fused: bool - Optional (default=True). + Optional. This argument is ignored by cuGraph-DGL. sparse_format: str Optional (default = "coo"). 
@@ -154,18 +155,24 @@ def __init__( self.replace = replace self.__kwargs = kwargs - super( + super().__init__( sparse_format=sparse_format, output_format=output_format, ) def sample( - self, g: "cugraph_dgl.Graph", indices: TensorType, batch_size: int = 1 + self, g: "cugraph_dgl.Graph", indices: Iterator["torch.Tensor"], batch_size: int = 1 ) -> Iterator[DGLSamplerOutput]: kwargs = dict(**self.__kwargs) + directory = kwargs.pop('directory', None) + if directory is None: + warnings.warn("Setting a directory to store samples is recommended.") + self._tempdir = tempfile.TemporaryDirectory() + directory = self._tempdir.name + writer = DistSampleWriter( - direction=kwargs.pop("directory", None), + directory=directory, batches_per_partition=kwargs.pop("batches_per_partition", 256), format=kwargs.pop("format", "parquet"), ) @@ -183,6 +190,7 @@ def sample( ) if g.is_homogeneous: + indices = torch.concat(list(indices)) ds.sample_from_nodes(indices, batch_size=batch_size) return HomogeneousSampleReader(ds.get_reader(), self.output_format) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py index 1ee76882bdd..20bce365c7e 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -53,7 +53,7 @@ def output_format(self) -> str: return self.__output_format def __next__(self) -> DGLSamplerOutput: - if self._num_samples_remaining == 0: + if self.__num_samples_remaining == 0: # raw_sample_data is already a dict of tensors self.__raw_sample_data, start_inclusive, end_inclusive = next( self.__base_reader @@ -99,7 +99,7 @@ def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Bloc def __decode_csc( self, raw_sample_data: Dict[str, "torch.Tensor"] ) -> List[DGLSamplerOutput]: - create_homogeneous_sampled_graphs_from_tensors_csc( + return create_homogeneous_sampled_graphs_from_tensors_csc( raw_sample_data, output_format=self.output_format ) @@ -153,7 +153,7 @@ def sparse_format(self): return self.__sparse_format def sample( - self, g: cugraph_dgl.Graph, indices: TensorType, batch_size: int = 1 + self, g: cugraph_dgl.Graph, indices: Iterator["torch.Tensor"], batch_size: int = 1 ) -> Iterator[ Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]] ]: diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index f98909450cd..da40fb6f564 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -573,20 +573,44 @@ def _create_homogeneous_blocks_from_csc( A list of mini-batches. Each mini-batch is a list that consists of `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. 
""" - n_batches = len(mfg_sizes) + n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1 output = [] for b_id in range(n_batches): output_batch = [] output_batch.append(renumber_map_list[b_id]) output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) - mfgs = _create_homogeneous_sampled_graphs_from_tensors_perhop( - tensors_batch_d=tensors_dict[b_id], edge_dir="in", return_type="dgl.Block" - )[2] + mfgs = [ + SparseGraph( + size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]), + src_ids=tensors_dict[b_id][h_id]["minors"], + cdst_ids=tensors_dict[b_id][h_id]["major_offsets"], + formats=["csc", "coo"], + reduce_memory=True, + ) + for h_id in range(n_hops) + ] - output_batch.append(mfgs) + blocks = [] + seednodes_range=None + for mfg in mfgs: + block_mfg = _create_homogeneous_dgl_block_from_tensor_d( + {'sources': mfg.src_ids(), 'destinations': mfg.dst_ids(), 'sources_range': mfg._num_src_nodes-1, 'destinations_range': mfg._num_dst_nodes-1}, + renumber_map=renumber_map_list[b_id], + seednodes_range=seednodes_range + ) + + seednodes_range = max( + mfg._num_src_nodes-1, + mfg._num_dst_nodes-1, + ) + blocks.append(block_mfg) + del mfgs + + output_batch.append(blocks) output.append(output_batch) + return output def _create_homogeneous_sparse_graphs_from_csc( diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index ddd95a76366..d2460f814c9 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,6 +15,8 @@ from cugraph.utilities.utils import import_optional +import cugraph_dgl + torch = import_optional("torch") ops_torch = import_optional("pylibcugraphops.pytorch") dgl = import_optional("dgl") @@ -255,6 +257,27 @@ def __repr__(self) -> str: f"num_edges={self._src_ids.size(0)}, formats={self._formats})" ) + def to(self, device: Union[torch.device, str, int]) -> "cugraph_dgl.nn.SparseGraph": + sg = SparseGraph( + src_ids=None if self._src_ids is None else self._src_ids.to(device), + dst_ids=None if self._dst_ids is None else self._dst_ids.to(device), + csrc_ids=None if self._csrc_ids is None else self._csrc_ids.to(device), + cdst_ids=None if self._cdst_ids is None else self._cdst_ids.to(device), + values=None if self._values is None else self._values.to(device), + is_sorted=self._is_sorted, + formats=self._formats, + reduce_memory=self._reduce_memory, + ) + + sg._perm_coo2csc = ( + None if self._perm_coo2csc is None else self._perm_coo2csc.to(device) + ) + sg._perm_csc2csr = ( + None if self._perm_csc2csr is None else self._perm_csc2csr.to(device) + ) + + return sg + class BaseConv(torch.nn.Module): r"""An abstract base class for cugraph-ops nn module.""" diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py index cc473cd0ad6..e2542657de4 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -52,7 +52,7 @@ def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) tempdir_object = tempfile.TemporaryDirectory() sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DataLoader( + dataloader = cugraph_dgl.dataloading.DaskDataLoader( cugraph_gs, train_nid, sampler, diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py index 29b7e1c3412..d49e1293e77 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -51,7 +51,7 @@ def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) tempdir_object = tempfile.TemporaryDirectory() sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DataLoader( + dataloader = cugraph_dgl.dataloading.DaskDataLoader( cugraph_gs, train_nid, sampler, diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py new file mode 100644 index 00000000000..d29f1c8ea41 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
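The rename above leaves the old spelling working but deprecated. A hedged sketch of the resulting behavior; `gs`, `train_nid`, and `sampler` are hypothetical stand-ins for objects built as in the tests above:

import warnings
import cugraph_dgl.dataloading as dl

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    loader = dl.DataLoader(gs, train_nid, sampler)  # deprecated spelling

# The shim warns once, then forwards its arguments to DaskDataLoader.
assert any(issubclass(w.category, FutureWarning) for w in caught)
assert isinstance(loader, dl.DaskDataLoader)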
+ +import pytest + +import cugraph_dgl +from cugraph_dgl.dataloading.dataloader import DataLoader +from cugraph_dgl.dataloading import NeighborSampler + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional('torch') +dgl = import_optional('dgl') + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +def test_dataloader_basic_homogeneous(): + graph = cugraph_dgl.Graph( + is_multi_gpu=False + ) + + num_nodes = karate.number_of_nodes() + graph.add_nodes( + num_nodes, + data={'z': torch.arange(num_nodes)} + ) + + edf = karate.get_edgelist() + graph.add_edges( + u=edf['src'], + v=edf['dst'], + data={'q': torch.arange(karate.number_of_edges())} + ) + + sampler = NeighborSampler([5, 5, 5]) + loader = DataLoader(graph, torch.arange(num_nodes), sampler, batch_size=2) + + print(next(iter(loader))) \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py index 89a74ff073c..e47e280c48e 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py @@ -22,9 +22,11 @@ from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") +dgl = import_optional("dgl") @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") @pytest.mark.parametrize("direction", ["out", "in"]) def test_graph_make_homogeneous_graph(direction): df = karate.get_edgelist() @@ -92,6 +94,7 @@ def test_graph_make_homogeneous_graph(direction): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") @pytest.mark.parametrize("direction", ["out", "in"]) def test_graph_make_heterogeneous_graph(direction): df = karate.get_edgelist() diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py index 0dfde6b9715..f0561c41095 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -34,7 +34,7 @@ pylibwholegraph = import_optional("pylibwholegraph") torch = import_optional("torch") - +dgl = import_optional('dgl') def init_pytorch_worker(rank, world_size, cugraph_id): import rmm @@ -160,6 +160,7 @@ def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): @pytest.mark.skipif( isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") @pytest.mark.parametrize("direction", ["out", "in"]) def test_graph_make_homogeneous_graph_mg(direction): uid = cugraph_comms_create_unique_id() @@ -324,6 +325,7 @@ def run_test_graph_make_heterogeneous_graph_mg(rank, uid, world_size, direction) @pytest.mark.skipif( isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") @pytest.mark.parametrize("direction", ["out", "in"]) def test_graph_make_heterogeneous_graph_mg(direction): uid = cugraph_comms_create_unique_id() From 8f14f88a7f5acb2a42e10c7db6a19c3e20d9f3f1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 2 Jul 2024 
12:11:15 -0700 Subject: [PATCH 17/47] revert dependencies.yaml --- dependencies.yaml | 184 +++++++++++++++++++++++++++++++++------------- 1 file changed, 133 insertions(+), 51 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 7b42c666792..fdb6f278265 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -12,6 +12,7 @@ files: - cuda - cuda_version - docs + - python_build_rapids - python_build_wheel - python_build_cythonize - depends_on_rmm @@ -73,13 +74,20 @@ files: pyproject_dir: python/cugraph extras: table: build-system + includes: + - python_build_rapids + - python_build_cythonize + py_rapids_build_cugraph: + output: pyproject + pyproject_dir: python/cugraph + extras: + table: tool.rapids-build-backend + key: requires includes: - common_build - - python_build_wheel - depends_on_rmm - depends_on_pylibraft - depends_on_pylibcugraph - - python_build_cythonize py_run_cugraph: output: pyproject pyproject_dir: python/cugraph @@ -108,12 +116,19 @@ files: pyproject_dir: python/pylibcugraph extras: table: build-system + includes: + - python_build_rapids + - python_build_cythonize + py_rapids_build_pylibcugraph: + output: pyproject + pyproject_dir: python/pylibcugraph + extras: + table: tool.rapids-build-backend + key: requires includes: - common_build - - python_build_wheel - depends_on_rmm - depends_on_pylibraft - - python_build_cythonize py_run_pylibcugraph: output: pyproject pyproject_dir: python/pylibcugraph @@ -138,6 +153,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_nx_cugraph: output: pyproject @@ -163,6 +179,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_cugraph_dgl: output: pyproject @@ -188,6 +205,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_cugraph_pyg: output: pyproject @@ -213,6 +231,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_cugraph_equivariant: output: pyproject @@ -235,6 +254,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_cugraph_service_client: output: pyproject @@ -249,6 +269,7 @@ files: extras: table: build-system includes: + - python_build_rapids - python_build_wheel py_run_cugraph_service_server: output: pyproject @@ -363,11 +384,11 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcudf==24.8.* - - libcugraphops==24.8.* - - libraft-headers==24.8.* - - libraft==24.8.* - - librmm==24.8.* + - libcudf==24.8.*,>=0.0.0a0 + - libcugraphops==24.8.*,>=0.0.0a0 + - libraft-headers==24.8.*,>=0.0.0a0 + - libraft==24.8.*,>=0.0.0a0 + - librmm==24.8.*,>=0.0.0a0 - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -431,6 +452,11 @@ dependencies: - matrix: packages: - python>=3.9,<3.12 + python_build_rapids: + common: + - output_types: [conda, pyproject, requirements] + packages: + - rapids-build-backend>=0.3.1,<0.4.0.dev0 python_build_wheel: common: - output_types: [conda, pyproject, requirements] @@ -452,11 +478,10 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==24.6.* - - &dask_cuda dask-cuda==24.6.* + - &dask rapids-dask-dependency==24.8.*,>=0.0.0a0 + - &dask_cuda dask-cuda==24.8.*,>=0.0.0a0 - &numba numba>=0.57 - &numpy numpy>=1.23,<2.0a0 - - &ucx_py ucx-py==0.38.* - output_types: conda packages: - aiohttp @@ -464,11 +489,26 @@ dependencies: - requests - nccl>=2.9.9 - 
ucx-proc=*=gpu + - &ucx_py ucx-py==0.39.*,>=0.0.0a0 - output_types: pyproject packages: # cudf uses fsspec but is protocol independent. cugraph # dataset APIs require [http] extras for use with cudf. - fsspec[http]>=0.6.0 + specific: + - output_types: pyproject + matrices: + - matrix: + cuda: "11.*" + packages: + - &ucx_py_cu11 ucx-py-cu11==0.39.*,>=0.0.0a0 + - matrix: + cuda: "12.*" + packages: + - &ucx_py_cu12 ucx-py-cu12==0.39.*,>=0.0.0a0 + - matrix: + packages: + - *ucx_py python_run_nx_cugraph: common: - output_types: [conda, pyproject] @@ -481,20 +521,40 @@ dependencies: packages: - *numba - *numpy - - &tensordict tensordict>=0.1.2,<0.3.1 + specific: - output_types: [pyproject] - packages: - - &cugraph cugraph==24.6.* + matrices: + - matrix: + cuda: "11.*" + packages: + - &cugraph_cu11 cugraph-cu11==24.8.*,>=0.0.0a0 + - matrix: + cuda: "12.*" + packages: + - &cugraph_cu12 cugraph-cu12==24.8.*,>=0.0.0a0 + - matrix: + packages: + - &cugraph cugraph==24.8.*,>=0.0.0a0 python_run_cugraph_pyg: common: - output_types: [conda, pyproject] packages: - *numba - *numpy - - *tensordict + specific: - output_types: [pyproject] - packages: - - *cugraph + matrices: + - matrix: + cuda: "11.*" + packages: + - *cugraph_cu11 + - matrix: + cuda: "12.*" + packages: + - *cugraph_cu12 + - matrix: + packages: + - *cugraph python_run_cugraph_service_client: common: - output_types: [conda, pyproject] @@ -509,11 +569,27 @@ dependencies: - *numba - *numpy - *thrift - - *ucx_py - output_types: pyproject packages: - *cugraph - - cugraph-service-client==24.6.* + - cugraph-service-client==24.8.*,>=0.0.0a0 + - output_types: conda + packages: + - *ucx_py + specific: + - output_types: pyproject + matrices: + - matrix: + cuda: "11.*" + packages: + - *ucx_py_cu11 + - matrix: + cuda: "12.*" + packages: + - *ucx_py_cu12 + - matrix: + packages: + - *ucx_py test_cpp: common: - output_types: conda @@ -548,7 +624,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - pylibwholegraph==24.8.* + - &pylibwholegraph_conda pylibwholegraph==24.8.*,>=0.0.0a0 # this thriftpy2 entry can be removed entirely (or switched to a '!=') # once a new release of that project resolves https://github.com/Thriftpy/thriftpy2/issues/281 - thriftpy2<=0.5.0 @@ -568,19 +644,18 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.6.* + - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - *tensordict - dgl>=1.1.0.cu* cugraph_pyg_dev: common: - output_types: [conda] packages: - - cugraph==24.6.* + - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - *tensordict + - &tensordict tensordict>=0.1.2 - pyg>=2.5,<2.6 depends_on_pytorch: @@ -590,25 +665,32 @@ dependencies: - &pytorch_conda pytorch>=2.0,<2.2.0a0 specific: + - output_types: [requirements] + matrices: + - matrix: {cuda: "12.*"} + packages: + - --extra-index-url=https://download.pytorch.org/whl/cu121 + - matrix: {cuda: "11.*"} + packages: + - --extra-index-url=https://download.pytorch.org/whl/cu118 + - {matrix: null, packages: null} - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} packages: - &pytorch_pip torch>=2.0,<2.2.0a0 - *tensordict - - --extra-index-url=https://download.pytorch.org/whl/cu121 - matrix: {cuda: "11.*"} packages: - *pytorch_pip - *tensordict - - --extra-index-url=https://download.pytorch.org/whl/cu118 - {matrix: null, packages: [*pytorch_pip, *tensordict]} depends_on_pylibwholegraph: common: - output_types: conda packages: - - &pylibwholegraph_conda 
pylibwholegraph==24.8.* + - *pylibwholegraph_conda - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -619,17 +701,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibwholegraph-cu12==24.8.* + - pylibwholegraph-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - pylibwholegraph-cu11==24.8.* + - pylibwholegraph-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*pylibwholegraph_conda]} depends_on_rmm: common: - output_types: conda packages: - - &rmm_conda rmm==24.6.* + - &rmm_conda rmm==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -640,17 +722,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.6.* + - rmm-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.6.* + - rmm-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_conda]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.6.* + - &cudf_conda cudf==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -661,17 +743,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.6.* + - cudf-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.6.* + - cudf-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_conda]} depends_on_dask_cudf: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==24.6.* + - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -682,17 +764,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - dask-cudf-cu12==24.6.* + - dask-cudf-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - dask-cudf-cu11==24.6.* + - dask-cudf-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*dask_cudf_conda]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==24.6.* + - &pylibraft_conda pylibraft==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -703,17 +785,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibraft-cu12==24.6.* + - pylibraft-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - pylibraft-cu11==24.6.* + - pylibraft-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_conda]} depends_on_raft_dask: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==24.6.* + - &raft_dask_conda raft-dask==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -724,17 +806,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - raft-dask-cu12==24.6.* + - raft-dask-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - raft-dask-cu11==24.6.* + - raft-dask-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*raft_dask_conda]} depends_on_pylibcugraph: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==24.6.* + - &pylibcugraph_conda pylibcugraph==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -745,17 +827,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibcugraph-cu12==24.6.* + - 
pylibcugraph-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - pylibcugraph-cu11==24.6.* + - pylibcugraph-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcugraph_conda]} depends_on_pylibcugraphops: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==24.6.* + - &pylibcugraphops_conda pylibcugraphops==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -766,10 +848,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pylibcugraphops-cu12==24.6.* + - pylibcugraphops-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - pylibcugraphops-cu11==24.6.* + - pylibcugraphops-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcugraphops_conda]} depends_on_cupy: From 5f74252c4b0e3fa395a11879e80696bb92372171 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 2 Jul 2024 12:13:15 -0700 Subject: [PATCH 18/47] update tensordict dependency --- dependencies.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index fdb6f278265..31683483967 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -647,6 +647,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 + - &tensordict tensordict>=0.1.2 - dgl>=1.1.0.cu* cugraph_pyg_dev: common: @@ -655,7 +656,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - &tensordict tensordict>=0.1.2 + - *tensordict - pyg>=2.5,<2.6 depends_on_pytorch: From b2fdef84931f5a5261d1158fef030dde5cbe18cb Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 2 Jul 2024 14:35:38 -0700 Subject: [PATCH 19/47] update dependencies --- dependencies.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 31683483967..4938b8f79e2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -647,7 +647,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - &tensordict tensordict>=0.1.2 + - tensordict>=0.1.2 - dgl>=1.1.0.cu* cugraph_pyg_dev: common: @@ -656,7 +656,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - *tensordict + - tensordict>=0.1.2 - pyg>=2.5,<2.6 depends_on_pytorch: @@ -664,6 +664,8 @@ dependencies: - output_types: [conda] packages: - &pytorch_conda pytorch>=2.0,<2.2.0a0 + - torchdata + - pydantic specific: - output_types: [requirements] From 92fd8665b20f5ad2910cb9a24c7938c991a26bf4 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 2 Jul 2024 14:38:41 -0700 Subject: [PATCH 20/47] update meta files --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 ++ conda/environments/all_cuda-122_arch-x86_64.yaml | 2 ++ conda/recipes/cugraph-dgl/meta.yaml | 2 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- dependencies.yaml | 4 ++-- python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml | 1 + 6 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 177145cc44c..4de237a2b0e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -46,6 +46,7 @@ dependencies: - packaging>=21 - pandas - pre-commit +- pydantic - pydata-sphinx-theme - pylibcugraphops==24.8.*,>=0.0.0a0 - pylibraft==24.8.*,>=0.0.0a0 @@ -72,6 +73,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - thriftpy2<=0.5.0 +- 
torchdata - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 9b77955e0fc..5c8c773d0d6 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -51,6 +51,7 @@ dependencies: - packaging>=21 - pandas - pre-commit +- pydantic - pydata-sphinx-theme - pylibcugraphops==24.8.*,>=0.0.0a0 - pylibraft==24.8.*,>=0.0.0a0 @@ -77,6 +78,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - thriftpy2<=0.5.0 +- torchdata - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 7346c9e6f94..7c30c9c602b 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -28,7 +28,7 @@ requirements: - numba >=0.57 - numpy >=1.23,<2.0a0 - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2,<0.3.1a0 + - tensordict >=0.1.2 - python - pytorch >=2.0 - cupy >= 12.0.0 diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 17362893310..9833a78d88b 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -35,7 +35,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2,<0.3.1a0 + - tensordict >=0.1.2 - pyg >=2.5,<2.6 tests: diff --git a/dependencies.yaml b/dependencies.yaml index 4938b8f79e2..4580130643d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -647,7 +647,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - tensordict>=0.1.2 + - &tensordict tensordict>=0.1.2 - dgl>=1.1.0.cu* cugraph_pyg_dev: common: @@ -656,7 +656,7 @@ dependencies: - cugraph==24.8.*,>=0.0.0a0 - pytorch>=2.0 - pytorch-cuda==11.8 - - tensordict>=0.1.2 + - *tensordict - pyg>=2.5,<2.6 depends_on_pytorch: diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index 63771a75064..28c2fc81eeb 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -21,4 +21,5 @@ dependencies: - pytorch-cuda==11.8 - pytorch>=2.0 - scipy +- tensordict>=0.1.2 name: cugraph_dgl_dev_cuda-118 From 6107d8269de69ba3b41e0096d3964fc6e2d12268 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 3 Jul 2024 13:54:29 -0700 Subject: [PATCH 21/47] fix csr/csc issue, wrap up tests --- .../cugraph_dgl/dataloading/__init__.py | 5 +- .../cugraph_dgl/dataloading/dataloader.py | 6 +- .../dataloading/neighbor_sampler.py | 15 +- .../cugraph_dgl/dataloading/sampler.py | 19 +- .../dataloading/utils/sampling_helpers.py | 17 +- python/cugraph-dgl/cugraph_dgl/graph.py | 34 +-- .../tests/dataloading/test_dataloader.py | 113 ++++++++-- .../tests/dataloading/test_dataloader_mg.py | 208 ++++++++++++++++++ 8 files changed, 365 insertions(+), 52 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py index 9ee5e6a970f..8a2e9cd954d 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py @@ -22,13 +22,14 @@ from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler from cugraph_dgl.dataloading.dask_dataloader import 
DaskDataLoader +from cugraph_dgl.dataloading.dataloader import DataLoader as FutureDataLoader def DataLoader(*args, **kwargs): warnings.warn( "DataLoader has been renamed to DaskDataLoader. " - "In Release 24.10, cugraph_dgl.dataloading.dataloader.DataLoader " + "In Release 24.10, cugraph_dgl.dataloading.FutureDataLoader " "will take over the DataLoader name.", - FutureWarning + FutureWarning, ) return DaskDataLoader(*args, **kwargs) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index b3af6fc3bc9..39f43a5d805 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -154,13 +154,13 @@ def __iter__(self): return self.__sampler.sample( self.__graph, self.__dataset, - self.__batch_size, + batch_size=self.__batch_size, ) - + """ start, end, blocks = out start = start.to(self.__device) end = end.to(self.__device) blocks = [b.to(self.__device) for b in blocks] - """ \ No newline at end of file + """ diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index f77b00bbac8..1a35c3ea027 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -22,7 +22,7 @@ from cugraph.utilities.utils import import_optional import cugraph_dgl -from cugraph_dgl.typing import TensorType, DGLSamplerOutput +from cugraph_dgl.typing import DGLSamplerOutput from cugraph_dgl.dataloading.sampler import Sampler, HomogeneousSampleReader torch = import_optional("torch") @@ -161,11 +161,14 @@ def __init__( ) def sample( - self, g: "cugraph_dgl.Graph", indices: Iterator["torch.Tensor"], batch_size: int = 1 + self, + g: "cugraph_dgl.Graph", + indices: Iterator["torch.Tensor"], + batch_size: int = 1, ) -> Iterator[DGLSamplerOutput]: kwargs = dict(**self.__kwargs) - directory = kwargs.pop('directory', None) + directory = kwargs.pop("directory", None) if directory is None: warnings.warn("Setting a directory to store samples is recommended.") self._tempdir = tempfile.TemporaryDirectory() @@ -180,7 +183,7 @@ def sample( ds = UniformNeighborSampler( g._graph(self.edge_dir), writer, - compression=self.sparse_format.upper(), + compression="CSR", fanout=self._reversed_fanout_vals, prior_sources_behavior="carryover", deduplicate_sources=True, @@ -192,7 +195,9 @@ def sample( if g.is_homogeneous: indices = torch.concat(list(indices)) ds.sample_from_nodes(indices, batch_size=batch_size) - return HomogeneousSampleReader(ds.get_reader(), self.output_format) + return HomogeneousSampleReader( + ds.get_reader(), self.output_format, self.edge_dir + ) raise ValueError( "Sampling heterogeneous graphs is currently" diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py index 20bce365c7e..731ec1b8d6f 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -15,7 +15,7 @@ import cugraph_dgl from cugraph_dgl.nn import SparseGraph -from cugraph_dgl.typing import TensorType, DGLSamplerOutput +from cugraph_dgl.typing import DGLSamplerOutput from cugraph_dgl.dataloading.utils.sampling_helpers import ( create_homogeneous_sampled_graphs_from_tensors_csc, ) @@ -81,7 +81,12 @@ class HomogeneousSampleReader(SampleReader): produced by the cuGraph distributed sampler. 
""" - def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Block"): + def __init__( + self, + base_reader: DistSampleReader, + output_format: str = "dgl.Block", + edge_dir="in", + ): """ Constructs a new HomogeneousSampleReader @@ -93,7 +98,11 @@ def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Bloc output_format: str The output format for blocks (either "dgl.Block" or "cugraph_dgl.nn.SparseGraph"). + edge_dir: str + The direction sampling was performed in ("in" or "out"). """ + + self.__edge_dir = edge_dir super().__init__(base_reader, output_format=output_format) def __decode_csc( @@ -141,7 +150,6 @@ def __init__(self, sparse_format: str = "csc", output_format="dgl.Block"): if sparse_format != "csc": raise ValueError("Only CSC format is supported at this time") - self.__sparse_format = sparse_format self.__output_format = output_format @property @@ -153,7 +161,10 @@ def sparse_format(self): return self.__sparse_format def sample( - self, g: cugraph_dgl.Graph, indices: Iterator["torch.Tensor"], batch_size: int = 1 + self, + g: cugraph_dgl.Graph, + indices: Iterator["torch.Tensor"], + batch_size: int = 1, ) -> Iterator[ Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]] ]: diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index da40fb6f564..e8c305b6ba1 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -592,21 +592,28 @@ def _create_homogeneous_blocks_from_csc( ] blocks = [] - seednodes_range=None + seednodes_range = None for mfg in mfgs: block_mfg = _create_homogeneous_dgl_block_from_tensor_d( - {'sources': mfg.src_ids(), 'destinations': mfg.dst_ids(), 'sources_range': mfg._num_src_nodes-1, 'destinations_range': mfg._num_dst_nodes-1}, + { + "sources": mfg.src_ids(), + "destinations": mfg.dst_ids(), + "sources_range": mfg._num_src_nodes - 1, + "destinations_range": mfg._num_dst_nodes - 1, + }, renumber_map=renumber_map_list[b_id], - seednodes_range=seednodes_range + seednodes_range=seednodes_range, ) seednodes_range = max( - mfg._num_src_nodes-1, - mfg._num_dst_nodes-1, + mfg._num_src_nodes - 1, + mfg._num_dst_nodes - 1, ) blocks.append(block_mfg) del mfgs + blocks.reverse() + output_batch.append(blocks) output.append(output_batch) diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 00fa9a66be7..5e18d5ea616 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -191,26 +191,28 @@ def add_nodes( raise ValueError("The global number of nodes must match on all workers") # Ensure the sum of the feature shapes equals the global number of nodes. - for feature_name, feature_tensor in data.items(): - features_size = torch.tensor( - [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64 - ) - torch.distributed.all_reduce( - features_size, op=torch.distributed.ReduceOp.SUM - ) - if features_size != global_num_nodes: - raise ValueError( - "The total length of the feature vector across workers must" - " match the global number of nodes but it does not match for " - f"{feature_name}." 
+ if data is not None: + for feature_name, feature_tensor in data.items(): + features_size = torch.tensor( + [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64 ) + torch.distributed.all_reduce( + features_size, op=torch.distributed.ReduceOp.SUM + ) + if features_size != global_num_nodes: + raise ValueError( + "The total length of the feature vector across workers must" + " match the global number of nodes but it does not " + f"match for {feature_name}." + ) self.__num_nodes_dict[ntype] = global_num_nodes - for feature_name, feature_tensor in data.items(): - self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( - _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs - ) + if data is not None: + for feature_name, feature_tensor in data.items(): + self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( + _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs + ) self.__graph = None self.__vertex_offsets = None diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py index d29f1c8ea41..ef47875463d 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py @@ -11,39 +11,118 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cugraph_dgl.dataloading import pytest import cugraph_dgl -from cugraph_dgl.dataloading.dataloader import DataLoader -from cugraph_dgl.dataloading import NeighborSampler from cugraph.datasets import karate from cugraph.utilities.utils import import_optional, MissingModule -torch = import_optional('torch') -dgl = import_optional('dgl') +import numpy as np + +torch = import_optional("torch") +dgl = import_optional("dgl") + @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") def test_dataloader_basic_homogeneous(): - graph = cugraph_dgl.Graph( - is_multi_gpu=False - ) + graph = cugraph_dgl.Graph(is_multi_gpu=False) num_nodes = karate.number_of_nodes() - graph.add_nodes( - num_nodes, - data={'z': torch.arange(num_nodes)} - ) + graph.add_nodes(num_nodes, data={"z": torch.arange(num_nodes)}) edf = karate.get_edgelist() graph.add_edges( - u=edf['src'], - v=edf['dst'], - data={'q': torch.arange(karate.number_of_edges())} + u=edf["src"], v=edf["dst"], data={"q": torch.arange(karate.number_of_edges())} + ) + + sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5]) + loader = cugraph_dgl.dataloading.FutureDataLoader( + graph, torch.arange(num_nodes), sampler, batch_size=2 + ) + + for in_t, out_t, blocks in loader: + assert len(blocks) == 3 + assert len(out_t) <= 2 + + +def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): + # Single fanout to match cugraph + sampler = dgl.dataloading.NeighborSampler(fanouts) + dataloader = dgl.dataloading.DataLoader( + g, + train_nid, + sampler, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=0, + ) + + dgl_output = {} + for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + dgl_output[batch_id] = { + "input_nodes": input_nodes, + "output_nodes": output_nodes, + "blocks": blocks, + } + return dgl_output + + +def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1): + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) + + dataloader = 
cugraph_dgl.dataloading.FutureDataLoader(
+        cugraph_g,
+        train_nid,
+        sampler,
+        batch_size=batch_size,
+        drop_last=False,
+        shuffle=False,
     )
-    sampler = NeighborSampler([5, 5, 5])
-    loader = DataLoader(graph, torch.arange(num_nodes), sampler, batch_size=2)
+    cugraph_dgl_output = {}
+    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
+        cugraph_dgl_output[batch_id] = {
+            "input_nodes": input_nodes,
+            "output_nodes": output_nodes,
+            "blocks": blocks,
+        }
+    return cugraph_dgl_output
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
+@pytest.mark.parametrize("ix", [[1], [1, 0]])
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_same_homogeneousgraph_results(ix, batch_size):
+    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
+    dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
+
+    train_nid = torch.tensor(ix)
+    # Create a simple homogeneous graph.
+    dgl_g = dgl.graph((src, dst))
-    print(next(iter(loader)))
\ No newline at end of file
+    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False)
+    cugraph_g.add_nodes(9)
+    cugraph_g.add_edges(u=src, v=dst)
+
+    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size)
+    cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size)
+
+    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
+    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
+
+    np.testing.assert_array_equal(
+        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
+    )
+    assert (
+        dgl_output[0]["blocks"][0].num_dst_nodes()
+        == cugraph_output[0]["blocks"][0].num_dst_nodes()
+    )
+    assert (
+        dgl_output[0]["blocks"][0].num_edges()
+        == cugraph_output[0]["blocks"][0].num_edges()
+    )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py
new file mode 100644
index 00000000000..39ce55d7616
--- /dev/null
+++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
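The multi-GPU tests that follow spawn one worker per GPU. Conceptually, each rank adds only its shard of the edge list while reporting the same global node count; a sketch of that per-rank pattern, assuming torch.distributed and the cuGraph comms are already initialized (as init_pytorch_worker below does):

import torch
import cugraph_dgl

def build_graph_shard(rank, world_size, src, dst):
    g = cugraph_dgl.Graph(is_multi_gpu=True)
    # Global node count: identical on every rank.
    g.add_nodes(int(max(src.max(), dst.max())) + 1)
    # Edge shard: each rank adds a disjoint slice of the edge list.
    g.add_edges(
        u=torch.tensor_split(src, world_size)[rank],
        v=torch.tensor_split(dst, world_size)[rank],
    )
    return g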
+
+import pytest
+import os
+
+import numpy as np
+
+import cugraph_dgl
+
+from cugraph.datasets import karate
+from cugraph.utilities.utils import import_optional, MissingModule
+
+from cugraph.gnn import (
+    cugraph_comms_create_unique_id,
+    cugraph_comms_init,
+    cugraph_comms_shutdown,
+)
+
+torch = import_optional("torch")
+dgl = import_optional("dgl")
+
+
+def init_pytorch_worker(rank, world_size, cugraph_id):
+    import rmm
+
+    rmm.reinitialize(
+        devices=rank,
+    )
+
+    import cupy
+
+    cupy.cuda.Device(rank).use()
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
+    from cugraph.testing.mg_utils import enable_spilling
+
+    enable_spilling()
+
+    torch.cuda.set_device(rank)
+
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
+
+
+def run_test_dataloader_basic_homogeneous(rank, world_size, uid):
+    init_pytorch_worker(rank, world_size, uid)
+
+    graph = cugraph_dgl.Graph(is_multi_gpu=True)
+
+    num_nodes = karate.number_of_nodes()
+    graph.add_nodes(
+        num_nodes,
+    )
+
+    edf = karate.get_edgelist()
+    graph.add_edges(
+        u=torch.tensor_split(torch.as_tensor(edf["src"], device="cuda"), world_size)[
+            rank
+        ],
+        v=torch.tensor_split(torch.as_tensor(edf["dst"], device="cuda"), world_size)[
+            rank
+        ],
+    )
+
+    sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5])
+    loader = cugraph_dgl.dataloading.FutureDataLoader(
+        graph,
+        torch.arange(num_nodes),
+        sampler,
+        batch_size=2,
+        use_ddp=True,
+    )
+
+    for in_t, out_t, blocks in loader:
+        assert len(blocks) == 3
+        assert len(out_t) <= 2
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
+def test_dataloader_basic_homogeneous():
+    uid = cugraph_comms_create_unique_id()
+    # Limit the number of GPUs this test is run with
+    world_size = min(torch.cuda.device_count(), 4)
+
+    torch.multiprocessing.spawn(
+        run_test_dataloader_basic_homogeneous,
+        args=(
+            world_size,
+            uid,
+        ),
+        nprocs=world_size,
+    )
+
+
+def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1):
+    # Single fanout to match cugraph
+    sampler = dgl.dataloading.NeighborSampler(fanouts)
+    dataloader = dgl.dataloading.DataLoader(
+        g,
+        train_nid,
+        sampler,
+        batch_size=batch_size,
+        shuffle=False,
+        drop_last=False,
+        num_workers=0,
+    )
+
+    dgl_output = {}
+    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
+        dgl_output[batch_id] = {
+            "input_nodes": input_nodes,
+            "output_nodes": output_nodes,
+            "blocks": blocks,
+        }
+    return dgl_output
+
+
+def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1):
+    sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts)
+
+    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
+        cugraph_g,
+        train_nid,
+        sampler,
+        batch_size=batch_size,
+        drop_last=False,
+        shuffle=False,
+    )
+
+    cugraph_dgl_output = {}
+    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
+        cugraph_dgl_output[batch_id] = {
+            "input_nodes": input_nodes,
+            "output_nodes": output_nodes,
+            "blocks": blocks,
+        }
+    return cugraph_dgl_output
+
+
+def run_test_same_homogeneousgraph_results(rank, world_size, uid, ix, batch_size):
+    init_pytorch_worker(rank, world_size, uid)
+
+    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
+    dst = torch.tensor([0, 0, 0, 0, 
1, 1, 1, 1])
+
+    local_src = torch.tensor_split(src, world_size)[rank]
+    local_dst = torch.tensor_split(dst, world_size)[rank]
+
+    train_nid = torch.tensor(ix)
+    # Create a simple homogeneous graph.
+    dgl_g = dgl.graph((src, dst))
+
+    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True)
+    cugraph_g.add_nodes(9)
+    cugraph_g.add_edges(u=local_src, v=local_dst)
+
+    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size)
+    cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size)
+
+    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
+    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
+
+    np.testing.assert_array_equal(
+        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
+    )
+    assert (
+        dgl_output[0]["blocks"][0].num_dst_nodes()
+        == cugraph_output[0]["blocks"][0].num_dst_nodes()
+    )
+    assert (
+        dgl_output[0]["blocks"][0].num_edges()
+        == cugraph_output[0]["blocks"][0].num_edges()
+    )
+
+    cugraph_comms_shutdown()
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
+@pytest.mark.parametrize("ix", [[1], [1, 0]])
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_same_homogeneousgraph_results_mg(ix, batch_size):
+    uid = cugraph_comms_create_unique_id()
+    # Limit the number of GPUs this test is run with
+    world_size = min(torch.cuda.device_count(), 4)
+
+    torch.multiprocessing.spawn(
+        run_test_same_homogeneousgraph_results,
+        args=(world_size, uid, ix, batch_size),
+        nprocs=world_size,
+    )
From 6bc4b4a6696931fe65a5d59c95feae3e229f51c9 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Mon, 8 Jul 2024 10:21:19 -0700
Subject: [PATCH 22/47] m

---
 .../cugraph_dgl/tests/test_graph.py           | 85 ++++++++++++-------
 .../cugraph_dgl/tests/test_graph_mg.py        |  4 +-
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py
index e47e280c48e..0c1fb088198 100644
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py
+++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py
@@ -100,19 +100,18 @@ def test_graph_make_heterogeneous_graph(direction):
     df = karate.get_edgelist()
     df.src = df.src.astype("int64")
     df.dst = df.dst.astype("int64")
-    wgt = np.random.random((len(df),))
     graph = cugraph_dgl.Graph()
     total_num_nodes = max(df.src.max(), df.dst.max()) + 1
-
+
     num_nodes_group_1 = total_num_nodes // 2
     num_nodes_group_2 = total_num_nodes - num_nodes_group_1
-
+
     node_x_1 = np.random.random((num_nodes_group_1,))
     node_x_2 = np.random.random((num_nodes_group_2,))
-    graph.add_nodes(num_nodes_group_1, {'x':node_x_1}, 'type1')
-    graph.add_nodes(num_nodes_group_2, {'x':node_x_2}, 'type2')
+    graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1")
+    graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2")
     edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)]
     edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
     edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)]
     edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
     edges_12.dst -= num_nodes_group_1
     edges_21.src -= num_nodes_group_1
     edges_22.dst -= num_nodes_group_1
     edges_22.src -= num_nodes_group_1
-    graph.add_edges(edges_11.src, edges_11.dst, etype=('type1', 'e1', 'type1'))
-    graph.add_edges(edges_12.src, edges_12.dst, etype=('type1', 'e2', 'type2'))
-    graph.add_edges(edges_21.src, edges_21.dst, etype=('type2', 'e3', 'type1'))
-    graph.add_edges(edges_22.src, edges_22.dst, etype=('type2', 
'e4', 'type2')) + graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) + graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2")) + graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) + graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) assert not graph.is_homogeneous assert not graph.is_multi_gpu @@ -137,17 +136,31 @@ def test_graph_make_heterogeneous_graph(direction): graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") ).all() assert ( - graph.nodes('type1') == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") + graph.nodes("type1") + == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") ).all() assert ( - graph.nodes('type2') == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") + graph.nodes("type2") + == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") ).all() # Verify graph.edges() - assert((graph.edges('eid',etype=('type1','e1','type1')) == torch.arange(len(edges_11), dtype=torch.int64, device='cuda')).all()) - assert((graph.edges('eid',etype=('type1','e2','type2')) == torch.arange(len(edges_12), dtype=torch.int64, device='cuda')).all()) - assert((graph.edges('eid',etype=('type2','e3','type1')) == torch.arange(len(edges_21), dtype=torch.int64, device='cuda')).all()) - assert((graph.edges('eid',etype=('type2','e4','type2')) == torch.arange(len(edges_22), dtype=torch.int64, device='cuda')).all()) + assert ( + graph.edges("eid", etype=("type1", "e1", "type1")) + == torch.arange(len(edges_11), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type1", "e2", "type2")) + == torch.arange(len(edges_12), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e3", "type1")) + == torch.arange(len(edges_21), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e4", "type2")) + == torch.arange(len(edges_22), dtype=torch.int64, device="cuda") + ).all() # Use sampling call to check graph creation # This isn't a test of cuGraph sampling with DGL; the options are @@ -156,20 +169,20 @@ def test_graph_make_heterogeneous_graph(direction): sampling_output = pylibcugraph.uniform_neighbor_sample( pylibcugraph.ResourceHandle(), plc_graph, - start_list=cupy.arange(total_num_nodes, dtype='int64'), - h_fan_out=np.array([1, 1], dtype='int32'), + start_list=cupy.arange(total_num_nodes, dtype="int64"), + h_fan_out=np.array([1, 1], dtype="int32"), with_replacement=False, do_expensive_check=True, with_edge_properties=True, - prior_sources_behavior='exclude', + prior_sources_behavior="exclude", return_dict=True, ) expected_etypes = { - 0: 'e1', - 1: 'e2', - 2: 'e3', - 3: 'e4', + 0: "e1", + 1: "e2", + 2: "e3", + 3: "e4", } expected_offsets = { 0: (0, 0), @@ -177,20 +190,26 @@ def test_graph_make_heterogeneous_graph(direction): 2: (num_nodes_group_1, 0), 3: (num_nodes_group_1, num_nodes_group_1), } - if direction == 'in': - src_col = 'minors' - dst_col = 'majors' + if direction == "in": + src_col = "minors" + dst_col = "majors" else: - src_col = 'majors' - dst_col = 'minors' + src_col = "majors" + dst_col = "minors" # Looping over the output verifies that all edges are valid # (and therefore, the graph is valid) - for i, etype in enumerate(sampling_output['edge_type'].tolist()): - eid = int(sampling_output['edge_id'][i]) + for i, etype in enumerate(sampling_output["edge_type"].tolist()): + eid = 
int(sampling_output["edge_id"][i]) + + srcs, dsts, eids = graph.edges( + "all", etype=expected_etypes[etype], device="cpu" + ) - srcs, dsts, eids = graph.edges('all', etype=expected_etypes[etype], device='cpu') - assert eids[eid] == eid - assert srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0] - assert dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1] \ No newline at end of file + assert ( + srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0] + ) + assert ( + dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1] + ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py index f0561c41095..8e469519433 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -32,9 +32,11 @@ cugraph_comms_get_raft_handle, ) + pylibwholegraph = import_optional("pylibwholegraph") torch = import_optional("torch") -dgl = import_optional('dgl') +dgl = import_optional("dgl") + def init_pytorch_worker(rank, world_size, cugraph_id): import rmm From faeb4a52e280b11c736252540022a9e295c6ef63 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 8 Jul 2024 10:45:16 -0700 Subject: [PATCH 23/47] style --- python/cugraph-dgl/cugraph_dgl/__init__.py | 2 +- python/cugraph-dgl/cugraph_dgl/view.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py index 61b4142a871..bbae569c91b 100644 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index e2bf7c20a29..e65af53a096 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from collections import defaultdict from collections.abc import MutableMapping from typing import Union, Dict, List, Tuple @@ -260,10 +261,10 @@ def __getitem__(self, key): ) def __call__(self, *args, **kwargs): - if 'device' in kwargs: + if "device" in kwargs: return self.__graph.all_edges(*args, **kwargs) - - return self.__graph.all_edges(*args, **kwargs, device='cuda') + + return self.__graph.all_edges(*args, **kwargs, device="cuda") class HeteroNodeView: From afb94522422d4ac867e941c3bb594073d810b973 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 8 Jul 2024 10:47:05 -0700 Subject: [PATCH 24/47] revert ci script --- ci/test_python.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 061a6459085..e8c8272e8d6 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -210,12 +210,9 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then --channel "${PYTHON_CHANNEL}" \ --channel pyg \ "cugraph-pyg" \ - "pytorch=2.1.0" \ - "pytorch-cuda=${CONDA_CUDA_VERSION}" + "ogb" - # Install pyg dependencies (which requires pip) pip install \ - ogb \ pyg_lib \ torch_scatter \ torch_sparse \ From 48ba6d42b9874f9612f6074af2f611acb0714c0e Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 9 Jul 2024 11:37:45 -0700 Subject: [PATCH 25/47] fix meta.yaml issue --- conda/recipes/cugraph-dgl/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 7c30c9c602b..0affe456b73 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -31,7 +31,7 @@ requirements: - tensordict >=0.1.2 - python - pytorch >=2.0 - - cupy >= 12.0.0 + - cupy >=12.0.0 tests: imports: From 801de87154aa997c49bb92034e197e9692b0d439 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:16:28 -0400 Subject: [PATCH 26/47] add type hint Co-authored-by: Vibhu Jawa --- python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py index 7cd94a1be84..994e8609348 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py @@ -281,7 +281,7 @@ def get_batch_id_series(n_output_rows: int, batch_size: int): return cudf.Series(batch_ar) -def create_batch_df(dataset: torch.Tensor): +def create_batch_df(dataset: torch.Tensor) -> cudf.DataFrame: batch_id_ls = [] indices_ls = [] for batch_id, b_indices in enumerate(dataset): From 5e511cc77092350602bc8d19fcd2a23306e99089 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:16:49 -0400 Subject: [PATCH 27/47] add missing type hint Co-authored-by: Vibhu Jawa --- python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 39f43a5d805..e01bd5b8d48 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -38,7 +38,7 @@ def __init__( device: Union[int, str, "torch.device"] = None, use_ddp: bool = False, ddp_seed: int = 0, - batch_size=1, + 
batch_size: int= 1, drop_last: bool = False, shuffle: bool = False, use_prefetch_thread: Optional[bool] = None, From 035b69ae10721c4781cadaf8a02d19a23f46583b Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 10 Jul 2024 12:28:49 -0700 Subject: [PATCH 28/47] remove comment, add issue reference --- .../cugraph-dgl/cugraph_dgl/dataloading/dataloader.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 39f43a5d805..5ea27cc0a36 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -150,17 +150,9 @@ def dataset( return self.__dataset def __iter__(self): - # TODO move to the correct device + # TODO move to the correct device (rapidsai/cugraph-gnn#11) return self.__sampler.sample( self.__graph, self.__dataset, batch_size=self.__batch_size, ) - - """ - start, end, blocks = out - - start = start.to(self.__device) - end = end.to(self.__device) - blocks = [b.to(self.__device) for b in blocks] - """ From b412776cf99e20b88043f1eaeeec2d3edd0433d8 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:30:38 -0400 Subject: [PATCH 29/47] Add type hint Co-authored-by: Vibhu Jawa --- python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py index 994e8609348..1b4233b14e4 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py @@ -273,7 +273,7 @@ def __del__(self): _clean_directory(self._sampling_output_dir) -def get_batch_id_series(n_output_rows: int, batch_size: int): +def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series : num_batches = (n_output_rows + batch_size - 1) // batch_size print(f"Number of batches = {num_batches}".format(num_batches)) batch_ar = cp.arange(0, num_batches).repeat(batch_size) From 1c72bd6e1e054c8587506f4edae74b92e8d6b852 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 10 Jul 2024 14:48:22 -0700 Subject: [PATCH 30/47] add convert function, fix bugs --- python/cugraph-dgl/cugraph_dgl/__init__.py | 5 +- python/cugraph-dgl/cugraph_dgl/convert.py | 54 +++++++++++++++++- python/cugraph-dgl/cugraph_dgl/graph.py | 40 ++++++++++--- .../tests/test_from_dgl_heterograph.py | 41 +++++++++++++- python/cugraph-dgl/cugraph_dgl/tests/utils.py | 56 +++++++++++++++++-- python/cugraph-dgl/cugraph_dgl/view.py | 26 +++++++-- 6 files changed, 200 insertions(+), 22 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py index bbae569c91b..58850d47fba 100644 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/__init__.py @@ -17,7 +17,10 @@ os.environ["RAPIDS_NO_INITIALIZE"] = "1" from cugraph_dgl.graph import Graph from cugraph_dgl.cugraph_storage import CuGraphStorage -from cugraph_dgl.convert import cugraph_storage_from_heterograph +from cugraph_dgl.convert import ( + cugraph_storage_from_heterograph, + cugraph_dgl_graph_from_heterograph, +) import cugraph_dgl.dataloading import cugraph_dgl.nn diff --git a/python/cugraph-dgl/cugraph_dgl/convert.py b/python/cugraph-dgl/cugraph_dgl/convert.py index 
1235f07adf1..ae4b96dd391 100644 --- a/python/cugraph-dgl/cugraph_dgl/convert.py +++ b/python/cugraph-dgl/cugraph_dgl/convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,8 @@ # limitations under the License. from __future__ import annotations from cugraph.utilities.utils import import_optional + +import cugraph_dgl from cugraph_dgl import CuGraphStorage from cugraph_dgl.utils.cugraph_conversion_utils import ( get_edges_dict_from_dgl_HeteroGraph, @@ -39,3 +41,53 @@ def cugraph_storage_from_heterograph( add_ndata_from_dgl_HeteroGraph(gs, g) add_edata_from_dgl_HeteroGraph(gs, g) return gs + + +def cugraph_dgl_graph_from_heterograph( + input_graph: dgl.DGLGraph, + single_gpu: bool = True, + ndata_storage: str = "torch", + edata_storage: str = "torch", + **kwargs, +) -> cugraph_dgl.Graph: + """ + Converts a DGL Graph to a cuGraph-DGL Graph. + """ + + output_graph = cugraph_dgl.Graph( + is_multi_gpu=(not single_gpu), + ndata_storage=ndata_storage, + edata_storage=edata_storage, + **kwargs, + ) + + # Calling is_homogeneous does not work here + if len(input_graph.ntypes) <= 1: + output_graph.add_nodes( + input_graph.num_nodes(), data=input_graph.ndata, ntype=input_graph.ntypes[0] + ) + else: + for ntype in input_graph.ntypes: + data = { + k: v_dict[ntype] + for k, v_dict in input_graph.ndata.items() + if ntype in v_dict + } + output_graph.add_nodes(input_graph.num_nodes(ntype), data=data, ntype=ntype) + + if len(input_graph.canonical_etypes) <= 1: + can_etype = input_graph.canonical_etypes[0] + src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) + output_graph.add_edges(src_t, dst_t, input_graph.edata, etype=can_etype) + else: + for can_etype in input_graph.canonical_etypes: + data = { + k: v_dict[can_etype] + for k, v_dict in input_graph.edata.items() + if can_etype in v_dict + } + + src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) + output_graph.add_edges(src_t, dst_t, data=data, etype=can_etype) + + return output_graph diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 5e18d5ea616..2eba13c6958 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
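A minimal usage sketch for the converter added above, for orientation (illustrative only, not part of this patch; assumes a CUDA device and a cugraph-dgl install):

import dgl
import torch

import cugraph_dgl

# Toy homogeneous graph with one node feature.
g = dgl.heterograph(
    {("user", "follows", "user"): (torch.tensor([0, 1]), torch.tensor([1, 2]))}
)
g.ndata["feat"] = torch.randn(3, 4)

# Copies the topology plus ndata/edata into a cugraph_dgl.Graph.
cg = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g.to("cuda"))
assert cg.num_nodes() == g.num_nodes()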
+import warnings + from typing import Union, Optional, Dict, Tuple, List from cugraph.utilities.utils import import_optional @@ -561,6 +563,9 @@ def _graph( return self.__graph[0] + def _has_n_emb(self, ntype: str, emb_name: str) -> bool: + return (ntype, emb_name) in self.__ndata_storage + def _get_n_emb( self, ntype: str, emb_name: str, u: Union[str, TensorType] ) -> "torch.Tensor": @@ -595,9 +600,21 @@ def _get_n_emb( if dgl.base.is_all(u): u = torch.arange(self.num_nodes(ntype), dtype=self.idtype, device="cpu") - return self.__ndata_storage[ntype, emb_name].fetch( - _cast_to_torch_tensor(u), "cuda" - ) + try: + return self.__ndata_storage[ntype, emb_name].fetch( + _cast_to_torch_tensor(u), "cuda" + ) + except RuntimeError as ex: + warnings.warn( + "Got error accessing data, trying again with index on device: " + + str(ex) + ) + return self.__ndata_storage[ntype, emb_name].fetch( + _cast_to_torch_tensor(u).cuda(), "cuda" + ) + + def _has_e_emb(self, etype: Tuple[str, str, str], emb_name: str) -> bool: + return (etype, emb_name) in self.__edata_storage def _get_e_emb( self, etype: Tuple[str, str, str], emb_name: str, u: Union[str, TensorType] @@ -629,9 +646,18 @@ def _get_e_emb( if dgl.base.is_all(u): u = torch.arange(self.num_edges(etype), dtype=self.idtype, device="cpu") - return self.__edata_storage[etype, emb_name].fetch( - _cast_to_torch_tensor(u), "cuda" - ) + try: + return self.__edata_storage[etype, emb_name].fetch( + _cast_to_torch_tensor(u), "cuda" + ) + except RuntimeError as ex: + warnings.warn( + "Got error accessing data, trying again with index on device: " + + str(ex) + ) + return self.__edata_storage[etype, emb_name].fetch( + _cast_to_torch_tensor(u).cuda(), "cuda" + ) def _set_n_emb( self, ntype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] @@ -774,7 +800,7 @@ def _get_e_emb_keys(self, etype: str) -> List[str]: List[str] The list of embedding names for the given edge type. """ - return [k for (t, k) in self.__ndata_storage if etype == t] + return [k for (t, k) in self.__edata_storage if etype == t] def all_edges( self, diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py index 128d9bfaca5..667a4a2e66d 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
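The retry logic added to _get_n_emb/_get_e_emb above guards against feature storages (e.g., WholeGraph-backed ones) that reject host-resident index tensors. The same pattern in isolation, as a sketch (fetch_with_fallback is an illustrative name, not library API):

import warnings

import torch


def fetch_with_fallback(storage, index: "torch.Tensor") -> "torch.Tensor":
    # Try the index as given first; some backends only accept device tensors.
    try:
        return storage.fetch(index, "cuda")
    except RuntimeError as ex:
        warnings.warn("Retrying fetch with index on device: " + str(ex))
        return storage.fetch(index.cuda(), "cuda")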
# You may obtain a copy of the License at @@ -20,7 +20,9 @@ from cugraph.utilities.utils import import_optional from .utils import ( assert_same_edge_feats, + assert_same_edge_feats_daskapi, assert_same_node_feats, + assert_same_node_feats_daskapi, assert_same_num_edges_can_etypes, assert_same_num_edges_etypes, assert_same_num_nodes, @@ -134,7 +136,7 @@ def create_heterograph4(idtype): @pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_nodes(idxtype): +def test_heterograph_conversion_nodes_daskapi(idxtype): graph_fs = [ create_heterograph1, create_heterograph2, @@ -145,6 +147,39 @@ def test_heterograph_conversion_nodes(idxtype): g = graph_f(idxtype) gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + assert_same_num_nodes(gs, g) + assert_same_node_feats_daskapi(gs, g) + + +@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) +def test_heterograph_conversion_edges_daskapi(idxtype): + graph_fs = [ + create_heterograph1, + create_heterograph2, + create_heterograph3, + create_heterograph4, + ] + for graph_f in graph_fs: + g = graph_f(idxtype) + gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + + assert_same_num_edges_can_etypes(gs, g) + assert_same_num_edges_etypes(gs, g) + assert_same_edge_feats_daskapi(gs, g) + + +@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) +def test_heterograph_conversion_nodes(idxtype): + graph_fs = [ + create_heterograph1, + create_heterograph2, + create_heterograph3, + create_heterograph4, + ] + for graph_f in graph_fs: + g = graph_f(idxtype) + gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) + assert_same_num_nodes(gs, g) assert_same_node_feats(gs, g) @@ -159,7 +194,7 @@ def test_heterograph_conversion_edges(idxtype): ] for graph_f in graph_fs: g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) assert_same_num_edges_can_etypes(gs, g) assert_same_num_edges_etypes(gs, g) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py index d6a90840b72..09c267099e5 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/utils.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,8 +15,8 @@ th = import_optional("torch") -def assert_same_node_feats(gs, g): - set(gs.ndata.keys()) == set(g.ndata.keys()) +def assert_same_node_feats_daskapi(gs, g): + assert set(gs.ndata.keys()) == set(g.ndata.keys()) for key in g.ndata.keys(): for ntype in g.ntypes: @@ -30,6 +30,29 @@ def assert_same_node_feats(gs, g): assert equal_t == 0 +def assert_same_node_feats(gs, g): + assert set(gs.ndata.keys()) == set(g.ndata.keys()) + assert set(gs.ntypes) == set(g.ntypes) + + for key in g.ndata.keys(): + for ntype in g.ntypes: + if len(g.ntypes) <= 1 or ntype in g.ndata[key]: + indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype) + + g_output = g.ndata[key] + gs_output = gs.ndata[key] + + if len(g.ntypes) > 1: + g_output = g_output[ntype] + gs_output = gs_output[ntype] + + g_output = g_output[indices] + gs_output = gs_output[indices] + + equal_t = (gs_output != g_output).sum() + assert equal_t == 0 + + def assert_same_num_nodes(gs, g): for ntype in g.ntypes: assert g.num_nodes(ntype) == gs.num_nodes(ntype) @@ -45,8 +68,8 @@ def assert_same_num_edges_etypes(gs, g): assert g.num_edges(etype) == gs.num_edges(etype) -def assert_same_edge_feats(gs, g): - set(gs.edata.keys()) == set(g.edata.keys()) +def assert_same_edge_feats_daskapi(gs, g): + assert set(gs.edata.keys()) == set(g.edata.keys()) for key in g.edata.keys(): for etype in g.canonical_etypes: indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() @@ -59,6 +82,29 @@ def assert_same_edge_feats(gs, g): assert equal_t == 0 +def assert_same_edge_feats(gs, g): + assert set(gs.edata.keys()) == set(g.edata.keys()) + assert set(gs.canonical_etypes) == set(g.canonical_etypes) + assert set(gs.etypes) == set(g.etypes) + + for key in g.edata.keys(): + for etype in g.canonical_etypes: + if len(g.etypes) <= 1 or etype in g.edata[key]: + indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() + g_output = g.edata[key] + gs_output = gs.edata[key] + + if len(g.etypes) > 1: + g_output = g_output[etype] + gs_output = gs_output[etype] + + g_output = g_output[indices] + gs_output = gs_output[indices] + + equal_t = (gs_output != g_output).sum().cpu() + assert equal_t == 0 + + def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir): dgl_o = dgl_g.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) cugraph_o = cugraph_gs.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index e65af53a096..2e836591757 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -55,9 +55,17 @@ def _edges(self) -> TensorType: def __getitem__(self, key: str): if isinstance(self._etype, list): - return {t: self._graph._get_e_emb(t, key, self._edges) for t in self._etype} - - return self._graph._get_e_emb(self._etype, key, self._edges) + return { + t: self._graph._get_e_emb(t, key, self._edges) + for t in self._etype + if self._graph._has_e_emb(t, key) + } + + return ( + self._graph._get_e_emb(self._etype, key, self._edges) + if self._graph._has_e_emb(self._etype, key) + else None + ) def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): if isinstance(self._etype, list): @@ -152,9 +160,17 @@ def _nodes(self) -> TensorType: def __getitem__(self, key: str): if isinstance(self._ntype, list): - return {t: self._graph._get_n_emb(t, key, self._nodes) for t in self._ntype} + return { + t: self._graph._get_n_emb(t, key, self._nodes) 
+ for t in self._ntype + if self._graph._has_n_emb(t, key) + } else: - return self._graph._get_n_emb(self._ntype, key, self._nodes) + return ( + self._graph._get_n_emb(self._ntype, key, self._nodes) + if self._graph._has_n_emb(self._ntype, key) + else None + ) def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): if isinstance(self._ntype, list): From 2d522b1c2b68e9888670efdf6f088faee23b4994 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 10 Jul 2024 14:54:42 -0700 Subject: [PATCH 31/47] move worker init to utility --- .../tests/dataloading/test_dataloader_mg.py | 31 +------------- .../cugraph_dgl/tests/test_graph_mg.py | 39 +----------------- python/cugraph-dgl/cugraph_dgl/tests/utils.py | 41 +++++++++++++++++++ 3 files changed, 45 insertions(+), 66 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py index 39ce55d7616..3eabdf454e2 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py @@ -12,7 +12,6 @@ # limitations under the License. import pytest -import os import numpy as np @@ -23,41 +22,15 @@ from cugraph.gnn import ( cugraph_comms_create_unique_id, - cugraph_comms_init, cugraph_comms_shutdown, ) +from utils import init_pytorch_worker + torch = import_optional("torch") dgl = import_optional("dgl") -def init_pytorch_worker(rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=rank, - ) - - import cupy - - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) - - def run_test_dataloader_basic_homogeneous(rank, world_size, uid): init_pytorch_worker(rank, world_size, uid) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py index 8e469519433..cabeecff758 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import pytest @@ -26,54 +25,20 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph.gnn import ( - cugraph_comms_init, cugraph_comms_shutdown, cugraph_comms_create_unique_id, cugraph_comms_get_raft_handle, ) +from utils import init_pytorch_worker pylibwholegraph = import_optional("pylibwholegraph") torch = import_optional("torch") dgl = import_optional("dgl") -def init_pytorch_worker(rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=rank, - ) - - import cupy - - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) - - pylibwholegraph.torch.initialize.init( - rank, - world_size, - rank, - world_size, - ) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) - - def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): - init_pytorch_worker(rank, world_size, uid) + init_pytorch_worker(rank, world_size, uid, init_wholegraph=True) df = karate.get_edgelist() df.src = df.src.astype("int64") diff --git a/python/cugraph-dgl/cugraph_dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py index 09c267099e5..fa4eb05f297 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/utils.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/utils.py @@ -10,7 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
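For reference, the helper consolidated into tests/utils.py below is meant to be called from spawned per-GPU worker processes; a hedged sketch of a typical driver (assumes two GPUs; the worker body is elided):

import torch.multiprocessing as mp

from cugraph.gnn import cugraph_comms_create_unique_id
from cugraph_dgl.tests.utils import init_pytorch_worker


def worker(rank, world_size, uid):
    # Sets up RMM, CuPy, NCCL, and cuGraph comms for this rank.
    init_pytorch_worker(rank, world_size, uid)
    ...  # test body runs here


if __name__ == "__main__":
    uid = cugraph_comms_create_unique_id()
    world_size = 2  # assumption: two GPUs available
    mp.spawn(worker, args=(world_size, uid), nprocs=world_size)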
+ +import os + from cugraph.utilities.utils import import_optional +from cugraph.gnn import cugraph_comms_init th = import_optional("torch") @@ -111,3 +115,40 @@ def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir): assert cugraph_o.num_edges() == dgl_o.num_edges() for etype in dgl_o.canonical_etypes: assert dgl_o.num_edges(etype) == cugraph_o.num_edges(etype) + + +def init_pytorch_worker(rank, world_size, cugraph_id, init_wholegraph=False): + import rmm + + rmm.reinitialize( + devices=rank, + ) + + import cupy + + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + th.cuda.set_device(rank) + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + th.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + if init_wholegraph: + import pylibwholegraph + + pylibwholegraph.torch.initialize.init( + rank, + world_size, + rank, + world_size, + ) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) From e1fa6e06ac9bf64d7a76ff0e45b8c4382d8b403d Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 10 Jul 2024 15:17:43 -0700 Subject: [PATCH 32/47] revert none return, add check --- .../tests/dataloading/test_dataloader_mg.py | 2 +- python/cugraph-dgl/cugraph_dgl/tests/test_graph.py | 2 ++ .../cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py | 3 ++- python/cugraph-dgl/cugraph_dgl/view.py | 12 ++---------- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py index 3eabdf454e2..b32233f16a6 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py @@ -25,7 +25,7 @@ cugraph_comms_shutdown, ) -from utils import init_pytorch_worker +from cugraph_dgl.tests.utils import init_pytorch_worker torch = import_optional("torch") dgl = import_optional("dgl") diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py index 0c1fb088198..a60db97b8d6 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py @@ -52,6 +52,8 @@ def test_graph_make_homogeneous_graph(direction): assert ( graph.nodes() == torch.arange(num_nodes, dtype=torch.int64, device="cuda") ).all() + + assert graph.nodes[None]["x"] is not None assert (graph.nodes[None]["x"] == torch.as_tensor(node_x, device="cuda")).all() assert ( graph.nodes[None]["num"] diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py index cabeecff758..eedda664c52 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -30,7 +30,7 @@ cugraph_comms_get_raft_handle, ) -from utils import init_pytorch_worker +from .utils import init_pytorch_worker pylibwholegraph = import_optional("pylibwholegraph") torch = import_optional("torch") @@ -75,6 +75,7 @@ def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): == torch.arange(global_num_nodes, dtype=torch.int64, device="cuda") ).all() ix = torch.arange(len(node_x) * rank, len(node_x) * (rank + 1), dtype=torch.int64) + assert graph.nodes[ix]["x"] 
is not None assert (graph.nodes[ix]["x"] == torch.as_tensor(node_x, device="cuda")).all() assert ( diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index 2e836591757..dbc53e73b6a 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -61,11 +61,7 @@ def __getitem__(self, key: str): if self._graph._has_e_emb(t, key) } - return ( - self._graph._get_e_emb(self._etype, key, self._edges) - if self._graph._has_e_emb(self._etype, key) - else None - ) + return self._graph._get_e_emb(self._etype, key, self._edges) def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): if isinstance(self._etype, list): @@ -166,11 +162,7 @@ def __getitem__(self, key: str): if self._graph._has_n_emb(t, key) } else: - return ( - self._graph._get_n_emb(self._ntype, key, self._nodes) - if self._graph._has_n_emb(self._ntype, key) - else None - ) + return self._graph._get_n_emb(self._ntype, key, self._nodes) def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): if isinstance(self._ntype, list): From 85299878533b4c80b39d57b2bda5ad50b336fed6 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 10 Jul 2024 16:11:49 -0700 Subject: [PATCH 33/47] style --- python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py | 2 +- python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py index 1b4233b14e4..e220b93f738 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py @@ -273,7 +273,7 @@ def __del__(self): _clean_directory(self._sampling_output_dir) -def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series : +def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series: num_batches = (n_output_rows + batch_size - 1) // batch_size print(f"Number of batches = {num_batches}".format(num_batches)) batch_ar = cp.arange(0, num_batches).repeat(batch_size) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index e1072e6a24d..21b70b05f3a 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -38,7 +38,7 @@ def __init__( device: Union[int, str, "torch.device"] = None, use_ddp: bool = False, ddp_seed: int = 0, - batch_size: int= 1, + batch_size: int = 1, drop_last: bool = False, shuffle: bool = False, use_prefetch_thread: Optional[bool] = None, From 89f4ef44bb3400b86f8022fbd0a1ffa008d4dfe5 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 22 Jul 2024 09:20:15 -0700 Subject: [PATCH 34/47] use global communicator --- python/cugraph-pyg/cugraph_pyg/data/feature_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index a3715d3ddf4..ce5b186b2fa 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -169,7 +169,7 @@ def __init__(self, memory_type="distributed", location="cpu"): self.__features = {} - self.__wg_comm = wgth.get_local_node_communicator() + self.__wg_comm = wgth.get_global_node_communicator() self.__wg_type = 
memory_type self.__wg_location = location From 4d82ee0879efce80a605e0b7efabf1aaaacb1aa4 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 22 Jul 2024 12:44:07 -0700 Subject: [PATCH 35/47] global --- python/cugraph-pyg/cugraph_pyg/data/feature_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index ce5b186b2fa..b6450e7b192 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -169,7 +169,7 @@ def __init__(self, memory_type="distributed", location="cpu"): self.__features = {} - self.__wg_comm = wgth.get_global_node_communicator() + self.__wg_comm = wgth.get_global_communicator() self.__wg_type = memory_type self.__wg_location = location From 2b160bfa37a2a346bd3553efcb2fd1b6d2cc4ab0 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 24 Jul 2024 12:08:30 -0700 Subject: [PATCH 36/47] use int64 to store # edges --- python/cugraph-pyg/cugraph_pyg/data/graph_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 622b68d37e2..e086bf07b1f 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -271,7 +271,7 @@ def __get_edgelist(self): torch.tensor( [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda", - dtype=torch.int32, + dtype=torch.int64, ) ) From 6db236c13149b7c53d1a05df8501130172d596c1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 25 Jul 2024 13:05:50 -0700 Subject: [PATCH 37/47] example --- .../all_cuda-118_arch-x86_64.yaml | 2 + .../all_cuda-125_arch-x86_64.yaml | 2 + dependencies.yaml | 2 + .../dataloading/utils/sampling_helpers.py | 2 +- .../graphsage/node-classification-dask.py | 273 ++++++++++++++++++ .../examples/graphsage/node-classification.py | 37 ++- 6 files changed, 304 insertions(+), 14 deletions(-) create mode 100644 python/cugraph-dgl/examples/graphsage/node-classification-dask.py diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 5474c087532..24d59ba5865 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -42,6 +42,7 @@ dependencies: - numpy>=1.23,<2.0a0 - numpydoc - nvcc_linux-64=11.8 +- ogb - openmpi - packaging>=21 - pandas @@ -74,6 +75,7 @@ dependencies: - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 - torchdata +- torchmetrics - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 54049a92061..bf7bcc88c51 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -47,6 +47,7 @@ dependencies: - numba>=0.57 - numpy>=1.23,<2.0a0 - numpydoc +- ogb - openmpi - packaging>=21 - pandas @@ -79,6 +80,7 @@ dependencies: - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 - torchdata +- torchmetrics - ucx-proc=*=gpu - ucx-py==0.39.*,>=0.0.0a0 - wget diff --git a/dependencies.yaml b/dependencies.yaml index 5ffbcbab5fc..cd7cda89884 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -670,6 +670,8 @@ dependencies: - &pytorch_conda pytorch>=2.0,<2.2.0a0 - torchdata - pydantic + - ogb + - torchmetrics specific: - 
output_types: [requirements] diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index e8c305b6ba1..3b7e4502134 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -593,7 +593,7 @@ def _create_homogeneous_blocks_from_csc( blocks = [] seednodes_range = None - for mfg in mfgs: + for mfg in reversed(mfgs): block_mfg = _create_homogeneous_dgl_block_from_tensor_d( { "sources": mfg.src_ids(), diff --git a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py new file mode 100644 index 00000000000..03cf49bd939 --- /dev/null +++ b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py @@ -0,0 +1,273 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Example modified from: +# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py + +# Ignore Warning +import warnings +import time +import cugraph_dgl +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchmetrics.functional as MF +import dgl +import dgl.nn as dglnn +from dgl.data import AsNodePredDataset +from dgl.dataloading import ( + DataLoader, + NeighborSampler, + MultiLayerFullNeighborSampler, +) +from ogb.nodeproppred import DglNodePropPredDataset +import tqdm +import argparse + +warnings.filterwarnings("ignore") + + +def set_allocators(): + import rmm + import cudf + import cupy + from rmm.allocators.torch import rmm_torch_allocator + from rmm.allocators.cupy import rmm_cupy_allocator + + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) + cudf.set_option("spill", True) + + +class SAGE(nn.Module): + def __init__(self, in_size, hid_size, out_size): + super().__init__() + self.layers = nn.ModuleList() + # three-layer GraphSAGE-mean + self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean")) + self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean")) + self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean")) + self.dropout = nn.Dropout(0.5) + self.hid_size = hid_size + self.out_size = out_size + + def forward(self, blocks, x): + h = x + for l_id, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l_id != len(self.layers) - 1: + h = F.relu(h) + h = self.dropout(h) + return h + + def inference(self, g, device, batch_size): + """Conduct layer-wise inference to get all the node embeddings.""" + all_node_ids = torch.arange(0, g.num_nodes()).to(device) + feat = g.get_node_storage(key="feat", ntype="_N").fetch( + all_node_ids, device=device + ) + + sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) + dataloader = DataLoader( + g, + 
torch.arange(g.num_nodes()).to(g.device), + sampler, + device=device, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=0, + ) + buffer_device = torch.device("cpu") + pin_memory = buffer_device != device + + for l_id, layer in enumerate(self.layers): + y = torch.empty( + g.num_nodes(), + self.hid_size if l_id != len(self.layers) - 1 else self.out_size, + device=buffer_device, + pin_memory=pin_memory, + ) + feat = feat.to(device) + for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): + x = feat[input_nodes] + h = layer(blocks[0], x) # len(blocks) = 1 + if l_id != len(self.layers) - 1: + h = F.relu(h) + h = self.dropout(h) + # by design, our output nodes are contiguous + y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device) + feat = y + return y + + +def evaluate(model, graph, dataloader): + model.eval() + ys = [] + y_hats = [] + for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + with torch.no_grad(): + if isinstance(graph.ndata["feat"], dict): + x = graph.ndata["feat"]["_N"][input_nodes] + label = graph.ndata["label"]["_N"][output_nodes] + else: + x = graph.ndata["feat"][input_nodes] + label = graph.ndata["label"][output_nodes] + ys.append(label) + y_hats.append(model(blocks, x)) + num_classes = y_hats[0].shape[1] + return MF.accuracy( + torch.cat(y_hats), + torch.cat(ys), + task="multiclass", + num_classes=num_classes, + ) + + +def layerwise_infer(device, graph, nid, model, batch_size): + model.eval() + with torch.no_grad(): + pred = model.inference(graph, device, batch_size) # pred in buffer_device + pred = pred[nid] + label = graph.ndata["label"] + if isinstance(label, dict): + label = label["_N"] + label = label[nid].to(device).to(pred.device) + num_classes = pred.shape[1] + return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) + + +def train(args, device, g, dataset, model): + # create sampler & dataloader + train_idx = dataset.train_idx.to(device) + val_idx = dataset.val_idx.to(device) + + use_uva = args.mode == "mixed" + batch_size = 1024 + fanouts = [5, 10, 15] + sampler = NeighborSampler(fanouts) + train_dataloader = DataLoader( + g, + train_idx, + sampler, + device=device, + batch_size=batch_size, + shuffle=True, + drop_last=False, + num_workers=0, + use_uva=use_uva, + ) + val_dataloader = DataLoader( + g, + val_idx, + sampler, + device=device, + batch_size=batch_size, + shuffle=True, + drop_last=False, + num_workers=0, + use_uva=use_uva, + ) + + opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4) + + for epoch in range(10): + model.train() + total_loss = 0 + st = time.time() + for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): + if isinstance(g.ndata["feat"], dict): + x = g.ndata["feat"]["_N"][input_nodes] + y = g.ndata["label"]["_N"][output_nodes] + else: + x = g.ndata["feat"][input_nodes] + y = g.ndata["label"][output_nodes] + + print(x.shape, input_nodes.shape, y.shape, output_nodes.shape) + print([b.num_nodes() for b in blocks]) + + y_hat = model(blocks, x) + loss = F.cross_entropy(y_hat, y) + opt.zero_grad() + loss.backward() + opt.step() + total_loss += loss.item() + + et = time.time() + + print(f"Time taken for epoch {epoch} with batch_size {batch_size} = {et-st} s") + acc = evaluate(model, g, val_dataloader) + print( + "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format( + epoch, total_loss / (it + 1), acc.item() + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + 
default="gpu_cugraph_dgl", + choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"], + help="Training mode." + " 'cpu' for CPU training," + " 'mixed' for CPU-GPU mixed training, " + " 'gpu_dgl' for pure-GPU training, " + " 'gpu_cugraph_dgl' for pure-GPU training.", + ) + args = parser.parse_args() + if not torch.cuda.is_available(): + args.mode = "cpu" + if args.mode == "gpu_cugraph_dgl": + set_allocators() + print(f"Training in {args.mode} mode.") + + # load and preprocess dataset + print("Loading data") + dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products")) + g = dataset[0] + g = dgl.add_self_loop(g) + if args.mode == "gpu_cugraph_dgl": + g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda")) + del dataset.g + + else: + g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu") + device = torch.device( + "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda" + ) + + # create GraphSAGE model + feat_shape = ( + g.get_node_storage(key="feat", ntype="_N") + .fetch(torch.LongTensor([0]).to(device), device=device) + .shape[1] + ) + print(feat_shape) + # no ndata in cugraph storage object + in_size = feat_shape + out_size = dataset.num_classes + model = SAGE(in_size, 256, out_size).to(device) + + # model training + print("Training...") + train(args, device, g, dataset, model) + + # test the model + print("Testing...") + acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096) + print("Test Accuracy {:.4f}".format(acc.item())) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py index 539fd86d136..a8a542f8017 100644 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ b/python/cugraph-dgl/examples/graphsage/node-classification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -19,6 +19,7 @@ import warnings import time import cugraph_dgl +import cugraph_dgl.dataloading import torch import torch.nn as nn import torch.nn.functional as F @@ -80,8 +81,13 @@ def inference(self, g, device, batch_size): all_node_ids, device=device ) - sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) - dataloader = DataLoader( + if isinstance(g, cugraph_dgl.Graph): + sampler = cugraph_dgl.sampling.NeighborSampler(-1) + loader_cls = cugraph_dgl.dataloading.FutureDataLoader + else: + sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) + loader_cls = DataLoader + dataloader = loader_cls( g, torch.arange(g.num_nodes()).to(g.device), sampler, @@ -158,8 +164,13 @@ def train(args, device, g, dataset, model): use_uva = args.mode == "mixed" batch_size = 1024 fanouts = [5, 10, 15] - sampler = NeighborSampler(fanouts) - train_dataloader = DataLoader( + if isinstance(g, cugraph_dgl.Graph): + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) + loader_cls = cugraph_dgl.dataloading.FutureDataLoader + else: + sampler = NeighborSampler(fanouts) + loader_cls = DataLoader + train_dataloader = loader_cls( g, train_idx, sampler, @@ -170,7 +181,7 @@ def train(args, device, g, dataset, model): num_workers=0, use_uva=use_uva, ) - val_dataloader = DataLoader( + val_dataloader = loader_cls( g, val_idx, sampler, @@ -195,6 +206,9 @@ def train(args, device, g, dataset, model): else: x = g.ndata["feat"][input_nodes] y = g.ndata["label"][output_nodes] + + print(x.shape, input_nodes.shape, y.shape, output_nodes.shape) + print([b.num_nodes() for b in blocks]) y_hat = model(blocks, x) loss = F.cross_entropy(y_hat, y) opt.zero_grad() @@ -238,7 +252,7 @@ def train(args, device, g, dataset, model): g = dataset[0] g = dgl.add_self_loop(g) if args.mode == "gpu_cugraph_dgl": - g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda")) + g = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g.to("cuda")) del dataset.g else: @@ -248,12 +262,9 @@ def train(args, device, g, dataset, model): ) # create GraphSAGE model - feat_shape = ( - g.get_node_storage(key="feat", ntype="_N") - .fetch(torch.LongTensor([0]).to(device), device=device) - .shape[1] - ) - # no ndata in cugraph storage object + feat_shape = g.ndata["feat"].shape[1] + print(feat_shape) + in_size = feat_shape out_size = dataset.num_classes model = SAGE(in_size, 256, out_size).to(device) From 7a3d38f3cddb5743120c91d28470b3ad1c85524a Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 25 Jul 2024 13:06:26 -0700 Subject: [PATCH 38/47] reverse mfgs --- .../cugraph_dgl/dataloading/utils/sampling_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index e8c305b6ba1..3b7e4502134 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -593,7 +593,7 @@ def _create_homogeneous_blocks_from_csc( blocks = [] seednodes_range = None - for mfg in mfgs: + for mfg in reversed(mfgs): block_mfg = _create_homogeneous_dgl_block_from_tensor_d( { "sources": mfg.src_ids(), From 710741c822dd3ac41e88a0e4860cdfa69728ab7d Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 30 Jul 2024 07:20:55 -0700 Subject: [PATCH 39/47] node classification --- python/cugraph-dgl/cugraph_dgl/graph.py | 16 +- 
 python/cugraph-dgl/cugraph_dgl/view.py        |  36 +++
 .../graphsage/node-classification-dask.py     |   3 -
 .../examples/graphsage/node-classification.py |   2 -
 .../multi_trainer_MG_example/model.py         |   6 +-
 .../{workflow.py => workflow_dask.py}         |   4 +
 .../multi_trainer_MG_example/workflow_mnmg.py |   0
 .../multi_trainer_MG_example/workflow_snmg.py | 228 ++++++++++++++++++
 8 files changed, 280 insertions(+), 15 deletions(-)
 rename python/cugraph-dgl/examples/multi_trainer_MG_example/{workflow.py => workflow_dask.py} (98%)
 create mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
 create mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py

diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py
index 2eba13c6958..1341719a4b4 100644
--- a/python/cugraph-dgl/cugraph_dgl/graph.py
+++ b/python/cugraph-dgl/cugraph_dgl/graph.py
@@ -29,6 +29,7 @@
     HeteroNodeDataView,
     HeteroEdgeView,
     HeteroEdgeDataView,
+    EmbeddingView,
 )
@@ -567,8 +568,8 @@
     def _has_n_emb(self, ntype: str, emb_name: str) -> bool:
         return (ntype, emb_name) in self.__ndata_storage

     def _get_n_emb(
-        self, ntype: str, emb_name: str, u: Union[str, TensorType]
-    ) -> "torch.Tensor":
+        self, ntype: Union[str, None], emb_name: str, u: Union[str, TensorType]
+    ) -> Union["torch.Tensor", "cugraph_dgl.view.EmbeddingView"]:
         """
         Gets the embedding of a single node type.
         Unlike DGL, this function takes the string node
@@ -583,11 +584,11 @@ def _get_n_emb(
         u: Union[str, TensorType]
             Nodes to get the representation of, or ALL
            to get the representation of all nodes of
-            the given type.
+            the given type (returns embedding view).

         Returns
         -------
-        torch.Tensor
+        Union[torch.Tensor, cugraph_dgl.view.EmbeddingView]
            The embedding of the given edge type with the given embedding name.
        """

@@ -598,9 +599,11 @@ def _get_n_emb(
             raise ValueError("Must provide the node type for a heterogeneous graph")

         if dgl.base.is_all(u):
-            u = torch.arange(self.num_nodes(ntype), dtype=self.idtype, device="cpu")
+            return EmbeddingView(
+                self.__ndata_storage[ntype, emb_name], self.num_nodes(ntype)
+            )

         try:
             return self.__ndata_storage[ntype, emb_name].fetch(
                 _cast_to_torch_tensor(u), "cuda"
             )
@@ -644,7 +646,9 @@ def _get_e_emb(
         etype = self.to_canonical_etype(etype)

         if dgl.base.is_all(u):
-            u = torch.arange(self.num_edges(etype), dtype=self.idtype, device="cpu")
+            return EmbeddingView(
+                self.__edata_storage[etype, emb_name], self.num_edges(etype)
+            )

         try:
             return self.__edata_storage[etype, emb_name].fetch(
diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py
index dbc53e73b6a..4c980806ec7 100644
--- a/python/cugraph-dgl/cugraph_dgl/view.py
+++ b/python/cugraph-dgl/cugraph_dgl/view.py
@@ -12,6 +12,8 @@
 # limitations under the License.
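Net effect of the graph.py change above: requesting ALL rows of ndata/edata now returns a lazy EmbeddingView (defined in view.py below) instead of materializing the whole tensor. A small sketch of the intended behavior (assumes the single-GPU Graph API from this series and a CUDA device):

import torch

import cugraph_dgl

g = cugraph_dgl.Graph()
g.add_nodes(3, data={"feat": torch.randn(3, 4, device="cuda")})

feat = g.ndata["feat"]  # EmbeddingView; nothing is fetched yet
assert feat.shape == (3, 4)  # probes a single row to learn the width
x = feat[torch.tensor([0, 2])]  # fetches only the requested rows to the GPU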
+import warnings
+
 from collections import defaultdict
 from collections.abc import MutableMapping
 from typing import Union, Dict, List, Tuple
@@ -20,11 +22,45 @@

 import cugraph_dgl
 from cugraph_dgl.typing import TensorType
+from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor

 torch = import_optional("torch")
 dgl = import_optional("dgl")


+class EmbeddingView:
+    def __init__(self, storage: "dgl.storages.base.FeatureStorage", ld: int):
+        self.__ld = ld
+        self.__storage = storage
+
+    def __getitem__(self, u: TensorType) -> "torch.Tensor":
+        u = _cast_to_torch_tensor(u)
+        try:
+            return self.__storage.fetch(
+                u,
+                "cuda",
+            )
+        except RuntimeError as ex:
+            warnings.warn(
+                "Got error accessing data, trying again with index on device: "
+                + str(ex)
+            )
+            return self.__storage.fetch(
+                u.cuda(),
+                "cuda",
+            )
+
+    @property
+    def shape(self) -> "torch.Size":
+        try:
+            f = self.__storage.fetch(torch.tensor([0]), "cpu")
+        except RuntimeError:
+            f = self.__storage.fetch(torch.tensor([0], device="cuda"), "cuda")
+        sz = list(f.shape)
+        sz[0] = self.__ld
+        return torch.Size(tuple(sz))
+
+
 class HeteroEdgeDataView(MutableMapping):
     """
     Duck-typed version of DGL's HeteroEdgeDataView.
diff --git a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py
index 03cf49bd939..992669e4284 100644
--- a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py
+++ b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py
@@ -196,9 +196,6 @@ def train(args, device, g, dataset, model):
                 x = g.ndata["feat"][input_nodes]
                 y = g.ndata["label"][output_nodes]

-            print(x.shape, input_nodes.shape, y.shape, output_nodes.shape)
-            print([b.num_nodes() for b in blocks])
-
             y_hat = model(blocks, x)
             loss = F.cross_entropy(y_hat, y)
             opt.zero_grad()
diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py
index a8a542f8017..731cbcce97e 100644
--- a/python/cugraph-dgl/examples/graphsage/node-classification.py
+++ b/python/cugraph-dgl/examples/graphsage/node-classification.py
@@ -207,8 +207,6 @@ def train(args, device, g, dataset, model):
                 x = g.ndata["feat"][input_nodes]
                 y = g.ndata["label"][output_nodes]

-            print(x.shape, input_nodes.shape, y.shape, output_nodes.shape)
-            print([b.num_nodes() for b in blocks])
             y_hat = model(blocks, x)
             loss = F.cross_entropy(y_hat, y)
             opt.zero_grad()
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
index a6f771e4b51..acdd832424b 100644
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
@@ -114,15 +114,13 @@ def layerwise_infer(graph, nid, model, batch_size, device):


 def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid):
-    g.ndata["feat"]["_N"] = g.ndata["feat"]["_N"].to("cuda")
-    g.ndata["label"]["_N"] = g.ndata["label"]["_N"].to("cuda")
     st = time.time()
     model.train()
     for epoch in range(num_epochs):
         total_loss = 0
         for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
-            x = g.ndata["feat"]["_N"][input_nodes]
-            y = g.ndata["label"]["_N"][output_nodes]
+            x = g.ndata["feat"][input_nodes]
+            y = g.ndata["label"][output_nodes]
             y_hat = model(blocks, x)
             y = y.squeeze(1)
             loss = F.cross_entropy(y_hat, y)
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py
similarity index 98%
rename from python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
rename to python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py
index 474f17dc2bb..8ca40bd1e2a 100644
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py
@@ -204,6 +204,10 @@ def run_workflow(rank, devices, scheduler_address):
     n_epochs = 10
     total_st = time.time()
     opt = torch.optim.Adam(model.parameters(), lr=0.01)
+
+    gs.ndata["feat"] = gs.ndata["feat"].to("cuda")
+    gs.ndata["label"] = gs.ndata["label"].to("cuda")
+
     train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx)
     torch.distributed.barrier()
     total_et = time.time()
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py
new file mode 100644
index 00000000000..f043940486b
--- /dev/null
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
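A note on the workflow_dask.py hunk above: since model.py's train_model no longer moves data itself, the Dask workflow pins features and labels to the GPU once, before the epoch loop, and the per-batch gathers inside train_model then stay on device. Sketched (gs is the graph storage from this workflow):

# One-time host-to-device move before training starts:
gs.ndata["feat"] = gs.ndata["feat"].to("cuda")
gs.ndata["label"] = gs.ndata["label"].to("cuda")

# Each iteration then gathers rows that are already on device:
x = gs.ndata["feat"][input_nodes]
y = gs.ndata["label"][output_nodes]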
+
+import dgl
+import torch
+import time
+import tempfile
+import argparse
+import os
+
+import cugraph_dgl
+
+from cugraph.gnn import (
+    cugraph_comms_init,
+    cugraph_comms_shutdown,
+    cugraph_comms_create_unique_id,
+)
+
+from pylibwholegraph.torch.initialize import (
+    init as wm_init,
+    finalize as wm_finalize,
+)
+
+# Allow computation on objects that are larger than GPU memory
+# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
+os.environ["CUDF_SPILL"] = "1"
+
+
+def initialize_pytorch_worker(dev_id):
+    import cupy as cp
+    import rmm
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    dev = cp.cuda.Device(
+        dev_id
+    )  # Create cuda context on the right gpu, defaults to gpu-0
+    dev.use()
+    rmm.reinitialize(
+        pool_allocator=True,
+        initial_pool_size=10e9,
+        maximum_pool_size=15e9,
+        devices=[dev_id],
+    )
+
+    from cugraph.testing.mg_utils import enable_spilling
+
+    enable_spilling()
+
+    torch.cuda.set_device(dev_id)
+    cp.cuda.set_allocator(rmm_cupy_allocator)
+    print("device_id", dev_id, flush=True)
+
+
+def load_dgl_dataset(dataset_name="ogbn-products"):
+    from ogb.nodeproppred import DglNodePropPredDataset
+
+    dataset = DglNodePropPredDataset(name=dataset_name)
+    split_idx = dataset.get_idx_split()
+    train_idx, valid_idx, test_idx = (
+        split_idx["train"],
+        split_idx["valid"],
+        split_idx["test"],
+    )
+    g, label = dataset[0]
+    g.ndata["label"] = label
+    if len(g.etypes) <= 1:
+        g = dgl.add_self_loop(g)
+    else:
+        for etype in g.etypes:
+            if etype[0] == etype[2]:
+                # only add self loops for src->dst
+                g = dgl.add_self_loop(g, etype=etype)
+
+    g = g.int()
+    train_idx = train_idx.int()
+    valid_idx = valid_idx.int()
+    test_idx = test_idx.int()
+    return g, train_idx, valid_idx, test_idx, dataset.num_classes
+
+
+def create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size):
+    (g, train_idx, valid_idx, test_idx, num_classes) = dataset
+    # Partition the data
+    cg = cugraph_dgl.Graph(
+        is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph"
+    )
+
+    nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank]
+    ndata = {k: g.ndata[k][nix].cuda() for k in g.ndata.keys()}
+
+    eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank]
+    src, dst = g.all_edges(form="uv", order="eid")
+    edata = {k: g.edata[k][eix].cuda() for k in g.edata.keys()}
+
+    cg.add_nodes(g.num_nodes(), data=ndata)
+    cg.add_edges(
+        torch.tensor_split(src, world_size)[rank].cuda(),
+        torch.tensor_split(dst, world_size)[rank].cuda(),
+        data=edata,
+    )
+
+    return (
+        cg,
+        torch.tensor_split(train_idx, world_size)[rank].to(torch.int64),
+        torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64),
+        torch.tensor_split(test_idx, world_size)[rank].to(torch.int64),
+        num_classes,
+    )
+
+
+def create_dataloader(gs, train_idx, device, temp_dir, stage):
+    import cugraph_dgl
+
+    temp_path = os.path.join(temp_dir, f"{stage}_{device}")
+    os.mkdir(temp_path)
+
+    sampler = cugraph_dgl.dataloading.NeighborSampler(
+        [10, 20],
+        directory=temp_path,
+        batches_per_partition=10,
+    )
+    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
+        gs,
+        train_idx,
+        sampler,
+        device=device,  # Put the sampled MFGs on CPU or GPU
+        use_ddp=True,  # Make it work with distributed data parallel
+        batch_size=1024,
+        shuffle=False,  # Whether to shuffle the nodes for every epoch
+        drop_last=False,
+        num_workers=0,
+    )
+    return dataloader
+
+
+def run_workflow(rank, world_size, cugraph_id, dataset, temp_dir):
+    from model import Sage, train_model
+
+    # Below sets gpu_number
+    dev_id = rank
+    initialize_pytorch_worker(dev_id)
+    device = torch.device(f"cuda:{dev_id}")
+
+    # Pytorch training worker initialization
+    dist_init_method = "tcp://{master_ip}:{master_port}".format(
+        master_ip="127.0.0.1", master_port="12346"
+    )
+
+    torch.distributed.init_process_group(
+        backend="nccl",
+        init_method=dist_init_method,
+        world_size=world_size,
+        rank=rank,
+    )
+
+    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
+    wm_init(rank, world_size, rank, world_size)
+
+    print(f"rank {rank}.", flush=True)
+    print("Initialized across GPUs.")
+
+    (
+        gs,
+        train_idx,
+        valid_idx,
+        test_idx,
+        num_classes,
+    ) = create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size)
+    del dataset
+
+    torch.distributed.barrier()
+    print(f"Loading graph to worker {rank} is complete", flush=True)
+
+    dataloader = create_dataloader(gs, train_idx, device, temp_dir, "train")
+    print("Dataloader Creation Complete", flush=True)
+    num_feats = gs.ndata["feat"].shape[1]
+    hid_size = 256
+    # Load Training example
+    model = Sage(num_feats, hid_size, num_classes).to(device)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[device],
+        output_device=device,
+    )
+    torch.distributed.barrier()
+    n_epochs = 10
+    total_st = time.time()
+    opt = torch.optim.Adam(model.parameters(), lr=0.01)
+    train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx)
+    torch.distributed.barrier()
+    total_et = time.time()
+    print(
+        f"Total time taken on n_epochs {n_epochs} = {total_et-total_st} s",
+        f"measured by worker = {rank}",
+    )
+
+    wm_finalize()
+    cugraph_comms_shutdown()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", type=str, default="ogbn-products")
+    args = parser.parse_args()
+
+    from rmm.allocators.torch import rmm_torch_allocator
+
+    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+    # Create the uid needed for cuGraph comms
+    cugraph_id = cugraph_comms_create_unique_id()
+
+    ds = load_dgl_dataset(args.dataset)
+
+    world_size = torch.cuda.device_count()
+
+    with tempfile.TemporaryDirectory() as directory:
+        torch.multiprocessing.spawn(
+            run_workflow,
+            args=(world_size, cugraph_id, ds, directory),
+            nprocs=world_size,
+        )

From f943d9144afdb6d0409daed694454ee2ad6013f0 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Tue, 30 Jul 2024 13:04:14 -0700
Subject: [PATCH 40/47] mnmg

---
 .../multi_trainer_MG_example/workflow_mnmg.py | 228 ++++++++++++++++++
 1 file changed, 228 insertions(+)

diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
index e69de29bb2d..f043940486b 100644
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import dgl +import torch +import time +import tempfile +import argparse +import os + +import cugraph_dgl + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, +) + +from pylibwholegraph.torch.initialize import ( + init as wm_init, + finalize as wm_finalize, +) + +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory +os.environ["CUDF_SPILL"] = "1" + + +def initalize_pytorch_worker(dev_id): + import cupy as cp + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + dev = cp.cuda.Device( + dev_id + ) # Create cuda context on the right gpu, defaults to gpu-0 + dev.use() + rmm.reinitialize( + pool_allocator=True, + initial_pool_size=10e9, + maximum_pool_size=15e9, + devices=[dev_id], + ) + + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() + + torch.cuda.set_device(dev_id) + cp.cuda.set_allocator(rmm_cupy_allocator) + print("device_id", dev_id, flush=True) + + +def load_dgl_dataset(dataset_name="ogbn-products"): + from ogb.nodeproppred import DglNodePropPredDataset + + dataset = DglNodePropPredDataset(name=dataset_name) + split_idx = dataset.get_idx_split() + train_idx, valid_idx, test_idx = ( + split_idx["train"], + split_idx["valid"], + split_idx["test"], + ) + g, label = dataset[0] + g.ndata["label"] = label + if len(g.etypes) <= 1: + g = dgl.add_self_loop(g) + else: + for etype in g.etypes: + if etype[0] == etype[2]: + # only add self loops for src->dst + g = dgl.add_self_loop(g, etype=etype) + + g = g.int() + train_idx = train_idx.int() + valid_idx = valid_idx.int() + test_idx = test_idx.int() + return g, train_idx, valid_idx, test_idx, dataset.num_classes + + +def create_cugraph_graphstore_from_dgl_dataset( + dataset, rank, world_size +): + (g, train_idx, valid_idx, test_idx, num_classes) = dataset + # Partition the data + cg = cugraph_dgl.Graph(is_multi_gpu=True, ndata_storage='wholegraph', edata_storage='wholegraph') + + nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank] + ndata = { + k: g.ndata[k][nix].cuda() + for k in g.ndata.keys() + } + + eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank] + src, dst = g.all_edges(form='uv', order='eid') + edata = { + k: g.edata[k][eix].cuda() + for k in g.edata.keys() + } + + cg.add_nodes(g.num_nodes(), data=ndata) + cg.add_edges( + torch.tensor_split(src, world_size)[rank].cuda(), + torch.tensor_split(dst, world_size)[rank].cuda(), + data=edata, + ) + + return (cg, torch.tensor_split(train_idx, world_size)[rank].to(torch.int64), torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64), torch.tensor_split(test_idx, world_size)[rank].to(torch.int64), num_classes) + + +def create_dataloader(gs, train_idx, device, temp_dir, stage): + import cugraph_dgl + + temp_path = os.path.join(temp_dir, f'{stage}_{device}') + os.mkdir(temp_path) + + sampler = cugraph_dgl.dataloading.NeighborSampler([10, 20], directory=temp_path, batches_per_partition=10,) + dataloader = cugraph_dgl.dataloading.FutureDataLoader( + gs, + train_idx, + sampler, + device=device, # Put the sampled MFGs on CPU or GPU + use_ddp=True, # Make it work with distributed data parallel + batch_size=1024, + shuffle=False, # Whether to shuffle the nodes for every epoch + drop_last=False, + num_workers=0, + ) + return dataloader + + +def run_workflow(rank, world_size, cugraph_id, dataset, temp_dir): + from model import Sage, train_model + + # Below 
sets the GPU for this trainer process
+    dev_id = rank
+    initalize_pytorch_worker(dev_id)
+    device = torch.device(f"cuda:{dev_id}")
+
+    # PyTorch training worker initialization
+    dist_init_method = "tcp://{master_ip}:{master_port}".format(
+        master_ip="127.0.0.1", master_port="12346"
+    )
+
+    torch.distributed.init_process_group(
+        backend="nccl",
+        init_method=dist_init_method,
+        world_size=world_size,
+        rank=rank,
+    )
+
+    # Initialize cuGraph comms (using the shared uid) and WholeGraph on this rank
+    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
+    wm_init(rank, world_size, rank, world_size)
+
+    print(f"rank {rank}.", flush=True)
+    print("Initialized across GPUs.")
+
+    (
+        gs,
+        train_idx,
+        valid_idx,
+        test_idx,
+        num_classes,
+    ) = create_cugraph_graphstore_from_dgl_dataset(
+        dataset, rank, world_size,
+    )
+    del dataset
+
+    torch.distributed.barrier()
+    print(f"Loading graph to worker {rank} is complete", flush=True)
+
+    dataloader = create_dataloader(gs, train_idx, device, temp_dir, 'train')
+    print("Dataloader Creation Complete", flush=True)
+    num_feats = gs.ndata["feat"].shape[1]
+    hid_size = 256
+    # Create the model and wrap it with DistributedDataParallel
+    model = Sage(num_feats, hid_size, num_classes).to(device)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[device],
+        output_device=device,
+    )
+    torch.distributed.barrier()
+    n_epochs = 10
+    total_st = time.time()
+    opt = torch.optim.Adam(model.parameters(), lr=0.01)
+    train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx)
+    torch.distributed.barrier()
+    total_et = time.time()
+    print(
+        f"Total time taken on n_epochs {n_epochs} = {total_et-total_st} s",
+        f"measured by worker = {rank}",
+    )
+
+    wm_finalize()
+    cugraph_comms_shutdown()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', type=str, default='ogbn-products')
+    args = parser.parse_args()
+
+    from rmm.allocators.torch import rmm_torch_allocator
+    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+    # Create the uid needed for cuGraph comms
+    cugraph_id = cugraph_comms_create_unique_id()
+
+    ds = load_dgl_dataset(args.dataset)
+
+    world_size = torch.cuda.device_count()
+
+    with tempfile.TemporaryDirectory() as directory:
+        torch.multiprocessing.spawn(
+            run_workflow,
+            args=(world_size, cugraph_id, ds, directory),
+            nprocs=world_size,
+        )
From 7ba4d898ec76c3e7d0e7f19c3669c12075d4d6a6 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Tue, 30 Jul 2024 13:05:31 -0700
Subject: [PATCH 41/47] use global communicator

---
 python/cugraph-dgl/cugraph_dgl/features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py
index b4ff0049494..9dc009f4127 100644
--- a/python/cugraph-dgl/cugraph_dgl/features.py
+++ b/python/cugraph-dgl/cugraph_dgl/features.py
@@ -51,7 +51,7 @@ def __init__(
         location: str(optional, default='cpu')
             The location ('cpu' or 'cuda') where data is stored.
         """
-        self.__wg_comm = wgth.get_local_node_communicator()
+        self.__wg_comm = wgth.get_global_communicator()
+        # The global communicator spans every rank in the job, so feature
+        # tensors are shared across all nodes rather than only the local node.

         if len(tensor.shape) > 2:
             raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")
From 2d3a640ef042209c80ceeaab0c4fdb8438f83ee9 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Thu, 1 Aug 2024 10:56:46 -0700
Subject: [PATCH 42/47] fix partition function

---
 .../examples/multi_trainer_MG_example/workflow_mnmg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
index 741f2e4fa31..2bec1a3dbcf 100644
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
@@ -276,7 +276,7 @@ def run_workflow(
     meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json")

     if not args.skip_partition and global_rank == 0:
-        partition_data(*load_dgl_dataset(args.dataset_root, args.dataset))
+        partition_data(*load_dgl_dataset(args.dataset_root, args.dataset), edge_path, feature_path, label_path, meta_path)

     torch.distributed.barrier()
     print("loading partitions...")
From 994aca8975abdff7ab7c4a76d5e1270076e00bf9 Mon Sep 17 00:00:00 2001
From: Alexandria Barghi
Date: Thu, 1 Aug 2024 13:04:02 -0700
Subject: [PATCH 43/47] fix minor issues

---
 .../cugraph_dgl/dataloading/dataloader.py     |  4 +++
 .../dataloading/neighbor_sampler.py           |  2 +-
 .../multi_trainer_MG_example/model.py         |  7 +++---
 .../multi_trainer_MG_example/workflow_mnmg.py | 25 ++++++++++++-------
 4 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
index 21b70b05f3a..4f36353cb18 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
@@ -140,6 +140,10 @@ def __init__(
         self.__graph = graph
         self.__device = device

+    @property
+    def _batch_size(self):
+        return self.__batch_size
+
     @property
     def dataset(
         self,
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
index 1a35c3ea027..87d111adcba 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
@@ -194,7 +194,7 @@ def sample(
         if g.is_homogeneous:
             indices = torch.concat(list(indices))
-            ds.sample_from_nodes(indices, batch_size=batch_size)
+            ds.sample_from_nodes(indices.long(), batch_size=batch_size)
             return HomogeneousSampleReader(
                 ds.get_reader(), self.output_format, self.edge_dir
             )
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
index acdd832424b..1becd9682bb 100644
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -58,9 +58,8 @@ def inference(self, g, batch_size, device):
         # The nodes on each layer are of course split in batches.
+        # Layer-wise inference: each GNN layer is applied to every node with
+        # full (non-sampled) neighborhoods, reusing the previous layer's output,
+        # which avoids the neighborhood explosion of sampled inference.
all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.get_node_storage(key="feat", ntype="_N").fetch( - all_node_ids, device=device - ) + feat = g.ndata["feat"][all_node_ids].to(device) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler( 1, prefetch_node_feats=["feat"] ) diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py index 2bec1a3dbcf..b1878b37d4e 100644 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py @@ -13,7 +13,6 @@ import dgl import torch -import pandas import time import tempfile import argparse @@ -124,10 +123,12 @@ def partition_data( ) nix = torch.arange(g.num_nodes()) - ndata = pandas.DataFrame({k: g.ndata[k][nix] for k in g.ndata.keys()}) for (r, f) in enumerate(torch.tensor_split(nix, world_size)): - rank_path = os.path.join(feature_path, f"rank={r}_feat.parquet") - ndata.iloc[f].to_parquet(rank_path) + feat_path = os.path.join(feature_path, f"rank={r}_feat.pt") + torch.save(g.ndata["feat"][f], feat_path) + + label_f_path = os.path.join(feature_path, f"rank={r}_label.pt") + torch.save(g.ndata["label"][f], label_f_path) # Split and save labels os.makedirs( @@ -167,10 +168,9 @@ def load_partitioned_data(rank, edge_path, feature_path, label_path, meta_path): ) # Load features - ndata_df = pandas.read_parquet( - os.path.join(feature_path, f"rank={rank}_feat.parquet") - ) - ndata = {col: torch.as_tensor(s.values) for col, s in ndata_df.items()} + feat_t = torch.load(os.path.join(feature_path, f"rank={rank}_feat.pt")) + label_f_t = torch.load(os.path.join(feature_path, f"rank={rank}_label.pt")) + ndata = {"feat": feat_t, "label": label_f_t} g.add_nodes(meta["num_nodes"], data=ndata) # Load edge index @@ -191,6 +191,7 @@ def create_dataloader(gs, train_idx, device, temp_dir, stage): directory=temp_path, batches_per_partition=10, ) + dataloader = cugraph_dgl.dataloading.FutureDataLoader( gs, train_idx, @@ -276,7 +277,13 @@ def run_workflow( meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") if not args.skip_partition and global_rank == 0: - partition_data(*load_dgl_dataset(args.dataset_root, args.dataset), edge_path, feature_path, label_path, meta_path) + partition_data( + *load_dgl_dataset(args.dataset_root, args.dataset), + edge_path, + feature_path, + label_path, + meta_path, + ) torch.distributed.barrier() print("loading partitions...") From d1c8494178cca2d0522be4fbac5078310706e97d Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 1 Aug 2024 13:39:42 -0700 Subject: [PATCH 44/47] remove dask example --- .../multi_trainer_MG_example/workflow_dask.py | 248 ------------------ 1 file changed, 248 deletions(-) delete mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py deleted file mode 100644 index 8ca40bd1e2a..00000000000 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_dask.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dgl -import torch -import time -from distributed import Client, Event as Dask_Event -import tempfile -from cugraph.dask.comms import comms as Comms - - -def enable_spilling(): - import cudf - - cudf.set_option("spill", True) - - -def setup_cluster(dask_worker_devices): - dask_worker_devices_str = ",".join([str(i) for i in dask_worker_devices]) - from dask_cuda import LocalCUDACluster - - cluster = LocalCUDACluster( - protocol="tcp", - CUDA_VISIBLE_DEVICES=dask_worker_devices_str, - rmm_pool_size="25GB", - ) - - client = Client(cluster) - client.wait_for_workers(n_workers=len(dask_worker_devices)) - client.run(enable_spilling) - print("Dask Cluster Setup Complete") - del client - return cluster - - -def create_dask_client(scheduler_address): - from cugraph.dask.comms import comms as Comms - - client = Client(scheduler_address) - Comms.initialize(p2p=True) - return client - - -def initalize_pytorch_worker(dev_id): - import cupy as cp - import rmm - from rmm.allocators.torch import rmm_torch_allocator - from rmm.allocators.cupy import rmm_cupy_allocator - - dev = cp.cuda.Device( - dev_id - ) # Create cuda context on the right gpu, defaults to gpu-0 - dev.use() - rmm.reinitialize( - pool_allocator=True, - initial_pool_size=10e9, - maximum_pool_size=15e9, - devices=[dev_id], - ) - - if dev_id == 0: - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - torch.cuda.set_device(dev_id) - cp.cuda.set_allocator(rmm_cupy_allocator) - enable_spilling() - print("device_id", dev_id, flush=True) - - -def load_dgl_dataset(dataset_name="ogbn-products"): - from ogb.nodeproppred import DglNodePropPredDataset - - dataset = DglNodePropPredDataset(name=dataset_name) - split_idx = dataset.get_idx_split() - train_idx, valid_idx, test_idx = ( - split_idx["train"], - split_idx["valid"], - split_idx["test"], - ) - g, label = dataset[0] - g.ndata["label"] = label - if len(g.etypes) <= 1: - g = dgl.add_self_loop(g) - else: - for etype in g.etypes: - if etype[0] == etype[2]: - # only add self loops for src->dst - g = dgl.add_self_loop(g, etype=etype) - - g = g.int() - train_idx = train_idx.int() - valid_idx = valid_idx.int() - test_idx = test_idx.int() - return g, train_idx, valid_idx, test_idx, dataset.num_classes - - -def create_cugraph_graphstore_from_dgl_dataset( - dataset_name="ogbn-products", single_gpu=False -): - from cugraph_dgl import cugraph_storage_from_heterograph - - dgl_g, train_idx, valid_idx, test_idx, num_classes = load_dgl_dataset(dataset_name) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - return cugraph_gs, train_idx, valid_idx, test_idx, num_classes - - -def create_dataloader(gs, train_idx, device): - import cugraph_dgl - - temp_dir = tempfile.TemporaryDirectory() - sampler = cugraph_dgl.dataloading.NeighborSampler([10, 20]) - dataloader = cugraph_dgl.dataloading.DataLoader( - gs, - train_idx, - sampler, - sampling_output_dir=temp_dir.name, - batches_per_partition=10, - device=device, # Put the sampled MFGs on CPU or GPU - use_ddp=True, # Make it work with distributed data parallel - batch_size=1024, - shuffle=False, # Whether to 
shuffle the nodes for every epoch - drop_last=False, - num_workers=0, - ) - return dataloader - - -def run_workflow(rank, devices, scheduler_address): - from model import Sage, train_model - - # Below sets gpu_number - dev_id = devices[rank] - initalize_pytorch_worker(dev_id) - device = torch.device(f"cuda:{dev_id}") - # cugraph dask client initialization - client = create_dask_client(scheduler_address) - - # Pytorch training worker initialization - dist_init_method = "tcp://{master_ip}:{master_port}".format( - master_ip="127.0.0.1", master_port="12346" - ) - - torch.distributed.init_process_group( - backend="nccl", - init_method=dist_init_method, - world_size=len(devices), - rank=rank, - ) - - print(f"rank {rank}.", flush=True) - print("Initalized across GPUs.") - - event = Dask_Event("cugraph_gs_creation_event") - if rank == 0: - ( - gs, - train_idx, - valid_idx, - test_idx, - num_classes, - ) = create_cugraph_graphstore_from_dgl_dataset( - "ogbn-products", single_gpu=False - ) - client.publish_dataset(cugraph_gs=gs) - client.publish_dataset(train_idx=train_idx) - client.publish_dataset(valid_idx=valid_idx) - client.publish_dataset(test_idx=test_idx) - client.publish_dataset(num_classes=num_classes) - event.set() - else: - if event.wait(timeout=1000): - gs = client.get_dataset("cugraph_gs") - train_idx = client.get_dataset("train_idx") - valid_idx = client.get_dataset("valid_idx") - test_idx = client.get_dataset("test_idx") - num_classes = client.get_dataset("num_classes") - else: - raise RuntimeError(f"Fetch cugraph_gs to worker_id {rank} failed") - - torch.distributed.barrier() - print(f"Loading cugraph_store to worker {rank} is complete", flush=True) - dataloader = create_dataloader(gs, train_idx, device) - print("Data Loading Complete", flush=True) - num_feats = gs.ndata["feat"]["_N"].shape[1] - hid_size = 256 - # Load Training example - model = Sage(num_feats, hid_size, num_classes).to(device) - model = torch.nn.parallel.DistributedDataParallel( - model, - device_ids=[device], - output_device=device, - ) - torch.distributed.barrier() - n_epochs = 10 - total_st = time.time() - opt = torch.optim.Adam(model.parameters(), lr=0.01) - - gs.ndata["feat"] = gs.ndata["feat"].to("cuda") - gs.ndata["label"] = gs.ndata["label"].to("cuda") - - train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx) - torch.distributed.barrier() - total_et = time.time() - print( - f"Total time taken on n_epochs {n_epochs} = {total_et-total_st} s", - f"measured by worker = {rank}", - ) - - # cleanup dask cluster - if rank == 0: - client.unpublish_dataset("cugraph_gs") - client.unpublish_dataset("train_idx") - client.unpublish_dataset("valid_idx") - client.unpublish_dataset("test_idx") - event.clear() - print("Workflow completed") - print("---" * 10) - Comms.destroy() - - -if __name__ == "__main__": - # Load dummy first - # because new environments - # require dataset download - load_dgl_dataset() - dask_worker_devices = [5, 6] - cluster = setup_cluster(dask_worker_devices) - - trainer_devices = [0, 1, 2] - import torch.multiprocessing as mp - - mp.spawn( - run_workflow, - args=(trainer_devices, cluster.scheduler_address), - nprocs=len(trainer_devices), - ) - Comms.destroy() - cluster.close() From 05e1da42f8c8a4f90aea4fa83e0177f951973685 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 2 Aug 2024 13:46:40 -0700 Subject: [PATCH 45/47] use float64 --- python/cugraph-dgl/examples/multi_trainer_MG_example/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py index 1becd9682bb..3293f731336 100644 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py @@ -118,7 +118,7 @@ def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid): for epoch in range(num_epochs): total_loss = 0 for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - x = g.ndata["feat"][input_nodes] + x = g.ndata["feat"][input_nodes].to(torch.float64) y = g.ndata["label"][output_nodes] y_hat = model(blocks, x) y = y.squeeze(1) From 5b46f436ba69b544aaa501b7f8d7657cb28fbcd1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 2 Aug 2024 14:02:15 -0700 Subject: [PATCH 46/47] set dtype --- python/cugraph-dgl/examples/multi_trainer_MG_example/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py index 3293f731336..d3aad2ab309 100644 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py @@ -118,8 +118,8 @@ def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid): for epoch in range(num_epochs): total_loss = 0 for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - x = g.ndata["feat"][input_nodes].to(torch.float64) - y = g.ndata["label"][output_nodes] + x = g.ndata["feat"][input_nodes].to(torch.float32) + y = g.ndata["label"][output_nodes].to(torch.int64) y_hat = model(blocks, x) y = y.squeeze(1) loss = F.cross_entropy(y_hat, y) From 139b3d64a5656c5f6f65bd102505c662ddeb39f3 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 5 Aug 2024 14:42:43 -0700 Subject: [PATCH 47/47] allow setting directories --- .../examples/graphsage/node-classification.py | 14 ++-- .../multi_trainer_MG_example/workflow_snmg.py | 64 +++++++++++-------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py index 5b4f8863e5d..2b8b687efab 100644 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ b/python/cugraph-dgl/examples/graphsage/node-classification.py @@ -17,6 +17,7 @@ # Ignore Warning import warnings +import tempfile import time import cugraph_dgl import cugraph_dgl.dataloading @@ -154,7 +155,7 @@ def layerwise_infer(device, graph, nid, model, batch_size): return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) -def train(args, device, g, dataset, model): +def train(args, device, g, dataset, model, directory): # create sampler & dataloader train_idx = dataset.train_idx.to(device) val_idx = dataset.val_idx.to(device) @@ -163,7 +164,7 @@ def train(args, device, g, dataset, model): batch_size = 1024 fanouts = [5, 10, 15] if isinstance(g, cugraph_dgl.Graph): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts, directory=directory) loader_cls = cugraph_dgl.dataloading.FutureDataLoader else: sampler = NeighborSampler(fanouts) @@ -235,6 +236,8 @@ def train(args, device, g, dataset, model): " 'gpu_dgl' for pure-GPU training, " " 'gpu_cugraph_dgl' for pure-GPU training.", ) + parser.add_argument("--dataset_root", type=str, default="dataset") + 
parser.add_argument("--tempdir_root", type=str, default=None) args = parser.parse_args() if not torch.cuda.is_available(): args.mode = "cpu" @@ -244,7 +247,9 @@ def train(args, device, g, dataset, model): # load and preprocess dataset print("Loading data") - dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products")) + dataset = AsNodePredDataset( + DglNodePropPredDataset("ogbn-products", root=args.dataset_root) + ) g = dataset[0] g = dgl.add_self_loop(g) if args.mode == "gpu_cugraph_dgl": @@ -267,7 +272,8 @@ def train(args, device, g, dataset, model): # model training print("Training...") - train(args, device, g, dataset, model) + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: + train(args, device, g, dataset, model, directory) # test the model print("Testing...") diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py index f043940486b..da5c2b4d64e 100644 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -53,6 +53,7 @@ def initalize_pytorch_worker(dev_id): ) from cugraph.testing.mg_utils import enable_spilling + enable_spilling() torch.cuda.set_device(dev_id) @@ -60,10 +61,13 @@ def initalize_pytorch_worker(dev_id): print("device_id", dev_id, flush=True) -def load_dgl_dataset(dataset_name="ogbn-products"): +def load_dgl_dataset( + dataset_name="ogbn-products", + dataset_root=None, +): from ogb.nodeproppred import DglNodePropPredDataset - dataset = DglNodePropPredDataset(name=dataset_name) + dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root) split_idx = dataset.get_idx_split() train_idx, valid_idx, test_idx = ( split_idx["train"], @@ -87,25 +91,19 @@ def load_dgl_dataset(dataset_name="ogbn-products"): return g, train_idx, valid_idx, test_idx, dataset.num_classes -def create_cugraph_graphstore_from_dgl_dataset( - dataset, rank, world_size -): +def create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size): (g, train_idx, valid_idx, test_idx, num_classes) = dataset # Partition the data - cg = cugraph_dgl.Graph(is_multi_gpu=True, ndata_storage='wholegraph', edata_storage='wholegraph') - + cg = cugraph_dgl.Graph( + is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" + ) + nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank] - ndata = { - k: g.ndata[k][nix].cuda() - for k in g.ndata.keys() - } + ndata = {k: g.ndata[k][nix].cuda() for k in g.ndata.keys()} eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank] - src, dst = g.all_edges(form='uv', order='eid') - edata = { - k: g.edata[k][eix].cuda() - for k in g.edata.keys() - } + src, dst = g.all_edges(form="uv", order="eid") + edata = {k: g.edata[k][eix].cuda() for k in g.edata.keys()} cg.add_nodes(g.num_nodes(), data=ndata) cg.add_edges( @@ -114,16 +112,26 @@ def create_cugraph_graphstore_from_dgl_dataset( data=edata, ) - return (cg, torch.tensor_split(train_idx, world_size)[rank].to(torch.int64), torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64), torch.tensor_split(test_idx, world_size)[rank].to(torch.int64), num_classes) + return ( + 
cg, + torch.tensor_split(train_idx, world_size)[rank].to(torch.int64), + torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64), + torch.tensor_split(test_idx, world_size)[rank].to(torch.int64), + num_classes, + ) def create_dataloader(gs, train_idx, device, temp_dir, stage): import cugraph_dgl - temp_path = os.path.join(temp_dir, f'{stage}_{device}') + temp_path = os.path.join(temp_dir, f"{stage}_{device}") os.mkdir(temp_path) - sampler = cugraph_dgl.dataloading.NeighborSampler([10, 20], directory=temp_path, batches_per_partition=10,) + sampler = cugraph_dgl.dataloading.NeighborSampler( + [10, 20], + directory=temp_path, + batches_per_partition=10, + ) dataloader = cugraph_dgl.dataloading.FutureDataLoader( gs, train_idx, @@ -171,14 +179,16 @@ def run_workflow(rank, world_size, cugraph_id, dataset, temp_dir): test_idx, num_classes, ) = create_cugraph_graphstore_from_dgl_dataset( - dataset, rank, world_size, + dataset, + rank, + world_size, ) del dataset torch.distributed.barrier() print(f"Loading graph to worker {rank} is complete", flush=True) - dataloader = create_dataloader(gs, train_idx, device, temp_dir, 'train') + dataloader = create_dataloader(gs, train_idx, device, temp_dir, "train") print("Dataloader Creation Complete", flush=True) num_feats = gs.ndata["feat"].shape[1] hid_size = 256 @@ -201,26 +211,30 @@ def run_workflow(rank, world_size, cugraph_id, dataset, temp_dir): f"measured by worker = {rank}", ) + torch.cuda.synchronize() wm_finalize() cugraph_comms_shutdown() if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--dataset', type=str, default='ogbn-products') + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--tempdir_root", type=str, default=None) + parser.add_argument("--dataset", type=str, default="ogbn-products") args = parser.parse_args() from rmm.allocators.torch import rmm_torch_allocator + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) # Create the uid needed for cuGraph comms cugraph_id = cugraph_comms_create_unique_id() - ds = load_dgl_dataset(args.dataset) + ds = load_dgl_dataset(args.dataset, args.dataset_root) world_size = torch.cuda.device_count() - with tempfile.TemporaryDirectory() as directory: + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: torch.multiprocessing.spawn( run_workflow, args=(world_size, cugraph_id, ds, directory),
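            nprocs=world_size,
        )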