Update Graph to store a Pylibcugraph Graph (SG/MG Graph) (#2394)

Closes #2373 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Brad Rees (https://github.com/BradReesWork) - Rick Ratzel (https://github.com/rlratzel) URL: #2394
rapidsai · Jul 22, 2022 · 5d0744c · 5d0744c
1 parent e9e48c8
commit 5d0744c
Show file tree

Hide file tree

Showing 12 changed files with 460 additions and 274 deletions.
diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py
@@ -13,63 +13,19 @@
 # limitations under the License.
 
 import numpy
-from dask.distributed import wait, default_client
+from dask.distributed import wait
 
 import dask_cudf
 import cudf
 
-from pylibcugraph import (ResourceHandle,
-                          GraphProperties,
-                          MGGraph
-                          )
+from pylibcugraph import ResourceHandle
 
 from pylibcugraph import \
     uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample
 
-from cugraph.dask.common.input_utils import get_distributed_data
 from cugraph.dask.comms import comms as Comms
 
 
-def call_nbr_sampling(sID,
-                      data,
-                      src_col_name,
-                      dst_col_name,
-                      num_edges,
-                      do_expensive_check,
-                      start_list,
-                      h_fan_out,
-                      with_replacement):
-
-    # Preparation for graph creation
-    handle = Comms.get_handle(sID)
-    handle = ResourceHandle(handle.getHandle())
-    graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False)
-    srcs = data[0][src_col_name]
-    dsts = data[0][dst_col_name]
-    weights = None
-    if "value" in data[0].columns:
-        weights = data[0]['value']
-
-    store_transposed = False
-
-    mg = MGGraph(handle,
-                 graph_properties,
-                 srcs,
-                 dsts,
-                 weights,
-                 store_transposed,
-                 num_edges,
-                 do_expensive_check)
-
-    ret_val = pylibcugraph_uniform_neighbor_sample(handle,
-                                                   mg,
-                                                   start_list,
-                                                   h_fan_out,
-                                                   with_replacement,
-                                                   do_expensive_check)
-    return ret_val
-
-
 def convert_to_cudf(cp_arrays, weight_t):
     """
     Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
@@ -89,6 +45,24 @@ def convert_to_cudf(cp_arrays, weight_t):
     return df
 
 
+def _call_plc_uniform_neighbor_sample(sID,
+                                      mg_graph_x,
+                                      st_x,
+                                      fanout_vals,
+                                      with_replacement):
+    return pylibcugraph_uniform_neighbor_sample(
+        resource_handle=ResourceHandle(
+            Comms.get_handle(sID).getHandle()
+        ),
+        input_graph=mg_graph_x,
+        start_list=st_x.to_cupy(),
+        h_fan_out=fanout_vals,
+        with_replacement=with_replacement,
+        # FIXME: should we add this parameter as an option?
+        do_expensive_check=True
+    )
+
+
 def uniform_neighbor_sample(input_graph,
                             start_list,
                             fanout_vals,
@@ -97,6 +71,9 @@ def uniform_neighbor_sample(input_graph,
     Does neighborhood sampling, which samples nodes from a graph based on the
     current node's neighbors, with a corresponding fanout value at each hop.
 
+    Note: This is a pylibcugraph-enabled algorithm, which requires that the
+    graph was created with legacy_renum_only=True.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -127,23 +104,16 @@ def uniform_neighbor_sample(input_graph,
             Contains the indices from the sampling result for path
             reconstruction
     """
-    # Initialize dask client
-    client = default_client()
-    # FIXME: 'legacy_renum_only' will not trigger the C++ renumbering
-    # In the future, once all the algos follow the C/Pylibcugraph path,
-    # compute_renumber_edge_list will only be used for multicolumn and
-    # string vertices since the renumbering will be done in pylibcugraph
-    input_graph.compute_renumber_edge_list(
-        transposed=False, legacy_renum_only=True)
 
     if isinstance(start_list, int):
         start_list = [start_list]
 
     if isinstance(start_list, list):
-        start_list = cudf.Series(start_list)
-        if start_list.dtype != "int32":
-            raise ValueError(f"'start_list' must have int32 values, "
-                             f"got: {start_list.dtype}")
+        start_list = cudf.Series(start_list, dtype='int32')
+
+    if start_list.dtype != "int32":
+        raise ValueError(f"'start_list' must have int32 values, "
+                         f"got: {start_list.dtype}")
 
     # fanout_vals must be a host array!
     # FIXME: ensure other sequence types (eg. cudf Series) can be handled.
@@ -153,39 +123,38 @@ def uniform_neighbor_sample(input_graph,
         raise TypeError("fanout_vals must be a list, "
                         f"got: {type(fanout_vals)}")
 
-    ddf = input_graph.edgelist.edgelist_df
-    src_col_name = input_graph.renumber_map.renumbered_src_col_name
-    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
-
-    weight_t = ddf["value"].dtype
-    if weight_t == "int32":
-        ddf = ddf.astype({"value": "float32"})
-    elif weight_t == "int64":
-        ddf = ddf.astype({"value": "float64"})
-
-    num_edges = len(ddf)
-    data = get_distributed_data(ddf)
+    weight_t = input_graph.edgelist.edgelist_df["value"].dtype
 
     # start_list uses "external" vertex IDs, but if the graph has been
     # renumbered, the start vertex IDs must also be renumbered.
     if input_graph.renumbered:
         start_list = input_graph.lookup_internal_vertex_id(
             start_list).compute()
-    # FIXME: should we add this parameter as an option?
-    do_expensive_check = False
-
-    result = [client.submit(call_nbr_sampling,
-                            Comms.get_session_id(),
-                            wf[1],
-                            src_col_name,
-                            dst_col_name,
-                            num_edges,
-                            do_expensive_check,
-                            start_list,
-                            fanout_vals,
-                            with_replacement,
-                            workers=[wf[0]])
-              for idx, wf in enumerate(data.worker_to_parts.items())]
+
+    '''
+    FIXME update the API to scatter the start list as shown below.
+    start_list = dask_cudf.from_cudf(
+        start_list,
+        npartitions=input_graph._npartitions
+    )
+    start_list = get_distributed_data(start_list)
+    wait(start_list)
+    '''
+
+    client = input_graph._client
+
+    result = [
+        client.submit(
+            _call_plc_uniform_neighbor_sample,
+            Comms.get_session_id(),
+            input_graph._plc_graph[w],
+            start_list,
+            fanout_vals,
+            with_replacement,
+            workers=[w],
+        )
+        for w in Comms.get_workers()
+    ]
 
     wait(result)
 
@@ -196,6 +165,7 @@ def uniform_neighbor_sample(input_graph,
     wait(cudf_result)
 
     ddf = dask_cudf.from_delayed(cudf_result)
+
     if input_graph.renumbered:
         ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True)
         ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True)

diff --git a/python/cugraph/cugraph/dask/traversal/bfs.py b/python/cugraph/cugraph/dask/traversal/bfs.py
@@ -13,65 +13,17 @@
 # limitations under the License.
 #
 
-from pylibcugraph import (MGGraph,
-                          ResourceHandle,
-                          GraphProperties,
+from pylibcugraph import (ResourceHandle,
                           bfs as pylibcugraph_bfs
                           )
 
-from dask.distributed import wait, default_client
+from dask.distributed import wait
 from cugraph.dask.common.input_utils import get_distributed_data
 import cugraph.dask.comms.comms as Comms
 import cudf
 import dask_cudf
 
 
-def _call_plc_mg_bfs(
-                    sID,
-                    data,
-                    sources,
-                    depth_limit,
-                    src_col_name,
-                    dst_col_name,
-                    graph_properties,
-                    num_edges,
-                    direction_optimizing=False,
-                    do_expensive_check=False,
-                    return_predecessors=True):
-    comms_handle = Comms.get_handle(sID)
-    resource_handle = ResourceHandle(comms_handle.getHandle())
-    sources = sources[0]
-    srcs = cudf.Series(data[0][src_col_name], dtype='int32')
-    dsts = cudf.Series(data[0][dst_col_name], dtype='int32')
-    weights = cudf.Series(data[0]['value'], dtype='float32') \
-        if 'value' in data[0].columns \
-        else cudf.Series((srcs + 1) / (srcs + 1), dtype='float32')
-
-    mg = MGGraph(
-        resource_handle=resource_handle,
-        graph_properties=graph_properties,
-        src_array=srcs,
-        dst_array=dsts,
-        weight_array=weights,
-        store_transposed=False,
-        num_edges=num_edges,
-        do_expensive_check=do_expensive_check
-    )
-
-    res = \
-        pylibcugraph_bfs(
-            resource_handle,
-            mg,
-            cudf.Series(sources, dtype='int32'),
-            direction_optimizing,
-            depth_limit if depth_limit is not None else 0,
-            return_predecessors,
-            True
-        )
-
-    return res
-
-
 def convert_to_cudf(cp_arrays):
     """
     create a cudf DataFrame from cupy arrays
@@ -84,17 +36,36 @@ def convert_to_cudf(cp_arrays):
     return df
 
 
+def _call_plc_bfs(sID,
+                  mg_graph_x,
+                  st_x,
+                  depth_limit=None,
+                  return_distances=True):
+    return pylibcugraph_bfs(
+        ResourceHandle(Comms.get_handle(sID).getHandle()),
+        mg_graph_x,
+        cudf.Series(st_x, dtype='int32'),
+        False,
+        depth_limit if depth_limit is not None else 0,
+        return_distances,
+        True
+    )
+
+
 def bfs(input_graph,
         start,
         depth_limit=None,
         return_distances=True,
         check_start=True):
     """
-    Find the distances and predecessors for a breadth first traversal of a
+    Find the distances and predecessors for a breadth-first traversal of a
     graph.
-    The input graph must contain edge list as  dask-cudf dataframe with
+    The input graph must contain edge list as a dask-cudf dataframe with
     one partition per GPU.
 
+    Note: This is a pylibcugraph-enabled algorithm, which requires that the
+    graph was created with legacy_renum_only=True.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -147,27 +118,14 @@ def bfs(input_graph,
 
     """
 
-    client = default_client()
-
-    input_graph.compute_renumber_edge_list(
-        transposed=False, legacy_renum_only=True)
-    ddf = input_graph.edgelist.edgelist_df
-
-    graph_properties = GraphProperties(
-        is_multigraph=False)
-
-    num_edges = len(ddf)
-    data = get_distributed_data(ddf)
-
-    src_col_name = input_graph.renumber_map.renumbered_src_col_name
-    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
+    client = input_graph._client
 
     if not isinstance(start, (dask_cudf.DataFrame, dask_cudf.Series)):
         if not isinstance(start, (cudf.DataFrame, cudf.Series)):
             start = cudf.Series(start)
         if isinstance(start, (cudf.DataFrame, cudf.Series)):
             # convert into a dask_cudf
-            start = dask_cudf.from_cudf(start, ddf.npartitions)
+            start = dask_cudf.from_cudf(start, input_graph._npartitions)
 
     def check_valid_vertex(G, start):
         is_valid_vertex = G.has_node(start)
@@ -190,23 +148,18 @@ def check_valid_vertex(G, start):
 
     data_start = get_distributed_data(start)
 
-    cupy_result = [client.submit(
-              _call_plc_mg_bfs,
-              Comms.get_session_id(),
-              wf[1],
-              wf_start[1],
-              depth_limit,
-              src_col_name,
-              dst_col_name,
-              graph_properties,
-              num_edges,
-              False,
-              True,
-              return_distances,
-              workers=[wf[0]])
-              for idx, (wf, wf_start) in enumerate(
-                  zip(data.worker_to_parts.items(),
-                      data_start.worker_to_parts.items()))]
+    cupy_result = [
+        client.submit(
+            _call_plc_bfs,
+            Comms.get_session_id(),
+            input_graph._plc_graph[w],
+            st[0],
+            depth_limit,
+            return_distances,
+            workers=[w]
+        )
+        for w, st in data_start.worker_to_parts.items()
+    ]
 
     wait(cupy_result)
 

diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py
@@ -31,6 +31,9 @@ def node2vec(G,
     Computes random walks for each node in 'start_vertices', under the
     node2vec sampling framework.
 
+    Note: This is a pylibcugraph-enabled algorithm, which requires that the
+    graph was created with legacy_renum_only=True.
+
     References
     ----------