From 6b795336797817a096bd02cec99f473b314fee05 Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Mon, 25 Sep 2023 21:47:16 -0700
Subject: [PATCH 1/3] add support for weighted mg jaccard , update tests and
 remove experimental calls

---
 .../cugraph/dask/link_prediction/jaccard.py   | 205 +-----------------
 .../cugraph/dask/link_prediction/overlap.py   |  10 +-
 .../cugraph/dask/link_prediction/sorensen.py  |  10 +-
 .../tests/link_prediction/test_jaccard_mg.py  |  54 +----
 .../tests/link_prediction/test_overlap_mg.py  |  54 +----
 .../tests/link_prediction/test_sorensen_mg.py |  54 +----
 6 files changed, 43 insertions(+), 344 deletions(-)

diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py
index 218e6206fc3..32f679f57e0 100644
--- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py
+++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py
@@ -1,204 +1 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from dask.distributed import wait, default_client
-import cugraph.dask.comms.comms as Comms
-import dask_cudf
-import cudf
-from cugraph.dask.common.input_utils import get_distributed_data
-from cugraph.utilities import renumber_vertex_pair
-
-from pylibcugraph import (
-    jaccard_coefficients as pylibcugraph_jaccard_coefficients,
-)
-from pylibcugraph import ResourceHandle
-
-
-def convert_to_cudf(cp_arrays):
-    """
-    Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
-    """
-
-    cupy_first, cupy_second, cupy_similarity = cp_arrays
-
-    df = cudf.DataFrame()
-    df["first"] = cupy_first
-    df["second"] = cupy_second
-    df["jaccard_coeff"] = cupy_similarity
-
-    return df
-
-
-def _call_plc_jaccard(
-    sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name
-):
-
-    first = vertex_pair[vertex_pair_col_name[0]]
-    second = vertex_pair[vertex_pair_col_name[1]]
-
-    return pylibcugraph_jaccard_coefficients(
-        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
-        graph=mg_graph_x,
-        first=first,
-        second=second,
-        use_weight=use_weight,
-        do_expensive_check=do_expensive_check,
-    )
-
-
-def jaccard(input_graph, vertex_pair=None, use_weight=False):
-    """
-    Compute the Jaccard similarity between each pair of vertices connected by
-    an edge, or between arbitrary pairs of vertices specified by the user.
-    Jaccard similarity is defined between two sets as the ratio of the volume
-    of their intersection divided by the volume of their union. In the context
-    of graphs, the neighborhood of a vertex is seen as a set. The Jaccard
-    similarity weight of each edge represents the strength of connection
-    between vertices based on the relative similarity of their neighbors. If
-    first is specified but second is not, or vice versa, an exception will be
-    thrown.
-
-    NOTE: If the vertex_pair parameter is not specified then the behavior
-    of cugraph.jaccard is different from the behavior of
-    networkx.jaccard_coefficient.
-
-    cugraph.dask.jaccard, in the absence of a specified vertex pair list, will
-    compute the two_hop_neighbors of the entire graph to construct a vertex pair
-    list and will return the jaccard coefficient for those vertex pairs. This is
-    not advisable as the vertex_pairs can grow exponentially with respect to the
-    size of the datasets
-
-    networkx.jaccard_coefficient, in the absence of a specified vertex
-    pair list, will return an upper triangular dense matrix, excluding
-    the diagonal as well as vertex pairs that are directly connected
-    by an edge in the graph, of jaccard coefficients.  Technically, networkx
-    returns a lazy iterator across this upper triangular matrix where
-    the actual jaccard coefficient is computed when the iterator is
-    dereferenced.  Computing a dense matrix of results is not feasible
-    if the number of vertices in the graph is large (100,000 vertices
-    would result in 4.9 billion values in that iterator).
-
-    If your graph is small enough (or you have enough memory and patience)
-    you can get the interesting (non-zero) values that are part of the networkx
-    solution by doing the following:
-
-    But please remember that cugraph will fill the dataframe with the entire
-    solution you request, so you'll need enough memory to store the 2-hop
-    neighborhood dataframe.
-
-
-    Parameters
-    ----------
-    input_graph : cugraph.Graph
-        cuGraph Graph instance, should contain the connectivity information
-        as an edge list (edge weights are not supported yet for this algorithm). The
-        graph should be undirected where an undirected edge is represented by a
-        directed edge in both direction. The adjacency list will be computed if
-        not already present.
-
-        This implementation only supports undirected, unweighted Graph.
-
-    vertex_pair : cudf.DataFrame, optional (default=None)
-        A GPU dataframe consisting of two columns representing pairs of
-        vertices. If provided, the jaccard coefficient is computed for the
-        given vertex pairs.  If the vertex_pair is not provided then the
-        current implementation computes the jaccard coefficient for all
-        adjacent vertices in the graph.
-
-    use_weight : bool, optional (default=False)
-        Currently not supported
-
-    Returns
-    -------
-    result : dask_cudf.DataFrame
-        GPU distributed data frame containing 2 dask_cudf.Series
-
-        ddf['first']: dask_cudf.Series
-            The first vertex ID of each pair (will be identical to first if specified).
-        ddf['second']: dask_cudf.Series
-            The second vertex ID of each pair (will be identical to second if
-            specified).
-        ddf['jaccard_coeff']: dask_cudf.Series
-            The computed jaccard coefficient between the first and the second
-            vertex ID.
-    """
-
-    if input_graph.is_directed():
-        raise ValueError("input graph must be undirected")
-
-    if vertex_pair is None:
-        # Call two_hop neighbor of the entire graph
-        vertex_pair = input_graph.get_two_hop_neighbors()
-
-    vertex_pair_col_name = vertex_pair.columns
-
-    if use_weight:
-        raise ValueError("'use_weight' is currently not supported.")
-
-    if input_graph.is_weighted():
-        raise ValueError("Weighted graphs are currently not supported.")
-
-    if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
-        vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
-
-    elif vertex_pair is not None:
-        raise ValueError("vertex_pair must be a dask_cudf or cudf dataframe")
-
-    if not isinstance(vertex_pair, (dask_cudf.DataFrame)):
-        vertex_pair = dask_cudf.from_cudf(
-            vertex_pair, npartitions=len(Comms.get_workers())
-        )
-    vertex_pair = get_distributed_data(vertex_pair)
-    wait(vertex_pair)
-    vertex_pair = vertex_pair.worker_to_parts
-
-    # Initialize dask client
-    client = default_client()
-
-    do_expensive_check = False
-
-    if vertex_pair is not None:
-        result = [
-            client.submit(
-                _call_plc_jaccard,
-                Comms.get_session_id(),
-                input_graph._plc_graph[w],
-                vertex_pair[w][0],
-                use_weight,
-                do_expensive_check,
-                vertex_pair_col_name,
-                workers=[w],
-                allow_other_workers=False,
-            )
-            for w in Comms.get_workers()
-        ]
-
-    wait(result)
-
-    cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result]
-
-    wait(cudf_result)
-
-    ddf = dask_cudf.from_delayed(cudf_result).persist()
-    wait(ddf)
-
-    # Wait until the inactive futures are released
-    wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])
-
-    if input_graph.renumbered:
-        ddf = input_graph.unrenumber(ddf, "first")
-        ddf = input_graph.unrenumber(ddf, "second")
-
-    return ddf
+sore
\ No newline at end of file
diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py
index 5540be28fd1..4bda05e3c95 100644
--- a/python/cugraph/cugraph/dask/link_prediction/overlap.py
+++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py
@@ -96,7 +96,9 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
         adjacent vertices in the graph.
 
     use_weight : bool, optional (default=False)
-        Currently not supported
+        Flag to indicate whether to compute weighted overlap (if use_weight==True)
+        or un-weighted overlap (if use_weight==False).
+        'input_graph' must be weighted if 'use_weight=True'.
 
     Returns
     -------
@@ -122,12 +124,6 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
 
     vertex_pair_col_name = vertex_pair.columns
 
-    if use_weight:
-        raise ValueError("'use_weight' is currently not supported.")
-
-    if input_graph.is_weighted():
-        raise ValueError("Weighted graphs are currently not supported.")
-
     if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
 
diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py
index 24295ac330c..163b0d0dc16 100644
--- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py
+++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py
@@ -92,7 +92,9 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False):
         adjacent vertices in the graph.
 
     use_weight : bool, optional (default=False)
-        Currently not supported
+        Flag to indicate whether to compute weighted sorensen (if use_weight==True)
+        or un-weighted sorensen (if use_weight==False).
+        'input_graph' must be weighted if 'use_weight=True'.
 
     Returns
     -------
@@ -118,12 +120,6 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False):
 
     vertex_pair_col_name = vertex_pair.columns
 
-    if use_weight:
-        raise ValueError("'use_weight' is currently not supported.")
-
-    if input_graph.is_weighted():
-        raise ValueError("Weighted graphs are currently not supported.")
-
     if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
 
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
index b56a6baae2b..3202bf0a065 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
@@ -34,6 +34,7 @@ def setup_function():
 
 IS_DIRECTED = [False]
 HAS_VERTEX_PAIR = [True, False]
+IS_WEIGHTED = [True, False]
 
 
 # =============================================================================
@@ -48,6 +49,7 @@ def setup_function():
     (datasets, "graph_file"),
     (IS_DIRECTED, "directed"),
     (HAS_VERTEX_PAIR, "has_vertex_pair"),
+    (IS_WEIGHTED, "is_weighted"),
 )
 
 
@@ -57,7 +59,8 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param))
+    parameters = dict(zip(
+        ("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
 
     return parameters
 
@@ -72,7 +75,9 @@ def input_expected_output(input_combo):
     input_data_path = input_combo["graph_file"]
     directed = input_combo["directed"]
     has_vertex_pair = input_combo["has_vertex_pair"]
-    G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
+    is_weighted = input_combo["is_weighted"]
+    G = utils.generate_cugraph_graph_from_file(
+        input_data_path, directed=directed, edgevals=is_weighted)
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -84,7 +89,7 @@ def input_expected_output(input_combo):
         vertex_pair = None
 
     input_combo["vertex_pair"] = vertex_pair
-    sg_cugraph_jaccard = cugraph.experimental.jaccard(G, input_combo["vertex_pair"])
+    sg_cugraph_jaccard = cugraph.jaccard(G, input_combo["vertex_pair"], use_weight=is_weighted)
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -104,6 +109,7 @@ def input_expected_output(input_combo):
         ddf,
         source="src",
         destination="dst",
+        edge_attr="value" if is_weighted else None,
         renumber=True,
         store_transposed=True,
     )
@@ -122,8 +128,10 @@ def input_expected_output(input_combo):
 def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output):
 
     dg = input_expected_output["MGGraph"]
+    use_weight = input_expected_output["is_weighted"]
 
-    result_jaccard = benchmark(dcg.jaccard, dg, input_expected_output["vertex_pair"])
+    result_jaccard = benchmark(
+        dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
 
     result_jaccard = (
         result_jaccard.compute()
@@ -151,41 +159,3 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output):
 
     assert len(jaccard_coeff_diffs1) == 0
     assert len(jaccard_coeff_diffs2) == 0
-
-
-@pytest.mark.mg
-def test_dask_mg_weighted_jaccard(dask_client):
-    input_data_path = datasets[0]
-    chunksize = dcg.get_chunksize(input_data_path)
-    ddf = dask_cudf.read_csv(
-        input_data_path,
-        chunksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        renumber=True,
-        store_transposed=True,
-    )
-    with pytest.raises(ValueError):
-        dcg.jaccard(dg)
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        store_transposed=True,
-    )
-
-    use_weight = True
-    with pytest.raises(ValueError):
-        dcg.jaccard(dg, use_weight=use_weight)
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
index ce4bf619f47..5763b0f0e4f 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
@@ -34,6 +34,7 @@ def setup_function():
 
 IS_DIRECTED = [False]
 HAS_VERTEX_PAIR = [True, False]
+IS_WEIGHTED = [True, False]
 
 
 # =============================================================================
@@ -48,6 +49,7 @@ def setup_function():
     (datasets, "graph_file"),
     (IS_DIRECTED, "directed"),
     (HAS_VERTEX_PAIR, "has_vertex_pair"),
+    (IS_WEIGHTED, "is_weighted"),
 )
 
 
@@ -57,7 +59,7 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param))
+    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
 
     return parameters
 
@@ -72,7 +74,9 @@ def input_expected_output(input_combo):
     input_data_path = input_combo["graph_file"]
     directed = input_combo["directed"]
     has_vertex_pair = input_combo["has_vertex_pair"]
-    G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
+    is_weighted = input_combo["is_weighted"]
+    G = utils.generate_cugraph_graph_from_file(
+        input_data_path, directed=directed, edgevals=is_weighted)
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -84,7 +88,8 @@ def input_expected_output(input_combo):
         vertex_pair = None
 
     input_combo["vertex_pair"] = vertex_pair
-    sg_cugraph_overlap = cugraph.experimental.overlap(G, input_combo["vertex_pair"])
+    sg_cugraph_overlap = cugraph.overlap(
+        G, input_combo["vertex_pair"], use_weight=is_weighted)
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -104,6 +109,7 @@ def input_expected_output(input_combo):
         ddf,
         source="src",
         destination="dst",
+        edge_attr="value" if is_weighted else None,
         renumber=True,
         store_transposed=True,
     )
@@ -125,8 +131,10 @@ def input_expected_output(input_combo):
 def test_dask_mg_overlap(dask_client, benchmark, input_expected_output):
 
     dg = input_expected_output["MGGraph"]
+    use_weight = input_expected_output["is_weighted"]
 
-    result_overlap = benchmark(dcg.overlap, dg, input_expected_output["vertex_pair"])
+    result_overlap = benchmark(
+        dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
 
     result_overlap = (
         result_overlap.compute()
@@ -154,41 +162,3 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output):
 
     assert len(overlap_coeff_diffs1) == 0
     assert len(overlap_coeff_diffs2) == 0
-
-
-@pytest.mark.mg
-def test_dask_mg_weighted_overlap():
-    input_data_path = datasets[0]
-    chunksize = dcg.get_chunksize(input_data_path)
-    ddf = dask_cudf.read_csv(
-        input_data_path,
-        chunksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        renumber=True,
-        store_transposed=True,
-    )
-    with pytest.raises(ValueError):
-        dcg.overlap(dg)
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        store_transposed=True,
-    )
-
-    use_weight = True
-    with pytest.raises(ValueError):
-        dcg.overlap(dg, use_weight=use_weight)
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
index af6b60771a0..bc05929d50f 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
@@ -35,6 +35,7 @@ def setup_function():
 
 IS_DIRECTED = [False]
 HAS_VERTEX_PAIR = [True, False]
+IS_WEIGHTED = [True, False]
 
 
 # =============================================================================
@@ -49,6 +50,7 @@ def setup_function():
     (datasets, "graph_file"),
     (IS_DIRECTED, "directed"),
     (HAS_VERTEX_PAIR, "has_vertex_pair"),
+    (IS_WEIGHTED, "is_weighted"),
 )
 
 
@@ -58,7 +60,7 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param))
+    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
 
     return parameters
 
@@ -73,7 +75,9 @@ def input_expected_output(input_combo):
     input_data_path = input_combo["graph_file"]
     directed = input_combo["directed"]
     has_vertex_pair = input_combo["has_vertex_pair"]
-    G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
+    is_weighted = input_combo["is_weighted"]
+    G = utils.generate_cugraph_graph_from_file(
+        input_data_path, directed=directed,edgevals=is_weighted)
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -85,7 +89,8 @@ def input_expected_output(input_combo):
         vertex_pair = None
 
     input_combo["vertex_pair"] = vertex_pair
-    sg_cugraph_sorensen = cugraph.experimental.sorensen(G, input_combo["vertex_pair"])
+    sg_cugraph_sorensen = cugraph.sorensen(
+        G, input_combo["vertex_pair"], use_weight=is_weighted)
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -105,6 +110,7 @@ def input_expected_output(input_combo):
         ddf,
         source="src",
         destination="dst",
+        edge_attr="value" if is_weighted else None,
         renumber=True,
         store_transposed=True,
     )
@@ -124,8 +130,10 @@ def input_expected_output(input_combo):
 def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output):
 
     dg = input_expected_output["MGGraph"]
+    use_weight = input_expected_output["is_weighted"]
 
-    result_sorensen = benchmark(dcg.sorensen, dg, input_expected_output["vertex_pair"])
+    result_sorensen = benchmark(
+        dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
 
     result_sorensen = (
         result_sorensen.compute()
@@ -153,41 +161,3 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output):
 
     assert len(sorensen_coeff_diffs1) == 0
     assert len(sorensen_coeff_diffs2) == 0
-
-
-@pytest.mark.mg
-def test_dask_mg_weighted_sorensen(dask_client):
-    input_data_path = datasets[0]
-    chunksize = dcg.get_chunksize(input_data_path)
-    ddf = dask_cudf.read_csv(
-        input_data_path,
-        chunksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        renumber=True,
-        store_transposed=True,
-    )
-    with pytest.raises(ValueError):
-        dcg.sorensen(dg)
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        edge_attr="value",
-        store_transposed=True,
-    )
-
-    use_weight = True
-    with pytest.raises(ValueError):
-        dcg.sorensen(dg, use_weight=use_weight)

From a1bf0a8c8b8e0afe4ee4bca209766daa468c110a Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Mon, 25 Sep 2023 21:56:28 -0700
Subject: [PATCH 2/3] fix typo

---
 .../cugraph/dask/link_prediction/jaccard.py   | 201 +++++++++++++++++-
 1 file changed, 200 insertions(+), 1 deletion(-)

diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py
index 32f679f57e0..5362c7a9e1e 100644
--- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py
+++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py
@@ -1 +1,200 @@
-sore
\ No newline at end of file
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dask.distributed import wait, default_client
+import cugraph.dask.comms.comms as Comms
+import dask_cudf
+import cudf
+from cugraph.dask.common.input_utils import get_distributed_data
+from cugraph.utilities import renumber_vertex_pair
+
+from pylibcugraph import (
+    jaccard_coefficients as pylibcugraph_jaccard_coefficients,
+)
+from pylibcugraph import ResourceHandle
+
+
+def convert_to_cudf(cp_arrays):
+    """
+    Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
+    """
+
+    cupy_first, cupy_second, cupy_similarity = cp_arrays
+
+    df = cudf.DataFrame()
+    df["first"] = cupy_first
+    df["second"] = cupy_second
+    df["jaccard_coeff"] = cupy_similarity
+
+    return df
+
+
+def _call_plc_jaccard(
+    sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name
+):
+
+    first = vertex_pair[vertex_pair_col_name[0]]
+    second = vertex_pair[vertex_pair_col_name[1]]
+
+    return pylibcugraph_jaccard_coefficients(
+        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+        graph=mg_graph_x,
+        first=first,
+        second=second,
+        use_weight=use_weight,
+        do_expensive_check=do_expensive_check,
+    )
+
+
+def jaccard(input_graph, vertex_pair=None, use_weight=False):
+    """
+    Compute the Jaccard similarity between each pair of vertices connected by
+    an edge, or between arbitrary pairs of vertices specified by the user.
+    Jaccard similarity is defined between two sets as the ratio of the volume
+    of their intersection divided by the volume of their union. In the context
+    of graphs, the neighborhood of a vertex is seen as a set. The Jaccard
+    similarity weight of each edge represents the strength of connection
+    between vertices based on the relative similarity of their neighbors. If
+    first is specified but second is not, or vice versa, an exception will be
+    thrown.
+
+    NOTE: If the vertex_pair parameter is not specified then the behavior
+    of cugraph.jaccard is different from the behavior of
+    networkx.jaccard_coefficient.
+
+    cugraph.dask.jaccard, in the absence of a specified vertex pair list, will
+    compute the two_hop_neighbors of the entire graph to construct a vertex pair
+    list and will return the jaccard coefficient for those vertex pairs. This is
+    not advisable as the vertex_pairs can grow exponentially with respect to the
+    size of the datasets
+
+    networkx.jaccard_coefficient, in the absence of a specified vertex
+    pair list, will return an upper triangular dense matrix, excluding
+    the diagonal as well as vertex pairs that are directly connected
+    by an edge in the graph, of jaccard coefficients.  Technically, networkx
+    returns a lazy iterator across this upper triangular matrix where
+    the actual jaccard coefficient is computed when the iterator is
+    dereferenced.  Computing a dense matrix of results is not feasible
+    if the number of vertices in the graph is large (100,000 vertices
+    would result in 4.9 billion values in that iterator).
+
+    If your graph is small enough (or you have enough memory and patience)
+    you can get the interesting (non-zero) values that are part of the networkx
+    solution by doing the following:
+
+    But please remember that cugraph will fill the dataframe with the entire
+    solution you request, so you'll need enough memory to store the 2-hop
+    neighborhood dataframe.
+
+
+    Parameters
+    ----------
+    input_graph : cugraph.Graph
+        cuGraph Graph instance, should contain the connectivity information
+        as an edge list (edge weights are not supported yet for this algorithm). The
+        graph should be undirected where an undirected edge is represented by a
+        directed edge in both direction. The adjacency list will be computed if
+        not already present.
+
+        This implementation only supports undirected, unweighted Graph.
+
+    vertex_pair : cudf.DataFrame, optional (default=None)
+        A GPU dataframe consisting of two columns representing pairs of
+        vertices. If provided, the jaccard coefficient is computed for the
+        given vertex pairs.  If the vertex_pair is not provided then the
+        current implementation computes the jaccard coefficient for all
+        adjacent vertices in the graph.
+
+    use_weight : bool, optional (default=False)
+        Flag to indicate whether to compute weighted jaccard (if use_weight==True)
+        or un-weighted jaccard (if use_weight==False).
+        'input_graph' must be weighted if 'use_weight=True'.
+
+    Returns
+    -------
+    result : dask_cudf.DataFrame
+        GPU distributed data frame containing 2 dask_cudf.Series
+
+        ddf['first']: dask_cudf.Series
+            The first vertex ID of each pair (will be identical to first if specified).
+        ddf['second']: dask_cudf.Series
+            The second vertex ID of each pair (will be identical to second if
+            specified).
+        ddf['jaccard_coeff']: dask_cudf.Series
+            The computed jaccard coefficient between the first and the second
+            vertex ID.
+    """
+
+    if input_graph.is_directed():
+        raise ValueError("input graph must be undirected")
+
+    if vertex_pair is None:
+        # Call two_hop neighbor of the entire graph
+        vertex_pair = input_graph.get_two_hop_neighbors()
+
+    vertex_pair_col_name = vertex_pair.columns
+
+    if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
+        vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
+
+    elif vertex_pair is not None:
+        raise ValueError("vertex_pair must be a dask_cudf or cudf dataframe")
+
+    if not isinstance(vertex_pair, (dask_cudf.DataFrame)):
+        vertex_pair = dask_cudf.from_cudf(
+            vertex_pair, npartitions=len(Comms.get_workers())
+        )
+    vertex_pair = get_distributed_data(vertex_pair)
+    wait(vertex_pair)
+    vertex_pair = vertex_pair.worker_to_parts
+
+    # Initialize dask client
+    client = default_client()
+
+    do_expensive_check = False
+
+    if vertex_pair is not None:
+        result = [
+            client.submit(
+                _call_plc_jaccard,
+                Comms.get_session_id(),
+                input_graph._plc_graph[w],
+                vertex_pair[w][0],
+                use_weight,
+                do_expensive_check,
+                vertex_pair_col_name,
+                workers=[w],
+                allow_other_workers=False,
+            )
+            for w in Comms.get_workers()
+        ]
+
+    wait(result)
+
+    cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result]
+
+    wait(cudf_result)
+
+    ddf = dask_cudf.from_delayed(cudf_result).persist()
+    wait(ddf)
+
+    # Wait until the inactive futures are released
+    wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])
+
+    if input_graph.renumbered:
+        ddf = input_graph.unrenumber(ddf, "first")
+        ddf = input_graph.unrenumber(ddf, "second")
+
+    return ddf

From 8bb0d3f0f13711d1d0523fd8f61e6a708b657410 Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Mon, 25 Sep 2023 22:02:52 -0700
Subject: [PATCH 3/3] fix style

---
 .../tests/link_prediction/test_jaccard_mg.py      | 15 ++++++++++-----
 .../tests/link_prediction/test_overlap_mg.py      | 13 +++++++++----
 .../tests/link_prediction/test_sorensen_mg.py     | 13 +++++++++----
 3 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
index 3202bf0a065..ee739c9f236 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
@@ -59,8 +59,9 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(
-        ("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
+    parameters = dict(
+        zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param)
+    )
 
     return parameters
 
@@ -77,7 +78,8 @@ def input_expected_output(input_combo):
     has_vertex_pair = input_combo["has_vertex_pair"]
     is_weighted = input_combo["is_weighted"]
     G = utils.generate_cugraph_graph_from_file(
-        input_data_path, directed=directed, edgevals=is_weighted)
+        input_data_path, directed=directed, edgevals=is_weighted
+    )
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -89,7 +91,9 @@ def input_expected_output(input_combo):
         vertex_pair = None
 
     input_combo["vertex_pair"] = vertex_pair
-    sg_cugraph_jaccard = cugraph.jaccard(G, input_combo["vertex_pair"], use_weight=is_weighted)
+    sg_cugraph_jaccard = cugraph.jaccard(
+        G, input_combo["vertex_pair"], use_weight=is_weighted
+    )
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -131,7 +135,8 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output):
     use_weight = input_expected_output["is_weighted"]
 
     result_jaccard = benchmark(
-        dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
+        dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight
+    )
 
     result_jaccard = (
         result_jaccard.compute()
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
index 5763b0f0e4f..87407d7b59c 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
@@ -59,7 +59,9 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
+    parameters = dict(
+        zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param)
+    )
 
     return parameters
 
@@ -76,7 +78,8 @@ def input_expected_output(input_combo):
     has_vertex_pair = input_combo["has_vertex_pair"]
     is_weighted = input_combo["is_weighted"]
     G = utils.generate_cugraph_graph_from_file(
-        input_data_path, directed=directed, edgevals=is_weighted)
+        input_data_path, directed=directed, edgevals=is_weighted
+    )
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -89,7 +92,8 @@ def input_expected_output(input_combo):
 
     input_combo["vertex_pair"] = vertex_pair
     sg_cugraph_overlap = cugraph.overlap(
-        G, input_combo["vertex_pair"], use_weight=is_weighted)
+        G, input_combo["vertex_pair"], use_weight=is_weighted
+    )
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -134,7 +138,8 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output):
     use_weight = input_expected_output["is_weighted"]
 
     result_overlap = benchmark(
-        dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
+        dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight
+    )
 
     result_overlap = (
         result_overlap.compute()
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
index bc05929d50f..66832d08427 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
@@ -60,7 +60,9 @@ def input_combo(request):
     Simply return the current combination of params as a dictionary for use in
     tests or other parameterized fixtures.
     """
-    parameters = dict(zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param))
+    parameters = dict(
+        zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param)
+    )
 
     return parameters
 
@@ -77,7 +79,8 @@ def input_expected_output(input_combo):
     has_vertex_pair = input_combo["has_vertex_pair"]
     is_weighted = input_combo["is_weighted"]
     G = utils.generate_cugraph_graph_from_file(
-        input_data_path, directed=directed,edgevals=is_weighted)
+        input_data_path, directed=directed, edgevals=is_weighted
+    )
     if has_vertex_pair:
         # Sample random vertices from the graph and compute the two_hop_neighbors
         # with those seeds
@@ -90,7 +93,8 @@ def input_expected_output(input_combo):
 
     input_combo["vertex_pair"] = vertex_pair
     sg_cugraph_sorensen = cugraph.sorensen(
-        G, input_combo["vertex_pair"], use_weight=is_weighted)
+        G, input_combo["vertex_pair"], use_weight=is_weighted
+    )
     # Save the results back to the input_combo dictionary to prevent redundant
     # cuGraph runs. Other tests using the input_combo fixture will look for
     # them, and if not present they will have to re-run the same cuGraph call.
@@ -133,7 +137,8 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output):
     use_weight = input_expected_output["is_weighted"]
 
     result_sorensen = benchmark(
-        dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight)
+        dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight
+    )
 
     result_sorensen = (
         result_sorensen.compute()