Added Sorensen algorithm to Python API (#1820)

Add a Python implementation of the Sorensen and wSorensen algorithms, derived from the prior Jaccard implementation. Add tests for both algorithms. Since NetworkX currently has no Sorensen implementation, the tests convert NetworkX Jaccard results to Sorensen and compare them to the cuGraph Sorensen results. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) URL: #1820
rapidsai · Sep 24, 2021 · 9191711 · 9191711
1 parent c3b26b6
commit 9191711
Show file tree

Hide file tree

Showing 14 changed files with 933 additions and 204 deletions.
diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py
@@ -77,8 +77,11 @@
     jaccard_coefficient,
     overlap,
     overlap_coefficient,
+    sorensen,
+    sorensen_coefficient,
     jaccard_w,
     overlap_w,
+    sorensen_w,
 )
 
 from cugraph.traversal import (

diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py
@@ -16,5 +16,8 @@
 from cugraph.link_prediction.overlap import overlap
 from cugraph.link_prediction.wjaccard import jaccard_w
 from cugraph.link_prediction.woverlap import overlap_w
+from cugraph.link_prediction.wsorensen import sorensen_w
 from cugraph.link_prediction.jaccard import jaccard_coefficient
+from cugraph.link_prediction.sorensen import sorensen_coefficient
+from cugraph.link_prediction.sorensen import sorensen
 from cugraph.link_prediction.overlap import overlap_coefficient
diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py
@@ -11,7 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pandas as pd
 import cudf
 from cugraph.structure.graph_classes import Graph
 from cugraph.link_prediction import jaccard_wrapper
@@ -69,7 +68,7 @@ def jaccard(input_graph, vertex_pair=None):
     Parameters
     ----------
     graph : cugraph.Graph
-        cuGraph graph descriptor, should contain the connectivity information
+        cuGraph Graph instance, should contain the connectivity information
         as an edge list (edge weights are not used for this algorithm). The
         graph should be undirected where an undirected edge is represented by a
         directed edge in both direction. The adjacency list will be computed if
@@ -107,13 +106,11 @@ def jaccard(input_graph, vertex_pair=None):
     >>> df = cugraph.jaccard(G)
     """
     if type(input_graph) is not Graph:
-        raise Exception("input graph must be undirected")
+        raise TypeError("input graph must a Graph")
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
-    elif vertex_pair is None:
-        pass
-    else:
+    elif vertex_pair is not None:
         raise ValueError("vertex_pair must be a cudf dataframe")
 
     df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)
@@ -132,7 +129,7 @@ def jaccard_coefficient(G, ebunch=None):
     Parameters
     ----------
     graph : cugraph.Graph
-        cuGraph graph descriptor, should contain the connectivity information
+        cuGraph Graph instance, should contain the connectivity information
         as an edge list (edge weights are not used for this algorithm). The
         graph should be undirected where an undirected edge is represented by a
         directed edge in both direction. The adjacency list will be computed if
@@ -174,7 +171,7 @@ def jaccard_coefficient(G, ebunch=None):
     G, isNx = check_nx_graph(G)
 
     if isNx is True and ebunch is not None:
-        vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))
+        vertex_pair = cudf.DataFrame(ebunch)
 
     df = jaccard(G, vertex_pair)
 

diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py
@@ -11,7 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pandas as pd
 from cugraph.link_prediction import overlap_wrapper
 import cudf
 from cugraph.utilities import check_nx_graph
@@ -29,7 +28,7 @@ def overlap_coefficient(G, ebunch=None):
     G, isNx = check_nx_graph(G)
 
     if isNx is True and ebunch is not None:
-        vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))
+        vertex_pair = cudf.DataFrame(ebunch)
 
     df = overlap(G, vertex_pair)
 
@@ -57,7 +56,7 @@ def overlap(input_graph, vertex_pair=None):
     Parameters
     ----------
     graph : cugraph.Graph
-        cuGraph graph descriptor, should contain the connectivity information
+        cuGraph Graph instance, should contain the connectivity information
         as an edge list (edge weights are not used for this algorithm). The
         adjacency list will be computed if not already present.
     vertex_pair : cudf.DataFrame
@@ -93,9 +92,7 @@ def overlap(input_graph, vertex_pair=None):
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
-    elif vertex_pair is None:
-        pass
-    else:
+    elif vertex_pair is not None:
         raise ValueError("vertex_pair must be a cudf dataframe")
 
     df = overlap_wrapper.overlap(input_graph, None, vertex_pair)

diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cudf
+from cugraph.structure.graph_classes import Graph
+from cugraph.link_prediction import jaccard_wrapper
+from cugraph.utilities import check_nx_graph
+from cugraph.utilities import df_edge_score_to_dictionary
+from cugraph.utilities import renumber_vertex_pair
+
+
+def sorensen(input_graph, vertex_pair=None):
+    """
+    Compute the Sorensen coefficient between each pair of vertices connected by
+    an edge, or between arbitrary pairs of vertices specified by the user.
+    Sorensen coefficient is defined between two sets as the ratio of twice the
+    volume of their intersection to the sum of the volumes of the two sets.
+    If first is specified but second is not, or vice versa, an exception will
+    be thrown.
+
+    cugraph.sorensen, in the absence of a specified vertex pair list, will
+    use the edges of the graph to construct a vertex pair list and will
+    return the sorensen coefficient for those vertex pairs.
+
+    Parameters
+    ----------
+    input_graph : cugraph.Graph
+        cuGraph Graph instance, should contain the connectivity information
+        as an edge list (edge weights are not used for this algorithm). The
+        graph should be undirected where an undirected edge is represented by a
+        directed edge in both direction. The adjacency list will be computed if
+        not already present.
+    vertex_pair : cudf.DataFrame
+        A GPU dataframe consisting of two columns representing pairs of
+        vertices. If provided, the Sorensen coefficient is computed for the
+        given vertex pairs.  If the vertex_pair is not provided then the
+        current implementation computes the Sorensen coefficient for all
+        adjacent vertices in the graph.
+
+    Returns
+    -------
+    df  : cudf.DataFrame
+        GPU data frame of size E (the default) or the size of the given pairs
+        (first, second) containing the Sorensen index. The ordering is
+        relative to the adjacency list, or that given by the specified vertex
+        pairs.
+
+        df['source'] : cudf.Series
+            The source vertex ID (will be identical to first if specified)
+        df['destination'] : cudf.Series
+            The destination vertex ID (will be identical to second if
+            specified)
+        df['sorensen_coeff'] : cudf.Series
+            The computed Sorensen coefficient between the source and
+            destination vertices
+
+    Examples
+    --------
+    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
+    >>> G = cugraph.Graph()
+    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
+    >>> df = cugraph.sorensen(G)
+    """
+    if type(input_graph) is not Graph:
+        raise TypeError("input graph must a Graph")
+
+    if type(vertex_pair) == cudf.DataFrame:
+        vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
+    elif vertex_pair is not None:
+        raise ValueError("vertex_pair must be a cudf dataframe")
+
+    df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)
+    df.jaccard_coeff = ((2*df.jaccard_coeff)/(1+df.jaccard_coeff))
+    df.rename(
+        {'jaccard_coeff': 'sorensen_coeff'}, axis=1, inplace=True)
+    if input_graph.renumbered:
+        df = input_graph.unrenumber(df, "source")
+        df = input_graph.unrenumber(df, "destination")
+
+    return df
+
+
+def sorensen_coefficient(G, ebunch=None):
+    """Compute Sorensen coefficients for G by delegating to ``sorensen``.
+
+    Parameters
+    ----------
+    G : cugraph.Graph
+        cuGraph Graph instance, should contain the connectivity information
+        as an edge list (edge weights are not used for this algorithm). The
+        graph should be undirected where an undirected edge is represented by a
+        directed edge in both direction. The adjacency list will be computed if
+        not already present.
+    ebunch : cudf.DataFrame
+        A GPU dataframe consisting of two columns representing pairs of
+        vertices. If provided, the sorensen coefficient is computed for the
+        given vertex pairs.  If ebunch is not provided then the
+        current implementation computes the sorensen coefficient for all
+        adjacent vertices in the graph.
+
+    Returns
+    -------
+    df  : cudf.DataFrame
+        GPU data frame of size E (the default) or the size of the given pairs
+        (first, second) containing the Sorensen weights. The ordering is
+        relative to the adjacency list, or that given by the specified vertex
+        pairs.
+
+        df['source'] : cudf.Series
+            The source vertex ID (will be identical to first if specified)
+        df['destination'] : cudf.Series
+            The destination vertex ID (will be identical to second if
+            specified)
+        df['sorensen_coeff'] : cudf.Series
+            The computed sorensen coefficient between the source and
+            destination vertices
+
+    Examples
+    --------
+    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
+    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
+    >>> G = cugraph.Graph()
+    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
+    >>> df = cugraph.sorensen_coefficient(G)
+    """
+    vertex_pair = None
+
+    G, isNx = check_nx_graph(G)
+
+    if isNx is True and ebunch is not None:
+        vertex_pair = cudf.DataFrame(ebunch)
+
+    df = sorensen(G, vertex_pair)
+
+    if isNx is True:
+        df = df_edge_score_to_dictionary(df,
+                                         k="sorensen_coeff",
+                                         src="source",
+                                         dst="destination")
+
+    return df
diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py
@@ -14,7 +14,6 @@
 from cugraph.structure.graph_classes import Graph
 from cugraph.link_prediction import jaccard_wrapper
 import cudf
-import numpy as np
 from cugraph.utilities import renumber_vertex_pair
 
 
@@ -33,7 +32,7 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
     Parameters
     ----------
     graph : cugraph.Graph
-        cuGraph graph descriptor, should contain the connectivity information
+        cuGraph Graph instance, should contain the connectivity information
         as an edge list (edge weights are not used for this algorithm). The
         adjacency list will be computed if not already present.
 
@@ -74,32 +73,45 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
     >>>                   dtype=['int32', 'int32', 'float32'], header=None)
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1')
-    >>> df = cugraph.jaccard_w(G, M[2])
+    >>> # Create a dataframe containing the vertices with their
+    >>> # corresponding weight
+    >>> weights = cudf.DataFrame()
+    >>> # Sample 10 random vertices from the graph and drop duplicates if
+    >>> # there are any to avoid duplicate vertices with different weight
+    >>> # values in the 'weights' dataframe
+    >>> weights['vertex'] = G.nodes().sample(n=10).drop_duplicates()
+    >>> # Reset the indices and drop the index column
+    >>> weights.reset_index(inplace=True, drop=True)
+    >>> # Create a weight column with random weights
+    >>> weights['weight'] = [random.random() for w in range(
+    >>>                      len(weights['vertex']))]
+    >>> df = cugraph.jaccard_w(G, weights)
     """
     if type(input_graph) is not Graph:
-        raise Exception("input graph must be undirected")
+        raise TypeError("input graph must a Graph")
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
-    elif vertex_pair is None:
-        pass
-    else:
+    elif vertex_pair is not None:
         raise ValueError("vertex_pair must be a cudf dataframe")
 
     if input_graph.renumbered:
+        # The 'vertex' column of the cudf 'weights' also needs to be renumbered
+        # if the graph was renumbered
         vertex_size = input_graph.vertex_column_size()
+        # single-column vertices, i.e. only one src and one dst column
         if vertex_size == 1:
             weights = input_graph.add_internal_vertex_id(
                 weights, 'vertex', 'vertex'
             )
+        # multi-column vertices, i.e. more than one src and dst columns
         else:
             cols = weights.columns[:vertex_size].to_list()
             weights = input_graph.add_internal_vertex_id(
                 weights, 'vertex', cols
             )
-    jaccard_weights = cudf.Series(np.ones(len(weights)))
-    for i in range(len(weights)):
-        jaccard_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]
+
+    jaccard_weights = weights['weight']
     df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair)
 
     if input_graph.renumbered:

diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py
@@ -13,7 +13,6 @@
 
 from cugraph.link_prediction import overlap_wrapper
 import cudf
-import numpy as np
 from cugraph.utilities import renumber_vertex_pair
 
 
@@ -32,7 +31,7 @@ def overlap_w(input_graph, weights, vertex_pair=None):
     Parameters
     ----------
     input_graph : cugraph.Graph
-        cuGraph graph descriptor, should contain the connectivity information
+        cuGraph Graph instance, should contain the connectivity information
         as an edge list (edge weights are not used for this algorithm). The
         adjacency list will be computed if not already present.
 
@@ -66,14 +65,24 @@ def overlap_w(input_graph, weights, vertex_pair=None):
     >>>                   dtype=['int32', 'int32', 'float32'], header=None)
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1')
-    >>> df = cugraph.overlap_w(G, M[2])
+    >>> # Create a dataframe containing the vertices with their
+    >>> # corresponding weight
+    >>> weights = cudf.DataFrame()
+    >>> # Sample 10 random vertices from the graph and drop duplicates if
+    >>> # there are any to avoid duplicate vertices with different weight
+    >>> # values in the 'weights' dataframe
+    >>> weights['vertex'] = G.nodes().sample(n=10).drop_duplicates()
+    >>> # Reset the indices and drop the index column
+    >>> weights.reset_index(inplace=True, drop=True)
+    >>> # Create a weight column with random weights
+    >>> weights['weight'] = [random.random() for w in range(
+    >>>                      len(weights['vertex']))]
+    >>> df = cugraph.overlap_w(G, weights)
     """
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
-    elif vertex_pair is None:
-        pass
-    else:
+    elif vertex_pair is not None:
         raise ValueError("vertex_pair must be a cudf dataframe")
 
     if input_graph.renumbered:
@@ -88,9 +97,7 @@ def overlap_w(input_graph, weights, vertex_pair=None):
                 weights, 'vertex', cols
             )
 
-    overlap_weights = cudf.Series(np.ones(len(weights)))
-    for i in range(len(weights)):
-        overlap_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]
+    overlap_weights = weights['weight']
 
     overlap_weights = overlap_weights.astype('float32')