Skip to content

Commit

Permalink
Added Sorensen algorithm to Python API (#1820)
Browse files Browse the repository at this point in the history
Add Python implementations of the Sorensen and weighted Sorensen (wSorensen) coefficients, derived from the existing Jaccard implementation

Add tests for both algorithms

Because NetworkX currently has no Sorensen implementation, the tests convert the NetworkX Jaccard results to Sorensen and compare them with the cuGraph Sorensen results

Authors:
  - Joseph Nke (https://github.com/jnke2016)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Brad Rees (https://github.com/BradReesWork)

URL: #1820
  • Loading branch information
jnke2016 authored Sep 24, 2021
1 parent c3b26b6 commit 9191711
Show file tree
Hide file tree
Showing 14 changed files with 933 additions and 204 deletions.
3 changes: 3 additions & 0 deletions python/cugraph/cugraph/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,11 @@
jaccard_coefficient,
overlap,
overlap_coefficient,
sorensen,
sorensen_coefficient,
jaccard_w,
overlap_w,
sorensen_w,
)

from cugraph.traversal import (
Expand Down
3 changes: 3 additions & 0 deletions python/cugraph/cugraph/link_prediction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
from cugraph.link_prediction.overlap import overlap
from cugraph.link_prediction.wjaccard import jaccard_w
from cugraph.link_prediction.woverlap import overlap_w
from cugraph.link_prediction.wsorensen import sorensen_w
from cugraph.link_prediction.jaccard import jaccard_coefficient
from cugraph.link_prediction.sorensen import sorensen_coefficient
from cugraph.link_prediction.sorensen import sorensen
from cugraph.link_prediction.overlap import overlap_coefficient
13 changes: 5 additions & 8 deletions python/cugraph/cugraph/link_prediction/jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import cudf
from cugraph.structure.graph_classes import Graph
from cugraph.link_prediction import jaccard_wrapper
Expand Down Expand Up @@ -69,7 +68,7 @@ def jaccard(input_graph, vertex_pair=None):
Parameters
----------
graph : cugraph.Graph
cuGraph graph descriptor, should contain the connectivity information
cuGraph Graph instance, should contain the connectivity information
as an edge list (edge weights are not used for this algorithm). The
graph should be undirected where an undirected edge is represented by a
directed edge in both direction. The adjacency list will be computed if
Expand Down Expand Up @@ -107,13 +106,11 @@ def jaccard(input_graph, vertex_pair=None):
>>> df = cugraph.jaccard(G)
"""
if type(input_graph) is not Graph:
raise Exception("input graph must be undirected")
raise TypeError("input graph must a Graph")

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
elif vertex_pair is not None:
raise ValueError("vertex_pair must be a cudf dataframe")

df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)
Expand All @@ -132,7 +129,7 @@ def jaccard_coefficient(G, ebunch=None):
Parameters
----------
graph : cugraph.Graph
cuGraph graph descriptor, should contain the connectivity information
cuGraph Graph instance, should contain the connectivity information
as an edge list (edge weights are not used for this algorithm). The
graph should be undirected where an undirected edge is represented by a
directed edge in both direction. The adjacency list will be computed if
Expand Down Expand Up @@ -174,7 +171,7 @@ def jaccard_coefficient(G, ebunch=None):
G, isNx = check_nx_graph(G)

if isNx is True and ebunch is not None:
vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))
vertex_pair = cudf.DataFrame(ebunch)

df = jaccard(G, vertex_pair)

Expand Down
9 changes: 3 additions & 6 deletions python/cugraph/cugraph/link_prediction/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
from cugraph.link_prediction import overlap_wrapper
import cudf
from cugraph.utilities import check_nx_graph
Expand All @@ -29,7 +28,7 @@ def overlap_coefficient(G, ebunch=None):
G, isNx = check_nx_graph(G)

if isNx is True and ebunch is not None:
vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))
vertex_pair = cudf.DataFrame(ebunch)

df = overlap(G, vertex_pair)

Expand Down Expand Up @@ -57,7 +56,7 @@ def overlap(input_graph, vertex_pair=None):
Parameters
----------
graph : cugraph.Graph
cuGraph graph descriptor, should contain the connectivity information
cuGraph Graph instance, should contain the connectivity information
as an edge list (edge weights are not used for this algorithm). The
adjacency list will be computed if not already present.
vertex_pair : cudf.DataFrame
Expand Down Expand Up @@ -93,9 +92,7 @@ def overlap(input_graph, vertex_pair=None):

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
elif vertex_pair is not None:
raise ValueError("vertex_pair must be a cudf dataframe")

df = overlap_wrapper.overlap(input_graph, None, vertex_pair)
Expand Down
152 changes: 152 additions & 0 deletions python/cugraph/cugraph/link_prediction/sorensen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cudf
from cugraph.structure.graph_classes import Graph
from cugraph.link_prediction import jaccard_wrapper
from cugraph.utilities import check_nx_graph
from cugraph.utilities import df_edge_score_to_dictionary
from cugraph.utilities import renumber_vertex_pair


def sorensen(input_graph, vertex_pair=None):
    """
    Compute the Sorensen coefficient between each pair of vertices connected by
    an edge, or between arbitrary pairs of vertices specified by the user.

    The Sorensen coefficient of two sets is twice the volume of their
    intersection divided by the sum of the volumes of the two sets. It is
    obtained here from the Jaccard coefficient J as 2 * J / (1 + J).

    cugraph.sorensen, in the absence of a specified vertex pair list, will
    use the edges of the graph to construct a vertex pair list and will
    return the sorensen coefficient for those vertex pairs.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected where an undirected edge is represented by
        a directed edge in both direction. The adjacency list will be computed
        if not already present.

    vertex_pair : cudf.DataFrame, optional (default=None)
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the Sorensen coefficient is computed for the
        given vertex pairs. If the vertex_pair is not provided then the
        current implementation computes the Sorensen coefficient for all
        adjacent vertices in the graph.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        containing the Sorensen index. The ordering is relative to the
        adjacency list, or that given by the specified vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID
        df['destination'] : cudf.Series
            The destination vertex ID
        df['sorensen_coeff'] : cudf.Series
            The computed Sorensen coefficient between the source and
            destination vertices

    Raises
    ------
    TypeError
        If input_graph is not exactly of type Graph.
    ValueError
        If vertex_pair is neither None nor a cudf.DataFrame.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.sorensen(G)
    """
    # NOTE(review): exact type comparison, so subclasses of Graph are rejected
    # as well -- presumably intentional; confirm before relaxing to isinstance.
    if type(input_graph) is not Graph:
        raise TypeError("input graph must be a Graph")

    if isinstance(vertex_pair, cudf.DataFrame):
        vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
    elif vertex_pair is not None:
        raise ValueError("vertex_pair must be a cudf dataframe")

    # The Jaccard wrapper is reused (no weights are passed); its result is
    # then converted in place to Sorensen: S = 2J / (1 + J).
    df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)
    df['jaccard_coeff'] = (
        (2 * df['jaccard_coeff']) / (1 + df['jaccard_coeff'])
    )
    df.rename(
        {'jaccard_coeff': 'sorensen_coeff'}, axis=1, inplace=True)

    # Map internal vertex ids back to the caller's ids when the graph was
    # renumbered.
    if input_graph.renumbered:
        df = input_graph.unrenumber(df, "source")
        df = input_graph.unrenumber(df, "destination")

    return df


def sorensen_coefficient(G, ebunch=None):
    """
    Compute the Sorensen coefficient for the given graph, optionally
    restricted to the vertex pairs in ebunch.

    G is first passed through check_nx_graph, which returns the graph to
    operate on together with a flag (isNx); presumably the flag marks input
    that was converted from a NetworkX graph -- confirm against
    cugraph.utilities. The actual computation is delegated to sorensen().

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected where an undirected edge is represented by
        a directed edge in both direction. The adjacency list will be computed
        if not already present.

    ebunch : cudf.DataFrame, optional (default=None)
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the sorensen coefficient is computed for the
        given vertex pairs. If ebunch is not provided then the current
        implementation computes the sorensen coefficient for all adjacent
        vertices in the graph. When isNx is true, ebunch is converted with
        cudf.DataFrame(ebunch); otherwise it is forwarded unchanged.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        containing the Sorensen weights. The ordering is relative to the
        adjacency list, or that given by the specified vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID
        df['destination'] : cudf.Series
            The destination vertex ID
        df['sorensen_coeff'] : cudf.Series
            The computed sorensen coefficient between the source and
            destination vertices

        When isNx is true, the data frame is instead converted with
        df_edge_score_to_dictionary and that result is returned.

    Raises
    ------
    ValueError
        Raised by sorensen() if G is not converted (isNx false) and ebunch is
        neither None nor a cudf.DataFrame.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.sorensen_coefficient(G)
    """
    G, isNx = check_nx_graph(G)

    vertex_pair = None
    if ebunch is not None:
        if isNx is True:
            vertex_pair = cudf.DataFrame(ebunch)
        else:
            # Previously ebunch was silently dropped on this path, so the
            # caller got scores for every edge instead of the requested
            # pairs. Forward it and let sorensen() validate its type.
            vertex_pair = ebunch

    df = sorensen(G, vertex_pair)

    if isNx is True:
        df = df_edge_score_to_dictionary(df,
                                         k="sorensen_coeff",
                                         src="source",
                                         dst="destination")

    return df
32 changes: 22 additions & 10 deletions python/cugraph/cugraph/link_prediction/wjaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from cugraph.structure.graph_classes import Graph
from cugraph.link_prediction import jaccard_wrapper
import cudf
import numpy as np
from cugraph.utilities import renumber_vertex_pair


Expand All @@ -33,7 +32,7 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
Parameters
----------
graph : cugraph.Graph
cuGraph graph descriptor, should contain the connectivity information
cuGraph Graph instance , should contain the connectivity information
as an edge list (edge weights are not used for this algorithm). The
adjacency list will be computed if not already present.
Expand Down Expand Up @@ -74,32 +73,45 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
>>> dtype=['int32', 'int32', 'float32'], header=None)
>>> G = cugraph.Graph()
>>> G.from_cudf_edgelist(M, source='0', destination='1')
>>> df = cugraph.jaccard_w(G, M[2])
>>> # Create a dataframe containing the vertices with their
>>> # corresponding weight
>>> weights = cudf.DataFrame()
>>> # Sample 10 random vertices from the graph and drop duplicates if
>>> # there are any to avoid duplicates vertices with different weight
>>> # value in the 'weights' dataframe
>>> weights['vertex'] = G.nodes().sample(n=10).drop_duplicates()
>>> # Reset the indices and drop the index column
>>> weights.reset_index(inplace=True, drop=True)
>>> # Create a weight column with random weights
>>> weights['weight'] = [random.random() for w in range(
>>> len(weights['vertex']))]
>>> df = cugraph.jaccard_w(G, weights)
"""
if type(input_graph) is not Graph:
raise Exception("input graph must be undirected")
raise TypeError("input graph must a Graph")

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
elif vertex_pair is not None:
raise ValueError("vertex_pair must be a cudf dataframe")

if input_graph.renumbered:
# The 'vertex' column of the cudf 'weights' also needs to be renumbered
# if the graph was renumbered
vertex_size = input_graph.vertex_column_size()
# single-column vertices i.e only one src and dst columns
if vertex_size == 1:
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', 'vertex'
)
# multi-column vertices i.e more than one src and dst columns
else:
cols = weights.columns[:vertex_size].to_list()
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', cols
)
jaccard_weights = cudf.Series(np.ones(len(weights)))
for i in range(len(weights)):
jaccard_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]

jaccard_weights = weights['weight']
df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair)

if input_graph.renumbered:
Expand Down
25 changes: 16 additions & 9 deletions python/cugraph/cugraph/link_prediction/woverlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from cugraph.link_prediction import overlap_wrapper
import cudf
import numpy as np
from cugraph.utilities import renumber_vertex_pair


Expand All @@ -32,7 +31,7 @@ def overlap_w(input_graph, weights, vertex_pair=None):
Parameters
----------
input_graph : cugraph.Graph
cuGraph graph descriptor, should contain the connectivity information
cuGraph Graph instance, should contain the connectivity information
as an edge list (edge weights are not used for this algorithm). The
adjacency list will be computed if not already present.
Expand Down Expand Up @@ -66,14 +65,24 @@ def overlap_w(input_graph, weights, vertex_pair=None):
>>> dtype=['int32', 'int32', 'float32'], header=None)
>>> G = cugraph.Graph()
>>> G.from_cudf_edgelist(M, source='0', destination='1')
>>> df = cugraph.overlap_w(G, M[2])
>>> # Create a dataframe containing the vertices with their
>>> # corresponding weight
>>> weights = cudf.DataFrame()
>>> # Sample 10 random vertices from the graph and drop duplicates if
>>> # there are any to avoid duplicates vertices with different weight
>>> # value in the 'weights' dataframe
>>> weights['vertex'] = G.nodes().sample(n=10).drop_duplicates()
>>> # Reset the indices and drop the index column
>>> weights.reset_index(inplace=True, drop=True)
>>> # Create a weight column with random weights
>>> weights['weight'] = [random.random() for w in range(
>>> len(weights['vertex']))]
>>> df = cugraph.overlap_w(G, weights)
"""

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
elif vertex_pair is not None:
raise ValueError("vertex_pair must be a cudf dataframe")

if input_graph.renumbered:
Expand All @@ -88,9 +97,7 @@ def overlap_w(input_graph, weights, vertex_pair=None):
weights, 'vertex', cols
)

overlap_weights = cudf.Series(np.ones(len(weights)))
for i in range(len(weights)):
overlap_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]
overlap_weights = weights['weight']

overlap_weights = overlap_weights.astype('float32')

Expand Down
Loading

0 comments on commit 9191711

Please sign in to comment.