Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable weights for MG similarity algorithms #3879

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions python/cugraph/cugraph/dask/link_prediction/jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):
adjacent vertices in the graph.

use_weight : bool, optional (default=False)
Currently not supported
If True, compute the weighted Jaccard coefficient; if False, compute
the unweighted Jaccard coefficient.
'input_graph' must be weighted when 'use_weight' is True.

Returns
-------
Expand All @@ -144,12 +146,6 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):

vertex_pair_col_name = vertex_pair.columns

if use_weight:
raise ValueError("'use_weight' is currently not supported.")

if input_graph.is_weighted():
raise ValueError("Weighted graphs are currently not supported.")

if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

Expand Down
10 changes: 3 additions & 7 deletions python/cugraph/cugraph/dask/link_prediction/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
adjacent vertices in the graph.

use_weight : bool, optional (default=False)
Currently not supported
If True, compute the weighted overlap coefficient; if False, compute
the unweighted overlap coefficient.
'input_graph' must be weighted when 'use_weight' is True.

Returns
-------
Expand All @@ -122,12 +124,6 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):

vertex_pair_col_name = vertex_pair.columns

if use_weight:
raise ValueError("'use_weight' is currently not supported.")

if input_graph.is_weighted():
raise ValueError("Weighted graphs are currently not supported.")

if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

Expand Down
10 changes: 3 additions & 7 deletions python/cugraph/cugraph/dask/link_prediction/sorensen.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,9 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False):
adjacent vertices in the graph.

use_weight : bool, optional (default=False)
Currently not supported
If True, compute the weighted Sorensen coefficient; if False, compute
the unweighted Sorensen coefficient.
'input_graph' must be weighted when 'use_weight' is True.

Returns
-------
Expand All @@ -118,12 +120,6 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False):

vertex_pair_col_name = vertex_pair.columns

if use_weight:
raise ValueError("'use_weight' is currently not supported.")

if input_graph.is_weighted():
raise ValueError("Weighted graphs are currently not supported.")

if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

Expand Down
59 changes: 17 additions & 42 deletions python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def setup_function():

IS_DIRECTED = [False]
HAS_VERTEX_PAIR = [True, False]
IS_WEIGHTED = [True, False]


# =============================================================================
Expand All @@ -48,6 +49,7 @@ def setup_function():
(datasets, "graph_file"),
(IS_DIRECTED, "directed"),
(HAS_VERTEX_PAIR, "has_vertex_pair"),
(IS_WEIGHTED, "is_weighted"),
)


Expand All @@ -57,7 +59,9 @@ def input_combo(request):
Simply return the current combination of params as a dictionary for use in
tests or other parameterized fixtures.
"""
parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param))
parameters = dict(
zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param)
)

return parameters

Expand All @@ -72,7 +76,10 @@ def input_expected_output(input_combo):
input_data_path = input_combo["graph_file"]
directed = input_combo["directed"]
has_vertex_pair = input_combo["has_vertex_pair"]
G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
is_weighted = input_combo["is_weighted"]
G = utils.generate_cugraph_graph_from_file(
input_data_path, directed=directed, edgevals=is_weighted
)
if has_vertex_pair:
# Sample random vertices from the graph and compute the two_hop_neighbors
# with those seeds
Expand All @@ -84,7 +91,9 @@ def input_expected_output(input_combo):
vertex_pair = None

input_combo["vertex_pair"] = vertex_pair
sg_cugraph_jaccard = cugraph.experimental.jaccard(G, input_combo["vertex_pair"])
sg_cugraph_jaccard = cugraph.jaccard(
G, input_combo["vertex_pair"], use_weight=is_weighted
)
# Save the results back to the input_combo dictionary to prevent redundant
# cuGraph runs. Other tests using the input_combo fixture will look for
# them, and if not present they will have to re-run the same cuGraph call.
Expand All @@ -104,6 +113,7 @@ def input_expected_output(input_combo):
ddf,
source="src",
destination="dst",
edge_attr="value" if is_weighted else None,
renumber=True,
store_transposed=True,
)
Expand All @@ -122,8 +132,11 @@ def input_expected_output(input_combo):
def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output):

dg = input_expected_output["MGGraph"]
use_weight = input_expected_output["is_weighted"]

result_jaccard = benchmark(dcg.jaccard, dg, input_expected_output["vertex_pair"])
result_jaccard = benchmark(
dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight
)

result_jaccard = (
result_jaccard.compute()
Expand Down Expand Up @@ -151,41 +164,3 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output):

assert len(jaccard_coeff_diffs1) == 0
assert len(jaccard_coeff_diffs2) == 0


@pytest.mark.mg
def test_dask_mg_weighted_jaccard(dask_client):
    """Check that MG Jaccard raises ValueError on a weighted graph.

    Two weighted graphs are built from the first dataset: one with
    ``renumber=True`` and one with the default renumbering. The first is
    passed to ``dcg.jaccard`` without ``use_weight``; the second with
    ``use_weight=True``. Both calls are expected to raise ``ValueError``.
    """
    csv_path = datasets[0]
    edges_ddf = dask_cudf.read_csv(
        csv_path,
        chunksize=dcg.get_chunksize(csv_path),
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Edgelist arguments common to both graphs; only `renumber` differs.
    edgelist_kwargs = {
        "source": "src",
        "destination": "dst",
        "edge_attr": "value",
        "store_transposed": True,
    }

    # Weighted graph, explicit renumbering, default use_weight.
    weighted_graph = cugraph.Graph(directed=False)
    weighted_graph.from_dask_cudf_edgelist(
        edges_ddf, renumber=True, **edgelist_kwargs
    )
    with pytest.raises(ValueError):
        dcg.jaccard(weighted_graph)

    # Weighted graph, default renumbering, use_weight explicitly requested.
    weighted_graph = cugraph.Graph(directed=False)
    weighted_graph.from_dask_cudf_edgelist(edges_ddf, **edgelist_kwargs)
    with pytest.raises(ValueError):
        dcg.jaccard(weighted_graph, use_weight=True)
59 changes: 17 additions & 42 deletions python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def setup_function():

IS_DIRECTED = [False]
HAS_VERTEX_PAIR = [True, False]
IS_WEIGHTED = [True, False]


# =============================================================================
Expand All @@ -48,6 +49,7 @@ def setup_function():
(datasets, "graph_file"),
(IS_DIRECTED, "directed"),
(HAS_VERTEX_PAIR, "has_vertex_pair"),
(IS_WEIGHTED, "is_weighted"),
)


Expand All @@ -57,7 +59,9 @@ def input_combo(request):
Simply return the current combination of params as a dictionary for use in
tests or other parameterized fixtures.
"""
parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param))
parameters = dict(
zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param)
)

return parameters

Expand All @@ -72,7 +76,10 @@ def input_expected_output(input_combo):
input_data_path = input_combo["graph_file"]
directed = input_combo["directed"]
has_vertex_pair = input_combo["has_vertex_pair"]
G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
is_weighted = input_combo["is_weighted"]
Comment on lines 76 to +79
Copy link
Contributor

@naimnv naimnv Sep 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder, wouldn't it be cleaner to declare these column names as global variables at the beginning of the file?

G = utils.generate_cugraph_graph_from_file(
input_data_path, directed=directed, edgevals=is_weighted
)
if has_vertex_pair:
# Sample random vertices from the graph and compute the two_hop_neighbors
# with those seeds
Expand All @@ -84,7 +91,9 @@ def input_expected_output(input_combo):
vertex_pair = None

input_combo["vertex_pair"] = vertex_pair
sg_cugraph_overlap = cugraph.experimental.overlap(G, input_combo["vertex_pair"])
sg_cugraph_overlap = cugraph.overlap(
G, input_combo["vertex_pair"], use_weight=is_weighted
)
# Save the results back to the input_combo dictionary to prevent redundant
# cuGraph runs. Other tests using the input_combo fixture will look for
# them, and if not present they will have to re-run the same cuGraph call.
Expand All @@ -104,6 +113,7 @@ def input_expected_output(input_combo):
ddf,
source="src",
destination="dst",
edge_attr="value" if is_weighted else None,
renumber=True,
store_transposed=True,
)
Expand All @@ -125,8 +135,11 @@ def input_expected_output(input_combo):
def test_dask_mg_overlap(dask_client, benchmark, input_expected_output):

dg = input_expected_output["MGGraph"]
use_weight = input_expected_output["is_weighted"]

result_overlap = benchmark(dcg.overlap, dg, input_expected_output["vertex_pair"])
result_overlap = benchmark(
dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight
)

result_overlap = (
result_overlap.compute()
Expand Down Expand Up @@ -154,41 +167,3 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output):

assert len(overlap_coeff_diffs1) == 0
assert len(overlap_coeff_diffs2) == 0


@pytest.mark.mg
def test_dask_mg_weighted_overlap():
    """Check that MG overlap raises ValueError on a weighted graph.

    Two weighted graphs are built from the first dataset: one with
    ``renumber=True`` and one with the default renumbering. The first is
    passed to ``dcg.overlap`` without ``use_weight``; the second with
    ``use_weight=True``. Both calls are expected to raise ``ValueError``.
    """
    # NOTE(review): unlike test_dask_mg_overlap, this test does not request
    # the `dask_client` fixture -- confirm a client is available when it runs.
    csv_path = datasets[0]
    edges_ddf = dask_cudf.read_csv(
        csv_path,
        chunksize=dcg.get_chunksize(csv_path),
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Edgelist arguments common to both graphs; only `renumber` differs.
    edgelist_kwargs = {
        "source": "src",
        "destination": "dst",
        "edge_attr": "value",
        "store_transposed": True,
    }

    # Weighted graph, explicit renumbering, default use_weight.
    weighted_graph = cugraph.Graph(directed=False)
    weighted_graph.from_dask_cudf_edgelist(
        edges_ddf, renumber=True, **edgelist_kwargs
    )
    with pytest.raises(ValueError):
        dcg.overlap(weighted_graph)

    # Weighted graph, default renumbering, use_weight explicitly requested.
    weighted_graph = cugraph.Graph(directed=False)
    weighted_graph.from_dask_cudf_edgelist(edges_ddf, **edgelist_kwargs)
    with pytest.raises(ValueError):
        dcg.overlap(weighted_graph, use_weight=True)
Loading