Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix OOM Bug for Jaccard, Sorensen, and Overlap benchmarks #4524

Merged
33 changes: 31 additions & 2 deletions benchmarks/cugraph/pytest-based/bench_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,20 @@ def is_graph_distributed(graph):
return isinstance(graph.edgelist.edgelist_df, dask_cudf.DataFrame)


def get_vertex_pairs(G, num_vertices=10):
    """
    Sample random vertices from a Graph and return their two-hop vertex
    pairs as a DataFrame.

    Parameters
    ----------
    G : Graph
        Graph to sample from; must provide ``select_random_vertices`` and
        ``get_two_hop_neighbors``.
    num_vertices : int (default=10)
        Number of vertices requested from ``G.select_random_vertices``.

    Returns
    -------
    The result of ``G.get_two_hop_neighbors`` called with the sampled
    vertices as ``start_vertices``.
    """
    sampled = G.select_random_vertices(num_vertices=num_vertices)

    # A dask_cudf.Series is computed first so that it can be converted to a
    # plain Python list below.
    seeds = sampled.compute() if isinstance(sampled, dask_cudf.Series) else sampled

    return G.get_two_hop_neighbors(start_vertices=seeds.to_arrow().to_pylist())


###############################################################################
# Benchmarks
def bench_create_graph(gpubenchmark, edgelist):
Expand Down Expand Up @@ -323,8 +337,20 @@ def bench_sssp(gpubenchmark, graph):

def bench_jaccard(gpubenchmark, unweighted_graph):
    """
    Benchmark jaccard on a sampled subset of two-hop vertex pairs.

    Uses ``dask_cugraph.jaccard`` when the graph is distributed and
    ``cugraph.jaccard`` otherwise.
    """
    G = unweighted_graph
    # algo cannot compute neighbors on all nodes without running into OOM
    # this is why we will call jaccard on a subset of nodes
    vert_pairs = get_vertex_pairs(G)
    jaccard = dask_cugraph.jaccard if is_graph_distributed(G) else cugraph.jaccard
    # Only the subset call is benchmarked; an all-vertices call
    # (``gpubenchmark(jaccard, G)``) is the workload that runs out of memory.
    gpubenchmark(jaccard, G, vert_pairs)


def bench_sorensen(gpubenchmark, unweighted_graph):
    """
    Benchmark sorensen on a sampled subset of two-hop vertex pairs.

    Uses ``dask_cugraph.sorensen`` when the graph is distributed and
    ``cugraph.sorensen`` otherwise.
    """
    G = unweighted_graph
    # Running on every node exhausts memory, so the benchmark is restricted
    # to a sampled subset of vertex pairs.
    pairs = get_vertex_pairs(G)
    if is_graph_distributed(G):
        algo = dask_cugraph.sorensen
    else:
        algo = cugraph.sorensen
    gpubenchmark(algo, G, pairs)


@pytest.mark.skipif(
Expand All @@ -347,8 +373,11 @@ def bench_weakly_connected_components(gpubenchmark, graph):

def bench_overlap(gpubenchmark, unweighted_graph):
    """
    Benchmark overlap on a sampled subset of two-hop vertex pairs.

    Uses ``dask_cugraph.overlap`` when the graph is distributed and
    ``cugraph.overlap`` otherwise.
    """
    G = unweighted_graph
    # algo cannot compute neighbors on all nodes without running into OOM
    # this is why we will call overlap on a subset of nodes
    vertex_pairs = get_vertex_pairs(G)
    overlap = dask_cugraph.overlap if is_graph_distributed(G) else cugraph.overlap
    # Only the subset call is benchmarked; an all-vertices call
    # (``gpubenchmark(overlap, G)``) is the workload that runs out of memory.
    gpubenchmark(overlap, G, vertex_pairs)


def bench_triangle_count(gpubenchmark, graph):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,12 @@ def get_two_hop_neighbors(self, start_vertices=None):
Compute vertex pairs that are two hops apart. The resulting pairs are
sorted before returning.

Parameters
----------
start_vertices : Int or List (default=None)
Subset of vertices to compute two hop neighbors on. If None, compute
for all nodes.

Returns
-------
df : cudf.DataFrame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,12 @@ def get_two_hop_neighbors(self, start_vertices=None):
Compute vertex pairs that are two hops apart. The resulting pairs are
sorted before returning.

Parameters
----------
start_vertices : Int or List (default=None)
Subset of vertices to compute two hop neighbors on. If None, compute
for all nodes.

Returns
-------
df : cudf.DataFrame
Expand Down
Loading