Shuffle the vertex pair (#3002)
An illegal memory access occurs when running the MG similarity algos at certain scales. This is caused by vertex pairs not being shuffled appropriately.
This PR:
1. Shuffle the vertex pairs based on the edge partitioning
2. Update the vertex pair column names, since vertex pairs are not necessarily edge lists
3. Update the docstrings, tests and notebooks accordingly

closes #3001
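
For intuition, here is a minimal Python sketch of the invariant this fix restores. `gpu_for_pair` is a hypothetical stand-in for cugraph's actual edge-partitioning rule; the only point it makes is that vertex pairs must be placed by the same rule that places edges, so each GPU is only asked about pairs whose adjacency data it holds.

```python
# Sketch only: `gpu_for_pair` is a hypothetical placement rule standing in
# for cugraph's real edge partitioning. The invariant is that vertex pairs
# and edges are placed by the SAME rule.
def gpu_for_pair(u: int, v: int, num_gpus: int) -> int:
    return hash((u, v)) % num_gpus

def shuffle_vertex_pairs(firsts, seconds, num_gpus):
    """Bucket (first, second) pairs by the GPU that owns the matching edge."""
    buckets = [[] for _ in range(num_gpus)]
    for u, v in zip(firsts, seconds):
        buckets[gpu_for_pair(u, v, num_gpus)].append((u, v))
    return buckets
```

Without this step a pair can land on a GPU that holds no adjacency data for it, which is the illegal memory access reported in #3001.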

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #3002
jnke2016 authored Nov 30, 2022
1 parent 60f5e7b commit 8148123
Showing 30 changed files with 294 additions and 267 deletions.
8 changes: 7 additions & 1 deletion cpp/src/c_api/graph_functions.cpp
@@ -71,7 +71,13 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor {
    second_copy.data(), second_->as_type<vertex_t>(), second_->size_, handle_.get_stream());

  if constexpr (multi_gpu) {
    // FIXME: shuffle first_copy/second_copy
    std::tie(first_copy, second_copy, std::ignore) =
      cugraph::detail::shuffle_edgelist_by_gpu_id<vertex_t, weight_t>(
        handle_,
        std::move(first_copy),
        std::move(second_copy),
        std::nullopt);  // vertex pairs should be shuffled based on the edge partitioning, so we
                        // can use this edge shuffling function to shuffle vertex pairs.
  }

  result_ = new cugraph::c_api::cugraph_vertex_pairs_t{
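
Rather than adding a dedicated pair shuffler, the fix reuses the existing edge-list shuffling function and passes `std::nullopt` for the weight column. A rough Python analogue of that design choice (hypothetical names, not the cugraph API):

```python
from typing import List, Optional, Sequence, Tuple

def shuffle_edgelist(srcs: Sequence[int], dsts: Sequence[int],
                     weights: Optional[Sequence[float]],
                     num_gpus: int) -> List[List[Tuple]]:
    """Bucket edges -- or plain vertex pairs, when weights is None -- by GPU."""
    buckets: List[List[Tuple]] = [[] for _ in range(num_gpus)]
    for i, (u, v) in enumerate(zip(srcs, dsts)):
        row = (u, v) if weights is None else (u, v, weights[i])
        buckets[hash((u, v)) % num_gpus].append(row)
    return buckets

# Vertex pairs carry no weight column, so pass None -- the same code path
# the C++ change above takes with std::nullopt.
pair_buckets = shuffle_edgelist([0, 1, 2], [3, 4, 5], None, num_gpus=2)
```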
26 changes: 13 additions & 13 deletions notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb
@@ -59,9 +59,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex i of each pair.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n",
"<br>\n",
"\n",
"__References__ \n",
@@ -102,9 +102,9 @@
"Returns: \n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df['source']: The source vertex id.\n",
" df['destination']: The destination vertex id.\n",
" df['jaccard_coeff']: The weighted jaccard coefficient computed between the source and destination vertex.\n",
" df['first']: The first vertex id of each pair.\n",
" df['second']: The second vertex id of each pair.\n",
" df['jaccard_coeff']: The weighted jaccard coefficient computed between the vertex pairs.\n",
" \n",
"\n",
"__Note:__ For this example we will be using PageRank as the edge weights. Please review the PageRank notebook if you have any questions about running PageRank\n"
@@ -187,8 +187,8 @@
" \n",
" #find the best\n",
" for i in range(len(dm)): \n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['jaccard_coeff'].iloc[i]))\n",
" del jmax\n",
" del dm"
@@ -206,8 +206,8 @@
" filtered = _d.query('jaccard_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['jaccard_coeff'].iloc[i]))"
]
},
@@ -322,7 +322,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The Most similar shoul be 33 and 34.\n",
"The Most similar should be 33 and 34.\n",
"Vertex 33 has 12 neighbors, vertex 34 has 17 neighbors. They share 10 neighbors in common:\n",
"$jaccard = 10 / (10 + (12 -10) + (17-10)) = 10 / 19 = 0.526$"
]
@@ -348,7 +348,7 @@
"# Before printing, let's get rid of the duplicates (x compared to y is the same as y compared to x). We will do that\n",
"# by performing a query. Then let's sort the data by score\n",
"\n",
"jdf_s = jdf.query('source < destination').sort_values(by='jaccard_coeff', ascending=False)\n",
"jdf_s = jdf.query('first < second').sort_values(by='jaccard_coeff', ascending=False)\n",
"\n",
"print_jaccard_threshold(jdf_s, 0.0)"
]
@@ -500,4 +500,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
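
After this change the single-GPU result schema matches the notebook text above: `first`, `second`, and `jaccard_coeff`. A minimal, self-contained sketch (toy edge list; any undirected graph works):

```python
import cudf
import cugraph

# Toy undirected graph.
edges = cudf.DataFrame({"src": [0, 0, 1, 2], "dst": [1, 2, 2, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

jdf = cugraph.jaccard(G)  # columns: first, second, jaccard_coeff

# (x, y) and (y, x) carry the same score, so keep a single ordering.
jdf_s = jdf.query("first < second").sort_values(by="jaccard_coeff",
                                                ascending=False)
print(jdf_s.head())
```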
56 changes: 28 additions & 28 deletions notebooks/algorithms/link_prediction/Overlap-Similarity.ipynb
@@ -59,9 +59,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"overlap_coeff\"]: The overlap coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex id of each pair.\n",
" df[\"overlap_coeff\"]: The overlap coefficient computed between the vertex pairs.\n",
"\n",
"__References__\n",
"- https://en.wikipedia.org/wiki/Overlap_coefficient\n"
@@ -87,9 +87,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex id of each pair.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n",
"<br>\n",
"\n",
"See the Jaccard notebook for additional information and background"
@@ -174,8 +174,8 @@
" \n",
" #find the best\n",
" for i in range(len(dm)): \n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['jaccard_coeff'].iloc[i]))\n",
" del jmax\n",
" del dm"
@@ -191,11 +191,11 @@
"def print_most_similar_overlap(df):\n",
" \n",
" smax = df['overlap_coeff'].max()\n",
" dm = df.query('overlap_coeff >= @smax and source < destination') \n",
" dm = df.query('overlap_coeff >= @smax and first < second') \n",
" \n",
" for i in range(len(dm)):\n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['overlap_coeff'].iloc[i]))\n",
" \n",
" del smax\n",
@@ -214,8 +214,8 @@
" filtered = _d.query('jaccard_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['jaccard_coeff'].iloc[i]))"
]
},
@@ -231,9 +231,9 @@
" filtered = _d.query('overlap_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" if filtered['source'].iloc[i] != filtered['destination'].iloc[i] :\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" if filtered['first'].iloc[i] != filtered['second'].iloc[i] :\n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['overlap_coeff'].iloc[i]))"
]
},
@@ -361,7 +361,7 @@
"# Before printing, let's get rid of the duplicates (x compared to y is the same as y compared to x). We will do that\n",
"# by performing a query. Then let's sort the data by score\n",
"\n",
"jdf_s = jdf.query('source < destination').sort_values(by='jaccard_coeff', ascending=False)\n",
"jdf_s = jdf.query('first < second').sort_values(by='jaccard_coeff', ascending=False)\n",
"\n",
"print_jaccard_threshold(jdf_s, 0.0)"
]
@@ -410,8 +410,8 @@
"outputs": [],
"source": [
"# print all similarities over a threshold, in this case 0.5\n",
"#also, drop duplicates\n",
"odf_s = odf.query('source < destination').sort_values(by='overlap_coeff', ascending=False)\n",
"# also, drop duplicates\n",
"odf_s = odf.query('first < second').sort_values(by='overlap_coeff', ascending=False)\n",
"\n",
"print_overlap_threshold(odf_s, 0.5)"
]
@@ -467,7 +467,7 @@
"source": [
"# print all similarities over a threshold, in this case 0.5\n",
"# also, drop duplicates\n",
"odf_s2 = ol2.query('source < destination').sort_values(by='overlap_coeff', ascending=False)\n",
"odf_s2 = ol2.query('first < second').sort_values(by='overlap_coeff', ascending=False)\n",
"\n",
"print_overlap_threshold(odf_s2, 0.74)"
]
@@ -513,7 +513,7 @@
"outputs": [],
"source": [
"# Let's combine the Jaccard and Overlap scores\n",
"mdf = jdf.merge(odf, on=['source','destination'])"
"mdf = jdf.merge(odf, on=['first','second'])"
]
},
{
Expand All @@ -532,8 +532,8 @@
"metadata": {},
"outputs": [],
"source": [
"dS = degree.rename(columns={'vertex':'source','degree': 'src_degree'})\n",
"dD = degree.rename(columns={'vertex':'destination','degree': 'dst_degree'})"
"dS = degree.rename(columns={'vertex':'first','degree': 'src_degree'})\n",
"dD = degree.rename(columns={'vertex':'second','degree': 'dst_degree'})"
]
},
{
@@ -542,8 +542,8 @@
"metadata": {},
"outputs": [],
"source": [
"m = mdf.merge(dS, how=\"left\", on='source')\n",
"m = m.merge(dD, how=\"left\", on='destination')"
"m = mdf.merge(dS, how=\"left\", on='first')\n",
"m = m.merge(dD, how=\"left\", on='second')"
]
},
{
@@ -552,7 +552,7 @@
"metadata": {},
"outputs": [],
"source": [
"m.query('source < destination').sort_values(by='jaccard_coeff', ascending=False).head(20)"
"m.query('first < second').sort_values(by='jaccard_coeff', ascending=False).head(20)"
]
},
{
@@ -562,7 +562,7 @@
"outputs": [],
"source": [
"# Now sort on the overlap\n",
"m.query('source < destination').sort_values(by='overlap_coeff', ascending=False).head(20)"
"m.query('first < second').sort_values(by='overlap_coeff', ascending=False).head(20)"
]
},
{
@@ -605,4 +605,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
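
The combined-score pattern this notebook walks through, condensed into one self-contained sketch under the renamed columns (`first_degree`/`second_degree` are illustrative names, not cugraph output):

```python
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 0, 1, 2], "dst": [1, 2, 2, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

jdf = cugraph.jaccard(G)   # columns: first, second, jaccard_coeff
odf = cugraph.overlap(G)   # columns: first, second, overlap_coeff

# Align both scores on the shared pair key, then attach each endpoint's degree.
mdf = jdf.merge(odf, on=["first", "second"])
deg = G.degree()
mdf = mdf.merge(deg.rename(columns={"vertex": "first", "degree": "first_degree"}),
                how="left", on="first")
mdf = mdf.merge(deg.rename(columns={"vertex": "second", "degree": "second_degree"}),
                how="left", on="second")

print(mdf.query("first < second")
         .sort_values(by="jaccard_coeff", ascending=False))
```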
10 changes: 5 additions & 5 deletions notebooks/algorithms/structure/Renumber.ipynb
@@ -273,16 +273,16 @@
"source": [
"jac = cugraph.jaccard(G)\n",
"\n",
"jac = numbering.unrenumber(jac, 'source')\n",
"jac = numbering.unrenumber(jac, 'destination')\n",
"jac = numbering.unrenumber(jac, 'first')\n",
"jac = numbering.unrenumber(jac, 'second')\n",
"\n",
"jac.insert(len(jac.columns),\n",
" \"original_source\",\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['source'].values_host ])\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['first'].values_host ])\n",
"\n",
"jac.insert(len(jac.columns),\n",
" \"original_destination\",\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['destination'].values_host ])\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['second'].values_host ])\n",
"\n",
"jac.to_pandas()\n"
]
@@ -358,4 +358,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
24 changes: 12 additions & 12 deletions python/cugraph/cugraph/dask/link_prediction/jaccard.py
@@ -31,11 +31,11 @@ def convert_to_cudf(cp_arrays):
Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
"""

cupy_source, cupy_destination, cupy_similarity = cp_arrays
cupy_first, cupy_second, cupy_similarity = cp_arrays

df = cudf.DataFrame()
df["source"] = cupy_source
df["destination"] = cupy_destination
df["first"] = cupy_first
df["second"] = cupy_second
df["jaccard_coeff"] = cupy_similarity

return df
@@ -125,14 +125,14 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):
result : dask_cudf.DataFrame
GPU distributed data frame containing 3 dask_cudf.Series
ddf['source']: dask_cudf.Series
The source vertex ID (will be identical to first if specified)
ddf['destination']: dask_cudf.Series
The destination vertex ID (will be identical to second if
specified)
ddf['first']: dask_cudf.Series
The first vertex ID of each pair (will be identical to first if specified).
ddf['second']: dask_cudf.Series
The second vertex ID of each pair (will be identical to second if
specified).
ddf['jaccard_coeff']: dask_cudf.Series
The computed Jaccard coefficient between the source and destination
vertices
The computed jaccard coefficient between the first and the second
vertex ID.
"""

if input_graph.is_directed():
@@ -198,7 +198,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):
wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

if input_graph.renumbered:
ddf = input_graph.unrenumber(ddf, "source")
ddf = input_graph.unrenumber(ddf, "destination")
ddf = input_graph.unrenumber(ddf, "first")
ddf = input_graph.unrenumber(ddf, "second")

return ddf
24 changes: 12 additions & 12 deletions python/cugraph/cugraph/dask/link_prediction/overlap.py
@@ -31,11 +31,11 @@ def convert_to_cudf(cp_arrays):
Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
"""

cupy_source, cupy_destination, cupy_similarity = cp_arrays
cupy_first, cupy_second, cupy_similarity = cp_arrays

df = cudf.DataFrame()
df["source"] = cupy_source
df["destination"] = cupy_destination
df["first"] = cupy_first
df["second"] = cupy_second
df["overlap_coeff"] = cupy_similarity

return df
@@ -103,14 +103,14 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
result : dask_cudf.DataFrame
GPU distributed data frame containing 3 dask_cudf.Series
ddf['source']: dask_cudf.Series
The source vertex ID (will be identical to first if specified)
ddf['destination']: dask_cudf.Series
The destination vertex ID (will be identical to second if
specified)
ddf['first']: dask_cudf.Series
The first vertex ID of each pair (will be identical to first if specified).
ddf['second']: dask_cudf.Series
The second vertex ID of each pair (will be identical to second if
specified).
ddf['overlap_coeff']: dask_cudf.Series
The computed Overlap coefficient between the source and destination
vertices
The computed overlap coefficient between the first and the second
vertex ID.
"""

if input_graph.is_directed():
@@ -176,7 +176,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

if input_graph.renumbered:
ddf = input_graph.unrenumber(ddf, "source")
ddf = input_graph.unrenumber(ddf, "destination")
ddf = input_graph.unrenumber(ddf, "first")
ddf = input_graph.unrenumber(ddf, "second")

return ddf
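
For reference, a rough multi-GPU usage sketch of the two functions above under the renamed schema. The cluster/comms setup follows the usual cugraph MG pattern, and `edges.csv` is a placeholder path, so treat this as a sketch rather than a complete recipe:

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cugraph.dask.comms import comms as Comms
import dask_cudf
import cugraph
from cugraph.dask.link_prediction.jaccard import jaccard
from cugraph.dask.link_prediction.overlap import overlap

client = Client(LocalCUDACluster())
Comms.initialize(p2p=True)

# Placeholder edge list; any src/dst CSV works.
ddf = dask_cudf.read_csv("edges.csv", names=["src", "dst"], header=None)
G = cugraph.Graph()
G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")

jdf = jaccard(G)   # dask_cudf.DataFrame: first, second, jaccard_coeff
odf = overlap(G)   # dask_cudf.DataFrame: first, second, overlap_coeff

# Drop the (y, x) duplicate of each (x, y) pair before inspecting top scores.
top = jdf.query("first < second").compute()
print(top.sort_values(by="jaccard_coeff", ascending=False).head(10))
```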