Shuffle the vertex pair (#3002)
An illegal memory access occurs when running the MG similarity algos at certain scales. This is caused by vertex pairs not being shuffled appropriately.
This PR:
1. Shuffle the vertex pairs based on the edge partitioning
2. Update the vertex pair column names, since vertex pairs are not necessarily edge lists
3. Update the docstrings, tests and notebooks accordingly

closes #3001
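
For intuition, here is a minimal Python sketch of the invariant this fix restores. `gpu_for_pair` is a hypothetical stand-in for cugraph's actual edge-partitioning rule; the only point it makes is that vertex pairs must be placed by the same rule that places edges, so each GPU is only asked about pairs whose adjacency data it holds.

```python
# Sketch only: `gpu_for_pair` is a hypothetical placement rule standing in
# for cugraph's real edge partitioning. The invariant is that vertex pairs
# and edges are placed by the SAME rule.
def gpu_for_pair(u: int, v: int, num_gpus: int) -> int:
    return hash((u, v)) % num_gpus

def shuffle_vertex_pairs(firsts, seconds, num_gpus):
    """Bucket (first, second) pairs by the GPU that owns the matching edge."""
    buckets = [[] for _ in range(num_gpus)]
    for u, v in zip(firsts, seconds):
        buckets[gpu_for_pair(u, v, num_gpus)].append((u, v))
    return buckets
```

Without this step a pair can land on a GPU that holds no adjacency data for it, which is the illegal memory access reported in #3001.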

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #3002
jnke2016 authored Nov 30, 2022
1 parent 60f5e7b commit 8148123
Showing 30 changed files with 294 additions and 267 deletions.
8 changes: 7 additions & 1 deletion cpp/src/c_api/graph_functions.cpp
@@ -71,7 +71,13 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor {
    second_copy.data(), second_->as_type<vertex_t>(), second_->size_, handle_.get_stream());

  if constexpr (multi_gpu) {
    // FIXME: shuffle first_copy/second_copy
    std::tie(first_copy, second_copy, std::ignore) =
      cugraph::detail::shuffle_edgelist_by_gpu_id<vertex_t, weight_t>(
        handle_,
        std::move(first_copy),
        std::move(second_copy),
        std::nullopt);  // vertex pairs should be shuffled based on the edge partitioning, so we
                        // can use this edge shuffling function to shuffle vertex pairs.
  }

  result_ = new cugraph::c_api::cugraph_vertex_pairs_t{
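
Rather than adding a dedicated pair shuffler, the fix reuses the existing edge-list shuffling function and passes `std::nullopt` for the weight column. A rough Python analogue of that design choice (hypothetical names, not the cugraph API):

```python
from typing import List, Optional, Sequence, Tuple

def shuffle_edgelist(srcs: Sequence[int], dsts: Sequence[int],
                     weights: Optional[Sequence[float]],
                     num_gpus: int) -> List[List[Tuple]]:
    """Bucket edges -- or plain vertex pairs, when weights is None -- by GPU."""
    buckets: List[List[Tuple]] = [[] for _ in range(num_gpus)]
    for i, (u, v) in enumerate(zip(srcs, dsts)):
        row = (u, v) if weights is None else (u, v, weights[i])
        buckets[hash((u, v)) % num_gpus].append(row)
    return buckets

# Vertex pairs carry no weight column, so pass None -- the same code path
# the C++ change above takes with std::nullopt.
pair_buckets = shuffle_edgelist([0, 1, 2], [3, 4, 5], None, num_gpus=2)
```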
26 changes: 13 additions & 13 deletions notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb
@@ -59,9 +59,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex i of each pair.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n",
"<br>\n",
"\n",
"__References__ \n",
@@ -102,9 +102,9 @@
"Returns: \n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df['source']: The source vertex id.\n",
" df['destination']: The destination vertex id.\n",
" df['jaccard_coeff']: The weighted jaccard coefficient computed between the source and destination vertex.\n",
" df['first']: The first vertex id of each pair.\n",
" df['second']: The second vertex id of each pair.\n",
" df['jaccard_coeff']: The weighted jaccard coefficient computed between the vertex pairs.\n",
" \n",
"\n",
"__Note:__ For this example we will be using PageRank as the edge weights. Please review the PageRank notebook if you have any questions about running PageRank\n"
@@ -187,8 +187,8 @@
" \n",
" #find the best\n",
" for i in range(len(dm)): \n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['jaccard_coeff'].iloc[i]))\n",
" del jmax\n",
" del dm"
@@ -206,8 +206,8 @@
" filtered = _d.query('jaccard_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['jaccard_coeff'].iloc[i]))"
]
},
@@ -322,7 +322,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The Most similar shoul be 33 and 34.\n",
"The Most similar should be 33 and 34.\n",
"Vertex 33 has 12 neighbors, vertex 34 has 17 neighbors. They share 10 neighbors in common:\n",
"$jaccard = 10 / (10 + (12 -10) + (17-10)) = 10 / 19 = 0.526$"
]
@@ -348,7 +348,7 @@
"# Before printing, let's get rid of the duplicates (x compared to y is the same as y compared to x). We will do that\n",
"# by performing a query. Then let's sort the data by score\n",
"\n",
"jdf_s = jdf.query('source < destination').sort_values(by='jaccard_coeff', ascending=False)\n",
"jdf_s = jdf.query('first < second').sort_values(by='jaccard_coeff', ascending=False)\n",
"\n",
"print_jaccard_threshold(jdf_s, 0.0)"
]
@@ -500,4 +500,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
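
After this change the single-GPU result schema matches the notebook text above: `first`, `second`, and `jaccard_coeff`. A minimal, self-contained sketch (toy edge list; any undirected graph works):

```python
import cudf
import cugraph

# Toy undirected graph.
edges = cudf.DataFrame({"src": [0, 0, 1, 2], "dst": [1, 2, 2, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

jdf = cugraph.jaccard(G)  # columns: first, second, jaccard_coeff

# (x, y) and (y, x) carry the same score, so keep a single ordering.
jdf_s = jdf.query("first < second").sort_values(by="jaccard_coeff",
                                                ascending=False)
print(jdf_s.head())
```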
56 changes: 28 additions & 28 deletions notebooks/algorithms/link_prediction/Overlap-Similarity.ipynb
@@ -59,9 +59,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"overlap_coeff\"]: The overlap coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex id of each pair.\n",
" df[\"overlap_coeff\"]: The overlap coefficient computed between the vertex pairs.\n",
"\n",
"__References__\n",
"- https://en.wikipedia.org/wiki/Overlap_coefficient\n"
@@ -87,9 +87,9 @@
"Returns:\n",
"\n",
" df: cudf.DataFrame with three names columns:\n",
" df[\"source\"]: The source vertex id.\n",
" df[\"destination\"]: The destination vertex id.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the source and destination vertex.\n",
" df[\"first\"]: The first vertex id of each pair.\n",
" df[\"second\"]: The second vertex id of each pair.\n",
" df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n",
"<br>\n",
"\n",
"See the Jaccard notebook for additional information and background"
@@ -174,8 +174,8 @@
" \n",
" #find the best\n",
" for i in range(len(dm)): \n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['jaccard_coeff'].iloc[i]))\n",
" del jmax\n",
" del dm"
@@ -191,11 +191,11 @@
"def print_most_similar_overlap(df):\n",
" \n",
" smax = df['overlap_coeff'].max()\n",
" dm = df.query('overlap_coeff >= @smax and source < destination') \n",
" dm = df.query('overlap_coeff >= @smax and first < second') \n",
" \n",
" for i in range(len(dm)):\n",
" print(\"Vertices \" + str(dm['source'].iloc[i]) + \" and \" + \n",
" str(dm['destination'].iloc[i]) + \" are most similar with score: \" \n",
" print(\"Vertices \" + str(dm['first'].iloc[i]) + \" and \" + \n",
" str(dm['second'].iloc[i]) + \" are most similar with score: \" \n",
" + str(dm['overlap_coeff'].iloc[i]))\n",
" \n",
" del smax\n",
@@ -214,8 +214,8 @@
" filtered = _d.query('jaccard_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['jaccard_coeff'].iloc[i]))"
]
},
@@ -231,9 +231,9 @@
" filtered = _d.query('overlap_coeff > @limit')\n",
" \n",
" for i in range(len(filtered)):\n",
" if filtered['source'].iloc[i] != filtered['destination'].iloc[i] :\n",
" print(\"Vertices \" + str(filtered['source'].iloc[i]) + \" and \" + \n",
" str(filtered['destination'].iloc[i]) + \" are similar with score: \" + \n",
" if filtered['first'].iloc[i] != filtered['second'].iloc[i] :\n",
" print(\"Vertices \" + str(filtered['first'].iloc[i]) + \" and \" + \n",
" str(filtered['second'].iloc[i]) + \" are similar with score: \" + \n",
" str(filtered['overlap_coeff'].iloc[i]))"
]
},
@@ -361,7 +361,7 @@
"# Before printing, let's get rid of the duplicates (x compared to y is the same as y compared to x). We will do that\n",
"# by performing a query. Then let's sort the data by score\n",
"\n",
"jdf_s = jdf.query('source < destination').sort_values(by='jaccard_coeff', ascending=False)\n",
"jdf_s = jdf.query('first < second').sort_values(by='jaccard_coeff', ascending=False)\n",
"\n",
"print_jaccard_threshold(jdf_s, 0.0)"
]
@@ -410,8 +410,8 @@
"outputs": [],
"source": [
"# print all similarities over a threshold, in this case 0.5\n",
"#also, drop duplicates\n",
"odf_s = odf.query('source < destination').sort_values(by='overlap_coeff', ascending=False)\n",
"# also, drop duplicates\n",
"odf_s = odf.query('first < second').sort_values(by='overlap_coeff', ascending=False)\n",
"\n",
"print_overlap_threshold(odf_s, 0.5)"
]
@@ -467,7 +467,7 @@
"source": [
"# print all similarities over a threshold, in this case 0.5\n",
"# also, drop duplicates\n",
"odf_s2 = ol2.query('source < destination').sort_values(by='overlap_coeff', ascending=False)\n",
"odf_s2 = ol2.query('first < second').sort_values(by='overlap_coeff', ascending=False)\n",
"\n",
"print_overlap_threshold(odf_s2, 0.74)"
]
@@ -513,7 +513,7 @@
"outputs": [],
"source": [
"# Let's combine the Jaccard and Overlap scores\n",
"mdf = jdf.merge(odf, on=['source','destination'])"
"mdf = jdf.merge(odf, on=['first','second'])"
]
},
{
Expand All @@ -532,8 +532,8 @@
"metadata": {},
"outputs": [],
"source": [
"dS = degree.rename(columns={'vertex':'source','degree': 'src_degree'})\n",
"dD = degree.rename(columns={'vertex':'destination','degree': 'dst_degree'})"
"dS = degree.rename(columns={'vertex':'first','degree': 'src_degree'})\n",
"dD = degree.rename(columns={'vertex':'second','degree': 'dst_degree'})"
]
},
{
@@ -542,8 +542,8 @@
"metadata": {},
"outputs": [],
"source": [
"m = mdf.merge(dS, how=\"left\", on='source')\n",
"m = m.merge(dD, how=\"left\", on='destination')"
"m = mdf.merge(dS, how=\"left\", on='first')\n",
"m = m.merge(dD, how=\"left\", on='second')"
]
},
{
@@ -552,7 +552,7 @@
"metadata": {},
"outputs": [],
"source": [
"m.query('source < destination').sort_values(by='jaccard_coeff', ascending=False).head(20)"
"m.query('first < second').sort_values(by='jaccard_coeff', ascending=False).head(20)"
]
},
{
@@ -562,7 +562,7 @@
"outputs": [],
"source": [
"# Now sort on the overlap\n",
"m.query('source < destination').sort_values(by='overlap_coeff', ascending=False).head(20)"
"m.query('first < second').sort_values(by='overlap_coeff', ascending=False).head(20)"
]
},
{
@@ -605,4 +605,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
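
The combined-score pattern this notebook walks through, condensed into one self-contained sketch under the renamed columns (`first_degree`/`second_degree` are illustrative names, not cugraph output):

```python
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 0, 1, 2], "dst": [1, 2, 2, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

jdf = cugraph.jaccard(G)   # columns: first, second, jaccard_coeff
odf = cugraph.overlap(G)   # columns: first, second, overlap_coeff

# Align both scores on the shared pair key, then attach each endpoint's degree.
mdf = jdf.merge(odf, on=["first", "second"])
deg = G.degree()
mdf = mdf.merge(deg.rename(columns={"vertex": "first", "degree": "first_degree"}),
                how="left", on="first")
mdf = mdf.merge(deg.rename(columns={"vertex": "second", "degree": "second_degree"}),
                how="left", on="second")

print(mdf.query("first < second")
         .sort_values(by="jaccard_coeff", ascending=False))
```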
10 changes: 5 additions & 5 deletions notebooks/algorithms/structure/Renumber.ipynb
@@ -273,16 +273,16 @@
"source": [
"jac = cugraph.jaccard(G)\n",
"\n",
"jac = numbering.unrenumber(jac, 'source')\n",
"jac = numbering.unrenumber(jac, 'destination')\n",
"jac = numbering.unrenumber(jac, 'first')\n",
"jac = numbering.unrenumber(jac, 'second')\n",
"\n",
"jac.insert(len(jac.columns),\n",
" \"original_source\",\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['source'].values_host ])\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['first'].values_host ])\n",
"\n",
"jac.insert(len(jac.columns),\n",
" \"original_destination\",\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['destination'].values_host ])\n",
" [ socket.inet_ntoa(struct.pack('!L', x)) for x in jac['second'].values_host ])\n",
"\n",
"jac.to_pandas()\n"
]
@@ -358,4 +358,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
24 changes: 12 additions & 12 deletions python/cugraph/cugraph/dask/link_prediction/jaccard.py
@@ -31,11 +31,11 @@ def convert_to_cudf(cp_arrays):
Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
"""

cupy_source, cupy_destination, cupy_similarity = cp_arrays
cupy_first, cupy_second, cupy_similarity = cp_arrays

df = cudf.DataFrame()
df["source"] = cupy_source
df["destination"] = cupy_destination
df["first"] = cupy_first
df["second"] = cupy_second
df["jaccard_coeff"] = cupy_similarity

return df
@@ -125,14 +125,14 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):
result : dask_cudf.DataFrame
GPU distributed data frame containing 3 dask_cudf.Series
ddf['source']: dask_cudf.Series
The source vertex ID (will be identical to first if specified)
ddf['destination']: dask_cudf.Series
The destination vertex ID (will be identical to second if
specified)
ddf['first']: dask_cudf.Series
The first vertex ID of each pair (will be identical to first if specified).
ddf['second']: dask_cudf.Series
The second vertex ID of each pair (will be identical to second if
specified).
ddf['jaccard_coeff']: dask_cudf.Series
The computed Jaccard coefficient between the source and destination
vertices
The computed jaccard coefficient between the first and the second
vertex ID.
"""

if input_graph.is_directed():
@@ -198,7 +198,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False):
wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

if input_graph.renumbered:
ddf = input_graph.unrenumber(ddf, "source")
ddf = input_graph.unrenumber(ddf, "destination")
ddf = input_graph.unrenumber(ddf, "first")
ddf = input_graph.unrenumber(ddf, "second")

return ddf
24 changes: 12 additions & 12 deletions python/cugraph/cugraph/dask/link_prediction/overlap.py
@@ -31,11 +31,11 @@ def convert_to_cudf(cp_arrays):
Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
"""

cupy_source, cupy_destination, cupy_similarity = cp_arrays
cupy_first, cupy_second, cupy_similarity = cp_arrays

df = cudf.DataFrame()
df["source"] = cupy_source
df["destination"] = cupy_destination
df["first"] = cupy_first
df["second"] = cupy_second
df["overlap_coeff"] = cupy_similarity

return df
@@ -103,14 +103,14 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
result : dask_cudf.DataFrame
GPU distributed data frame containing 3 dask_cudf.Series
ddf['source']: dask_cudf.Series
The source vertex ID (will be identical to first if specified)
ddf['destination']: dask_cudf.Series
The destination vertex ID (will be identical to second if
specified)
ddf['first']: dask_cudf.Series
The first vertex ID of each pair (will be identical to first if specified).
ddf['second']: dask_cudf.Series
The second vertex ID of each pair (will be identical to second if
specified).
ddf['overlap_coeff']: dask_cudf.Series
The computed Overlap coefficient between the source and destination
vertices
The computed overlap coefficient between the first and the second
vertex ID.
"""

if input_graph.is_directed():
@@ -176,7 +176,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False):
wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

if input_graph.renumbered:
ddf = input_graph.unrenumber(ddf, "source")
ddf = input_graph.unrenumber(ddf, "destination")
ddf = input_graph.unrenumber(ddf, "first")
ddf = input_graph.unrenumber(ddf, "second")

return ddf
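
For reference, a rough multi-GPU usage sketch of the two functions above under the renamed schema. The cluster/comms setup follows the usual cugraph MG pattern, and `edges.csv` is a placeholder path, so treat this as a sketch rather than a complete recipe:

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cugraph.dask.comms import comms as Comms
import dask_cudf
import cugraph
from cugraph.dask.link_prediction.jaccard import jaccard
from cugraph.dask.link_prediction.overlap import overlap

client = Client(LocalCUDACluster())
Comms.initialize(p2p=True)

# Placeholder edge list; any src/dst CSV works.
ddf = dask_cudf.read_csv("edges.csv", names=["src", "dst"], header=None)
G = cugraph.Graph()
G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")

jdf = jaccard(G)   # dask_cudf.DataFrame: first, second, jaccard_coeff
odf = overlap(G)   # dask_cudf.DataFrame: first, second, overlap_coeff

# Drop the (y, x) duplicate of each (x, y) pair before inspecting top scores.
top = jdf.query("first < second").compute()
print(top.sort_values(by="jaccard_coeff", ascending=False).head(10))
```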