diff --git a/CHANGELOG.md b/CHANGELOG.md index 57b637f9f90..f7b380814d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - PR #1090 Add experimental Leiden function - PR #1077 Updated/added copyright notices, added copyright CI check from cuml - PR #1100 Add support for new build process (Project Flash) +- PR #1093 New benchmarking notebook ## Improvements - PR #898 Add Edge Betweenness Centrality, and endpoints to BC diff --git a/notebooks/cugraph_benchmarks/release.ipynb b/notebooks/cugraph_benchmarks/release.ipynb new file mode 100644 index 00000000000..ff5ed5abf9f --- /dev/null +++ b/notebooks/cugraph_benchmarks/release.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Release Benchmarking\n", + "\n", + "With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. \n", + "This notebook, starting with release 0.15, runs all the various algorithms to computes the performance gain. \n", + "\n", + "### Algorithms\n", + "| Algorithm | Graph | DiGraph |\n", + "| ------------------------| -------- | ----------- |\n", + "| BFS | X | |\n", + "| SSSP | X | |\n", + "| PageRank | | X |\n", + "| WCC | | X |\n", + "| Betweenness Centrality | X | |\n", + "| Louvain | X | |\n", + "| Triangle Counting | X | |\n", + "\n", + "### Test Data\n", + "\n", + "| File Name | Num of Vertices | Num of Edges |\n", + "| ---------------------- | --------------: | -----------: |\n", + "| preferentialAttachment | 100,000 | 999,970 |\n", + "| dblp-2010 | 326,186 | 1,615,400 |\n", + "| coPapersCiteseer | 434,102 | 32,073,440 |\n", + "| as-Skitter | 1,696,415 | 22,190,596 |\n", + "\n", + "\n", + "Notebook Credits\n", + "\n", + " Original Authors: Bradley Rees\n", + " Last Edit: 08/17/2020\n", + " \n", + "RAPIDS Versions: 0.15\n", + "\n", + "Test Hardware\n", + " GV100 32G, CUDA 10.2\n", + " Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n", + " 32GB system memory\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing \n", + "What is not timed: Reading the data

\n", + "What is timmed: (1) creating a Graph, (2) running the algorithm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# system and other\n", + "import gc\n", + "import os\n", + "import time\n", + "import numpy as np\n", + "\n", + "# rapids\n", + "import cugraph\n", + "import cudf\n", + "\n", + "# NetworkX libraries\n", + "import networkx as nx\n", + "\n", + "# MTX file reader\n", + "from scipy.io import mmread" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try: \n", + " import community\n", + "except ModuleNotFoundError:\n", + " os.system('pip install python-louvain')\n", + " import community" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try: \n", + " import matplotlib\n", + "except ModuleNotFoundError:\n", + " os.system('pip install matplotlib')\n", + "\n", + "import matplotlib.pyplot as plt; plt.rcdefaults()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define the test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test File\n", + "data = {\n", + " 'preferentialAttachment' : './data/preferentialAttachment.mtx',\n", + " 'dblp' : './data/dblp-2010.mtx',\n", + " 'coPapersCiteseer' : './data/coPapersCiteseer.mtx',\n", + " 'as-Skitter' : './data/as-Skitter.mtx'\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read data\n", + "The data is read in once and used for both cuGraph and NetworkX." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Data reader - the file format is MTX, so we will use the reader from SciPy\n", + "def read_data(datafile):\n", + " print('Reading ' + str(datafile) + '...')\n", + " M = mmread(datafile).asfptype()\n", + "\n", + " _gdf = cudf.DataFrame()\n", + " _gdf['src'] = M.row\n", + " _gdf['dst'] = M.col\n", + " \n", + " return _gdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Graph functions\n", + "There are two types of graphs created:\n", + "Directed Graphs - calls to create_xx_digraph\n", + "Undirected Graphs - calls to create_xx_ugraph <- fully syemmeterized " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NetworkX\n", + "def create_nx_digraph(_df):\n", + " _gnx = nx.from_pandas_edgelist(_df, source='src', target='dst', edge_attr=None, create_using=nx.DiGraph)\n", + " return _gnx\n", + "\n", + "def create_nx_ugraph(_df):\n", + " _gnx = nx.from_pandas_edgelist(_df, source='src', target='dst', edge_attr=None, create_using=nx.Graph)\n", + " return _gnx\n", + "\n", + "\n", + "# cuGraph\n", + "def create_cu_digraph(_df):\n", + " _g = cugraph.DiGraph()\n", + " _g.from_cudf_edgelist(_df, source='src', destination='dst', renumber=False)\n", + " return _g\n", + "\n", + "def create_cu_ugraph(_df):\n", + " _g = cugraph.Graph()\n", + " _g.from_cudf_edgelist(_df, source='src', destination='dst', renumber=False)\n", + " return _g" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_bfs(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_ugraph(_df)\n", + " _ = nx.bfs_edges(_G, 1)\n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_bfs(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_ugraph(_df)\n", + " _ = cugraph.bfs(_G, 1)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SSSP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_sssp(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_ugraph(_df)\n", + " _ = nx.shortest_path(_G, 1)\n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_sssp(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_ugraph(_df) \n", + " _ = cugraph.sssp(_G, 1)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PageRank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_pagerank(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_digraph(_df)\n", + " _ = nx.pagerank(_G)\n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_pagerank(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_graph(_df)\n", + " _ = cugraph.pagerank(_G)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WCC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_wcc(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_digraph(_df)\n", + " _ = nx.weakly_connected_components(_G)\n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_wcc(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_graph(_df) \n", + " _ = cugraph.weakly_connected_components(_G)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Betweenness Centrality (vertex)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_bc(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_ugraph(_df)\n", + " _ = nx.betweenness_centrality(_G, k=100)\n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_bc(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_ugraph(_df)\n", + " _ = cugraph.betweenness_centrality(_G, k=100)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Louvain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_louvain(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_ugraph(_df)\n", + " parts = community.best_partition(_G)\n", + " \n", + " # Calculating modularity scores for comparison \n", + " _ = community.modularity(parts, _G) \n", + " \n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_louvain(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_ugraph(_df)\n", + " _,_ = cugraph.louvain(_G)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Triangle Counting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nx_tc(_df):\n", + " t1 = time.time()\n", + " _G = create_nx_ugraph(_df)\n", + " nx_count = nx.triangles(_G)\n", + " \n", + " # To get the number of triangles, we would need to loop through the array and add up each count\n", + " count = 0\n", + " for key, value in nx_count.items():\n", + " count = count + value \n", + " \n", + " t2 = time.time() - t1\n", + " return t2\n", + "\n", + "def cu_tc(_df):\n", + " t1 = time.time()\n", + " _G = create_cu_ugraph(_df)\n", + " _ = cugraph.triangles(_G)\n", + " t2 = time.time() - t1\n", + " return t2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmark Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# number of datasets\n", + "num_datasets = len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# do a simple pass just to get all the libraries initiallized\n", + "# This cell might not be needed\n", + "v = './data/preferentialAttachment.mtx'\n", + "gdf = read_data(v)\n", + "print(f\"\\tGDF Size {len(gdf)}\")\n", + "\n", + "g = create_cu_ugraph(gdf)\n", + "\n", + "print(f\"\\tcugraph Size {g.number_of_edges()}\")\n", + "print(f\"\\tcugraph Order {g.number_of_vertices()}\")\n", + "\n", + "# clean up what we just created\n", + "del gdf\n", + "del g\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# arrays to capture performance gains\n", + "names = []\n", + "\n", + "# Two dimension data\n", + "time_algo_cu = [] # will be two dimensional\n", + "time_algo_nx = [] # will be two dimensional\n", + "perf = []\n", + "\n", + "\n", + "\n", + "i = 0\n", + "for k,v in data.items():\n", + " time_algo_cu.append([])\n", + " time_algo_nx.append([])\n", + " perf.append([])\n", + " \n", + " # Saved the file Name\n", + " names.append(k)\n", + "\n", + " # read data\n", + " gdf = read_data(v)\n", + " pdf = gdf.to_pandas()\n", + " print(f\"\\tdata in gdf {len(gdf)} and data in pandas {len(pdf)}\")\n", + "\n", + " # BFS\n", + " print(\"\\tBFS\")\n", + " tx = nx_bfs(pdf)\n", + " tc = cu_bfs(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + " \n", + " # SSSP\n", + " print(\"\\tSSSP\")\n", + " tx = nx_sssp(pdf)\n", + " tc = cu_sssp(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " # PageRank\n", + " print(\"\\tPageRank\") \n", + " tx = nx_pagerank(pdf)\n", + " tc = cu_pagerank(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " # WCC\n", + " print(\"\\tWCC\")\n", + " tx = nx_wcc(pdf)\n", + " tc = cu_wcc(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " # BC\n", + " print(\"\\tBC\")\n", + " tx = nx_bc(pdf)\n", + " tc = cu_bc(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " # Louvain\n", + " print(\"\\tLouvain\")\n", + " tx = nx_louvain(pdf)\n", + " tc = cu_louvain(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " # TC\n", + " print(\"\\tTC\")\n", + " tx = nx_tc(pdf)\n", + " tc = cu_tc(gdf)\n", + "\n", + " time_algo_nx[i].append(tx)\n", + " time_algo_cu[i].append(tc)\n", + " perf[i].append(tx/tc)\n", + " gc.collect()\n", + "\n", + " i = i + 1\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Print results\n", + "for i in range(num_datasets):\n", + " print(f\"{names[i]}\")\n", + " print(f\"{perf[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___\n", + "Copyright (c) 2020, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n", + "___" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_dev", + "language": "python", + "name": "cugraph_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}