From 2e17025c71198ea8b591ad7680c9c6711b8679e5 Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Wed, 14 Sep 2022 12:11:02 -0400 Subject: [PATCH 1/4] new notebook for loading mag240m --- notebooks/gnn/mag240m_pg.ipynb | 405 +++++++++++++++++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 notebooks/gnn/mag240m_pg.ipynb diff --git a/notebooks/gnn/mag240m_pg.ipynb b/notebooks/gnn/mag240m_pg.ipynb new file mode 100644 index 00000000000..92d98a02fbc --- /dev/null +++ b/notebooks/gnn/mag240m_pg.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load the MAG240M Dataset into a Property Graph\n", + "\n", + "The notebook is meant to stress test loading data into a property graph. \n", + "The notebook requires that the data has already been downloaded \n", + "\n", + "\n", + " from ogb.lsc import MAG240MDataset\n", + " dataset = MAG240MDataset(root = base_dir)\n", + " dataset.download()\n", + "\n", + "\n", + "num_gpus = Enter the number of GPUs that you have in the cluster. This will determine the number of records loaded\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature variable\n", + "skip_edges = False\n", + "skip_features = True\n", + "\n", + "load_paper_features = True\n", + "load_paper_labels = True\n", + "load_paper_year = True\n", + "\n", + "load_affiliation_edges = False\n", + "load_writes_edges = True\n", + "load_cites_edges = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import needed libraries. \n", + "# We recommend using the [cugraph_dev](https://github.com/rapidsai/cugraph/tree/branch-22.10/conda/environments) env through conda\n", + "\n", + "from dask.distributed import Client, wait\n", + "from dask_cuda import LocalCUDACluster\n", + "from cugraph.dask.comms import comms as Comms\n", + "import cugraph.dask as dask_cugraph\n", + "import cugraph\n", + "import dask_cudf\n", + "import cudf\n", + "import time\n", + "import numpy as np\n", + "import math" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# size of dataset\n", + "class Dataset:\n", + " def __init__(self):\n", + " self.num_papers = 121751666\n", + " self.num_authors = 122383112\n", + " self.num_institutions = 25721\n", + " self.num_paper_features = 768\n", + " self.num_classes = 153\n", + " self.num_cite_edges = 1297748926\n", + " self.num_write_edges = 386022720\n", + " self.num_affiliated_edges = 44592586\n", + " \n", + " def adjust_by(self, factor):\n", + " self.num_papers = math.floor(self.num_papers * factor)\n", + " self.num_authors = math.floor(self.num_authors * factor)\n", + " self.num_institutions = math.floor(self.num_institutions * factor)\n", + " self.num_paper_features = math.floor(self.num_paper_features * factor)\n", + " self.num_classes = math.floor(self.num_classes * factor)\n", + " self.num_cite_edges = math.floor(self.num_cite_edges * factor)\n", + " self.num_write_edges = math.floor(self.num_write_edges * factor)\n", + " self.num_affiliated_edges = math.floor(self.num_affiliated_edges * factor) \n", + "\n", + "dataset = Dataset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Directories and Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mag_dir = \"/home/brad/Notebooks/dataset/mag240m_kddcup2021/processed\"\n", + "\n", + "# Features\n", + "paper_feature_file = mag_dir + \"/paper/node_feat.npy\"\n", + "paper_label_file = mag_dir + \"/paper/node_label.npy\"\n", + "paper_year_file = mag_dir + \"/paper/node_year.npy\"\n", + "\n", + "# Edges\n", + "auth_institute_file = mag_dir + \"/author___affiliated_with___institution/edge_index.npy\"\n", + "auth_write_file = mag_dir + \"/author___writes___paper/edge_index.npy\"\n", + "auth_cites_file = mag_dir + \"/paper___cites___paper/edge_index.npy\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the GPUs to use\n", + "import os\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", + "\n", + "num_gpus = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# it takes 16 GPUs to fully load the data\n", + "# compute the percent of data to be loaded\n", + "# This will be used later\n", + "percent_data_factor = num_gpus / 200" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# number of features to load?\n", + "# there are 768 features, specify how many to be loaded\n", + "# the code just loadfs feature 0 to N, sequencially\n", + "#num_features = 768\n", + "num_features = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.adjust_by(factor=percent_data_factor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup the Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the DASK Cluster\n", + "cluster = LocalCUDACluster()\n", + "client = Client(cluster)\n", + "Comms.initialize(p2p=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the Property Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cugraph.experimental import MGPropertyGraph\n", + "from cugraph.experimental import PropertyGraph\n", + "\n", + "pG = PropertyGraph()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Paper (Vertex) Features\n", + "The code reads in data in 1,000 record chunks since the data is loaded from a single host" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def data_load_properties(_PG, file_name, num_rec, name=None, chunk=-1, data_range=None, col_names=None):\n", + " \n", + " # are we using SG or MG PG?\n", + " _use_dask = isinstance(_PG, MGPropertyGraph)\n", + " \n", + " _data = np.lib.format.open_memmap(file_name, mode='r') \n", + " \n", + " if chunk == -1:\n", + " chunk = num_rec\n", + " \n", + " _rec_read = 0\n", + " \n", + " while _rec_read < num_rec:\n", + " _start_id = _rec_read\n", + " _end_id = _start_id + chunk\n", + " \n", + " if (_end_id > num_rec):\n", + " _end_id = num_rec\n", + "\n", + " print(f\"reading {name} data from {_start_id} to {_end_id}\")\n", + " \n", + " if data_range is not None:\n", + " _x = _data[_start_id:_end_id, 0:data_range]\n", + " else:\n", + " _x = _data[_start_id:_end_id]\n", + "\n", + " gdf = cudf.DataFrame(_x, columns=col_names)\n", + " gdf['id'] = gdf.index + _start_id\n", + " gdf.columns = gdf.columns.astype(str)\n", + "\n", + " if _use_dask:\n", + " ddf = dask_cudf.from_cudf(gdf, npartitions=num_gpus)\n", + " else:\n", + " ddf = gdf\n", + "\n", + " pG.add_vertex_data(ddf, vertex_col_name='id', type_name=name)\n", + "\n", + " _rec_read = _end_id " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not skip_features:\n", + " \n", + " if load_paper_labels:\n", + " data_load_properties(pG, paper_label_file, num_rec=dataset.num_papers, name='paper_label', col_names=[\"label\"])\n", + " print(f\"PG now contains {pG.get_num_vertices()} \")\n", + " \n", + " if load_paper_year:\n", + " data_load_properties(pG, paper_year_file, num_rec=dataset.num_papers, name='paper_year', col_names=[\"year\"])\n", + " print(f\"PG now contains {pG.get_num_vertices()} \")\n", + " \n", + " if load_paper_features:\n", + " data_load_properties(pG, paper_feature_file, num_rec=dataset.num_papers, name='paper_feature', data_range=num_features)\n", + " print(f\"PG now contains {pG.get_num_vertices()} \")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def data_load_edges(_PG, file_name, num_edges, name=None, chunk=-1, col_names=None):\n", + " \n", + " # are we using SG or MG PG?\n", + " _use_dask = isinstance(_PG, MGPropertyGraph)\n", + " \n", + " _data = np.lib.format.open_memmap(file_name, mode='r') \n", + " \n", + " if chunk == -1:\n", + " chunk = num_edges\n", + " \n", + " _rec_read = 0\n", + " \n", + " while _rec_read < num_edges:\n", + " _start_id = _rec_read\n", + " _end_id = _start_id + chunk\n", + " \n", + " if (_end_id > num_edges):\n", + " _end_id = num_edges\n", + "\n", + " print(f\"reading {name} data from {_start_id} to {_end_id}\")\n", + " \n", + " _x = _data[_start_id:_end_id]\n", + "\n", + " gdf = cudf.DataFrame()\n", + " gdf['src'] = _x[0]\n", + " gdf['dst'] = _x[1]\n", + " gdf.columns = gdf.columns.astype(str)\n", + "\n", + " if _use_dask:\n", + " ddf = dask_cudf.from_cudf(gdf, npartitions=num_gpus)\n", + " else:\n", + " ddf = gdf\n", + "\n", + " pG.add_edge_data(ddf, vertex_col_names=['src', 'dst'], type_name=name)\n", + "\n", + " _rec_read = _end_id " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not skip_edges: \n", + " if load_affiliation_edges:\n", + " data_load_edges(pG, auth_institute_file, num_edges=dataset.num_affiliated_edges, name=\"affiliated\" )\n", + " print(f\"PG now contains {pG.get_num_edges()} \")\n", + "\n", + " if load_writes_edges:\n", + " data_load_edges(pG, auth_write_file, num_edges=dataset.num_write_edges, name=\"writes\" )\n", + " print(f\"PG now contains {pG.get_num_edges()} \")\n", + "\n", + "\n", + " if load_cites_edges:\n", + " data_load_edges(pG, auth_cites_file, num_edges=dataset.num_cite_edges, name=\"cites\" )\n", + " print(f\"PG now contains {pG.get_num_edges()} \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"PG now contains {pG.get_num_edges()} \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_dev", + "language": "python", + "name": "cugraph_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From f2802442c053048ada2c364f8e73e9781754751c Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Wed, 21 Sep 2022 12:33:14 -0400 Subject: [PATCH 2/4] updates --- .../{gnn => stress_tests}/mag240m_pg.ipynb | 79 ++++++++++++------- 1 file changed, 50 insertions(+), 29 deletions(-) rename notebooks/{gnn => stress_tests}/mag240m_pg.ipynb (85%) diff --git a/notebooks/gnn/mag240m_pg.ipynb b/notebooks/stress_tests/mag240m_pg.ipynb similarity index 85% rename from notebooks/gnn/mag240m_pg.ipynb rename to notebooks/stress_tests/mag240m_pg.ipynb index 92d98a02fbc..151a3d5307f 100644 --- a/notebooks/gnn/mag240m_pg.ipynb +++ b/notebooks/stress_tests/mag240m_pg.ipynb @@ -7,16 +7,32 @@ "# Load the MAG240M Dataset into a Property Graph\n", "\n", "The notebook is meant to stress test loading data into a property graph. \n", - "The notebook requires that the data has already been downloaded \n", + "This notebook requires some setup \n", + "\n", + "__num_gpus = Enter the number of GPUs that you have in the cluster. This will determine the number of records loaded__\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# location of where the data is being downloaded/saved to\n", + "base_dir = \"../../datasets\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__The notebook requires that the data has already been downloaded__\n", "\n", "\n", " from ogb.lsc import MAG240MDataset\n", " dataset = MAG240MDataset(root = base_dir)\n", " dataset.download()\n", - "\n", - "\n", - "num_gpus = Enter the number of GPUs that you have in the cluster. This will determine the number of records loaded\n", - "\n" + " " ] }, { @@ -27,15 +43,15 @@ "source": [ "# Feature variable\n", "skip_edges = False\n", - "skip_features = True\n", + "skip_features = False\n", "\n", "load_paper_features = True\n", "load_paper_labels = True\n", "load_paper_year = True\n", "\n", - "load_affiliation_edges = False\n", + "load_affiliation_edges = True\n", "load_writes_edges = True\n", - "load_cites_edges = False" + "load_cites_edges = True" ] }, { @@ -55,8 +71,7 @@ "import dask_cudf\n", "import cudf\n", "import time\n", - "import numpy as np\n", - "import math" + "import numpy as np" ] }, { @@ -85,14 +100,14 @@ " self.num_affiliated_edges = 44592586\n", " \n", " def adjust_by(self, factor):\n", - " self.num_papers = math.floor(self.num_papers * factor)\n", - " self.num_authors = math.floor(self.num_authors * factor)\n", - " self.num_institutions = math.floor(self.num_institutions * factor)\n", - " self.num_paper_features = math.floor(self.num_paper_features * factor)\n", - " self.num_classes = math.floor(self.num_classes * factor)\n", - " self.num_cite_edges = math.floor(self.num_cite_edges * factor)\n", - " self.num_write_edges = math.floor(self.num_write_edges * factor)\n", - " self.num_affiliated_edges = math.floor(self.num_affiliated_edges * factor) \n", + " self.num_papers = int(self.num_papers * factor)\n", + " self.num_authors = int(self.num_authors * factor)\n", + " self.num_institutions = int(self.num_institutions * factor)\n", + " self.num_paper_features = int(self.num_paper_features * factor)\n", + " self.num_classes = int(self.num_classes * factor)\n", + " self.num_cite_edges = int(self.num_cite_edges * factor)\n", + " self.num_write_edges = int(self.num_write_edges * factor)\n", + " self.num_affiliated_edges = int(self.num_affiliated_edges * factor) \n", "\n", "dataset = Dataset()" ] @@ -110,7 +125,8 @@ "metadata": {}, "outputs": [], "source": [ - "mag_dir = \"/home/brad/Notebooks/dataset/mag240m_kddcup2021/processed\"\n", + "# This NEEDS to be set to the location of the downloaded data\n", + "mag_dir = base_dir + \"/mag240m_kddcup2021/processed\"\n", "\n", "# Features\n", "paper_feature_file = mag_dir + \"/paper/node_feat.npy\"\n", @@ -142,6 +158,7 @@ "import os\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", "\n", + "# The number oif GPUs in your system - here we are just testing with 1\n", "num_gpus = 1" ] }, @@ -153,8 +170,12 @@ "source": [ "# it takes 16 GPUs to fully load the data\n", "# compute the percent of data to be loaded\n", + "\n", + "# Note, you can adjust the amount of data loaded by increasing or decreasisng this value\n", + "# use a large number to load a small amount per GPU\n", + "\n", "# This will be used later\n", - "percent_data_factor = num_gpus / 200" + "percent_data_factor = num_gpus / 16" ] }, { @@ -163,11 +184,7 @@ "metadata": {}, "outputs": [], "source": [ - "# number of features to load?\n", - "# there are 768 features, specify how many to be loaded\n", - "# the code just loadfs feature 0 to N, sequencially\n", - "#num_features = 768\n", - "num_features = 10" + "dataset.adjust_by(factor=percent_data_factor)" ] }, { @@ -176,7 +193,13 @@ "metadata": {}, "outputs": [], "source": [ - "dataset.adjust_by(factor=percent_data_factor)" + "# number of features to load?\n", + "# there are 768 features, specify how many to be loaded\n", + "# the code just loadfs feature 0 to N, sequencially\n", + "#num_features = 768\n", + "\n", + "# using just 10 features in this test\n", + "num_features = 10" ] }, { @@ -221,8 +244,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load the Paper (Vertex) Features\n", - "The code reads in data in 1,000 record chunks since the data is loaded from a single host" + "## Load the Paper (Vertex) Features\n" ] }, { @@ -358,7 +380,6 @@ " data_load_edges(pG, auth_write_file, num_edges=dataset.num_write_edges, name=\"writes\" )\n", " print(f\"PG now contains {pG.get_num_edges()} \")\n", "\n", - "\n", " if load_cites_edges:\n", " data_load_edges(pG, auth_cites_file, num_edges=dataset.num_cite_edges, name=\"cites\" )\n", " print(f\"PG now contains {pG.get_num_edges()} \")" From ad6b1930fe9bf9a6a5367f257c1be1dfd3245040 Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Thu, 22 Sep 2022 10:01:36 -0400 Subject: [PATCH 3/4] renamed folder --- notebooks/modules/README.md | 3 +++ notebooks/{stress_tests => modules}/mag240m_pg.ipynb | 0 2 files changed, 3 insertions(+) create mode 100644 notebooks/modules/README.md rename notebooks/{stress_tests => modules}/mag240m_pg.ipynb (100%) diff --git a/notebooks/modules/README.md b/notebooks/modules/README.md new file mode 100644 index 00000000000..29cb47784c7 --- /dev/null +++ b/notebooks/modules/README.md @@ -0,0 +1,3 @@ +# This folde contains partial notebooks + +The code here is meant to be used to build other notebooks \ No newline at end of file diff --git a/notebooks/stress_tests/mag240m_pg.ipynb b/notebooks/modules/mag240m_pg.ipynb similarity index 100% rename from notebooks/stress_tests/mag240m_pg.ipynb rename to notebooks/modules/mag240m_pg.ipynb From f99555d17e7c04a35e800b96b10f94f8fc0b962a Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Mon, 26 Sep 2022 09:41:22 -0400 Subject: [PATCH 4/4] addeed flag to skip CI testing of notebooks in this folder --- notebooks/modules/SKIP_CI_TESTING | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 notebooks/modules/SKIP_CI_TESTING diff --git a/notebooks/modules/SKIP_CI_TESTING b/notebooks/modules/SKIP_CI_TESTING new file mode 100644 index 00000000000..e69de29bb2d