Add datasets API to import graph data from configuration/metadata fil…

…es (#2367) Addresses issue [#1348](https://nvidia.slack.com/archives/C01SCT7ELMR). A working version of the datasets API has been added under the "experimental" module of cuGraph. This API comes with the ability to import a handful of built-in datasets to create graphs and edge lists. Each dataset comes with its own metadata file in the format of a YAML file. These files contain general information about the dataset, as well as formatting information about their columns and datatypes. Authors: - Dylan Chima-Sanchez (https://github.com/betochimas) - Ralph Liu (https://github.com/oorliu) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Joseph Nke (https://github.com/jnke2016) URL: #2367
rapidsai · Jul 25, 2022 · 6644813 · 6644813
1 parent 5788f02
commit 6644813
Show file tree

Hide file tree

Showing 17 changed files with 638 additions and 1 deletion.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,4 @@
 include python/versioneer.py
-include python/cugraph/_version.py
+include python/cugraph/_version.py
+include cugraph/experimental/datasets/*.yaml
+include cugraph/experimental/datasets/metadata/*.yaml
diff --git a/python/cugraph/MANIFEST.in b/python/cugraph/MANIFEST.in
@@ -1,2 +1,4 @@
 include versioneer.py
 include cugraph/_version.py
+include cugraph/experimental/datasets/*.yaml
+include cugraph/experimental/datasets/metadata/*.yaml
diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py
@@ -39,3 +39,5 @@
 find_bicliques = deprecated_warning_wrapper(
     experimental_warning_wrapper(EXPERIMENTAL__find_bicliques)
 )
+
+from cugraph.experimental.datasets.dataset import Dataset
diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from cugraph.experimental.datasets.dataset import (
+    Dataset,
+    load_all,
+    set_config,
+    set_download_dir,
+    get_download_dir,
+    default_download_dir
+)
+from cugraph.experimental.datasets import metadata
+from pathlib import Path
+
+
+# Directory holding the YAML metadata files that sit next to this module
+meta_path = Path(__file__).parent / "metadata"
+
+# One Dataset per metadata file; each constructor call reads its YAML file
+karate = Dataset(meta_path / "karate.yaml")
+karate_undirected = Dataset(meta_path / "karate_undirected.yaml")
+karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml")
+dolphins = Dataset(meta_path / "dolphins.yaml")
+polbooks = Dataset(meta_path / "polbooks.yaml")
+netscience = Dataset(meta_path / "netscience.yaml")
+cyber = Dataset(meta_path / "cyber.yaml")
+small_line = Dataset(meta_path / "small_line.yaml")
+small_tree = Dataset(meta_path / "small_tree.yaml")
+
+
+# LARGE DATASETS
+LARGE_DATASETS = [cyber]
+
+# <10,000 lines
+MEDIUM_DATASETS = [netscience, polbooks]
+
+# <500 lines
+SMALL_DATASETS = [karate, small_line, small_tree, dolphins]
+
+# ALL
+# NOTE(review): karate_undirected and karate_asymmetric are defined above but
+# are not part of any of these lists - confirm that this is intended
+ALL_DATASETS = [karate, dolphins, netscience, polbooks, cyber,
+                small_line, small_tree]
diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cugraph
+import cudf
+import yaml
+import os
+from pathlib import Path
+
+
+class DefaultDownloadDir:
+    """
+    Maintains the path to the download directory used by Dataset instances.
+    Instances of this class are typically shared by several Dataset instances
+    in order to allow for the download directory to be defined and updated by
+    a single object.
+    """
+    def __init__(self):
+        self._path = Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR",
+                                         Path.home() / ".cugraph/datasets"))
+
+    @property
+    def path(self):
+        """
+        If `path` is not set, set it to the environment variable
+        RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the
+        user's home directory.
+        """
+        if self._path is None:
+            self._path = Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR",
+                                             Path.home() /
+                                             ".cugraph/datasets"))
+        return self._path
+
+    @path.setter
+    def path(self, new):
+        self._path = Path(new)
+
+    def clear(self):
+        self._path = None
+
+
+# Module-level instance that every Dataset stores as its `_dl_path` and that
+# load_all(), set_config(), set_download_dir() and get_download_dir() use
+default_download_dir = DefaultDownloadDir()
+
+
+class Dataset:
+    """
+    A Dataset Object, used to easily import edgelist data and cuGraph.Graph
+    instances.
+
+    Parameters
+    ----------
+    meta_data_file_name : yaml file
+        The metadata file for the specific graph dataset, which includes
+        information on the name, type, url link, data loading format, graph
+        properties
+
+    """
+    def __init__(self, meta_data_file_name):
+        with open(meta_data_file_name, 'r') as file:
+            self.metadata = yaml.safe_load(file)
+
+        self._dl_path = default_download_dir
+        self._edgelist = None
+        self._graph = None
+        self._path = None
+
+    def __download_csv(self, url):
+        self._dl_path.path.mkdir(parents=True, exist_ok=True)
+
+        filename = self.metadata['name'] + self.metadata['file_type']
+        if self._dl_path.path.is_dir():
+            df = cudf.read_csv(url)
+            df.to_csv(self._dl_path.path / filename, index=False)
+
+        else:
+            raise RuntimeError(f"The directory {self._dl_path.path.absolute()}"
+                               "does not exist")
+
+    def get_edgelist(self, fetch=False):
+        """
+        Return an Edgelist
+
+        Parameters
+        ----------
+        fetch : Boolean (default=False)
+            Automatically fetch for the dataset from the 'url' location within
+            the YAML file.
+        """
+
+        if self._edgelist is None:
+            full_path = self._dl_path.path / (self.metadata['name'] +
+                                              self.metadata['file_type'])
+
+            if not full_path.is_file():
+                if fetch:
+                    self.__download_csv(self.metadata['url'])
+                else:
+                    raise RuntimeError(f"The datafile {full_path} does not"
+                                       " exist. Try get_edgelist(fetch=True)"
+                                       " to download the datafile")
+
+            self._edgelist = cudf.read_csv(full_path,
+                                           delimiter=self.metadata['delim'],
+                                           names=self.metadata['col_names'],
+                                           dtype=self.metadata['col_types'])
+            self._path = full_path
+
+        return self._edgelist
+
+    def get_graph(self, fetch=False):
+        """
+        Return a Graph object.
+
+        Parameters
+        ----------
+        fetch : Boolean (default=False)
+            Automatically fetch for the dataset from the 'url' location within
+            the YAML file.
+        """
+        if self._edgelist is None:
+            self.get_edgelist(fetch)
+
+        self._graph = cugraph.Graph(directed=self.metadata['is_directed'])
+        self._graph.from_cudf_edgelist(self._edgelist, source='src',
+                                       destination='dst')
+
+        return self._graph
+
+    def get_path(self):
+        """
+        Returns the location of the stored dataset file
+        """
+        if self._path is None:
+            raise RuntimeError("Path to datafile has not been set." +
+                               " Call get_edgelist or get_graph first")
+
+        return self._path.absolute()
+
+
+def load_all(force=False):
+    """
+    Looks in `metadata` directory and fetches all datafiles from the the URLs
+    provided in each YAML file.
+
+    Parameters
+    force : Boolean (default=False)
+        Overwrite any existing copies of datafiles.
+    """
+    default_download_dir.path.mkdir(parents=True, exist_ok=True)
+
+    meta_path = Path(__file__).parent.absolute() / "metadata"
+    for file in meta_path.iterdir():
+        meta = None
+        if file.suffix == '.yaml':
+            with open(meta_path / file, 'r') as metafile:
+                meta = yaml.safe_load(metafile)
+
+            if 'url' in meta:
+                filename = meta['name'] + meta['file_type']
+                save_to = default_download_dir.path / filename
+                if not save_to.is_file() or force:
+                    df = cudf.read_csv(meta['url'])
+                    df.to_csv(save_to, index=False)
+
+
+def set_config(cfgpath):
+    """
+    Read in a custom config file.
+
+    Parameters
+    ----------
+    cfgfile : String
+        Read the custom config file given its path, and override the default
+    """
+    with open(Path(cfgpath), 'r') as file:
+        cfg = yaml.safe_load(file)
+        default_download_dir.path = Path(cfg['download_dir'])
+
+
+def set_download_dir(path):
+    """
+    Set the download directory for fetching datasets
+
+    Parameters
+    ----------
+    path : String
+        Location used to store datafiles
+    """
+    if path is None:
+        default_download_dir.clear()
+    else:
+        default_download_dir.path = path
+
+
+def get_download_dir():
+    return default_download_dir.path.absolute()
diff --git a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml b/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml
@@ -0,0 +1,5 @@
+---
+fetch: "False"
+force: "False"
+# path where datasets will be downloaded to and stored
+download_dir: "datasets"
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml
@@ -0,0 +1,21 @@
+name: cyber
+file_type: .csv
+author: N/A
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cyber.csv
+refs: N/A
+col_names:
+  - idx
+  - src
+  - dst
+col_types:
+  - int32
+  - str
+  - str
+delim: ","
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 54
+number_of_nodes: 314
+number_of_lines: 2546576
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml
@@ -0,0 +1,24 @@
+name: dolphins
+file_type: .csv
+author: D. Lusseau
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv
+refs:
+  D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson,
+  The bottlenose dolphin community of Doubtful Sound features a large proportion of
+  long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003).
+col_names:
+  - src
+  - dst
+  - wgt
+col_types:
+  - int32
+  - int32
+  - float32
+delim: " "
+has_loop: false
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 318
+number_of_nodes: 62
+number_of_lines: 318
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml
@@ -0,0 +1,21 @@
+name: karate-data
+file_type: .csv
+author: Zachary W.
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv
+refs:
+  W. W. Zachary, An information flow model for conflict and fission in small groups,
+  Journal of Anthropological Research 33, 452-473 (1977).
+delim: "\t"
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 156
+number_of_nodes: 34
+number_of_lines: 156
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml
@@ -0,0 +1,23 @@
+name: karate-asymmetric
+file_type: .csv
+author: Zachary W.
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv
+refs:
+  W. W. Zachary, An information flow model for conflict and fission in small groups,
+  Journal of Anthropological Research 33, 452-473 (1977).
+delim: "\t"
+col_names:
+  - src
+  - dst
+  - wgt
+col_types:
+  - int32
+  - int32
+  - float32
+has_loop: true
+is_directed: false
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 78
+number_of_nodes: 34
+number_of_lines: 78