-
Notifications
You must be signed in to change notification settings - Fork 310
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add datasets API to import graph data from configuration/metadata fil…
…es (#2367) Addresses issue [#1348](https://nvidia.slack.com/archives/C01SCT7ELMR). A working version of the datasets API has been added under the "experimental" module of cuGraph. This API comes with the ability to import a handful of built-in datasets to create graphs and edge lists. Each dataset comes with its own metadata file in the format of a YAML file. These files contain general information about the dataset, as well as formatting information about their columns and datatypes. Authors: - Dylan Chima-Sanchez (https://github.com/betochimas) - Ralph Liu (https://github.com/oorliu) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Joseph Nke (https://github.com/jnke2016) URL: #2367
- Loading branch information
1 parent
5788f02
commit 6644813
Showing
17 changed files
with
638 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
include python/versioneer.py | ||
include python/cugraph/_version.py | ||
include python/cugraph/_version.py | ||
include cugraph/experimental/datasets/*.yaml | ||
include cugraph/experimental/datasets/metadata/*.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
include versioneer.py | ||
include cugraph/_version.py | ||
include cugraph/experimental/datasets/*.yaml | ||
include cugraph/experimental/datasets/metadata/*.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
from cugraph.experimental.datasets.dataset import ( | ||
Dataset, | ||
load_all, | ||
set_config, | ||
set_download_dir, | ||
get_download_dir, | ||
default_download_dir | ||
) | ||
from cugraph.experimental.datasets import metadata | ||
from pathlib import Path | ||
|
||
|
||
# Directory, next to this module, that holds one YAML metadata file per
# built-in dataset.
meta_path = Path(__file__).parent.joinpath("metadata")

# One Dataset object per metadata file. Constructing a Dataset only reads the
# YAML file; the datafile itself is loaded on demand by the Dataset methods.
karate = Dataset(meta_path.joinpath("karate.yaml"))
karate_undirected = Dataset(meta_path.joinpath("karate_undirected.yaml"))
karate_asymmetric = Dataset(meta_path.joinpath("karate_asymmetric.yaml"))
dolphins = Dataset(meta_path.joinpath("dolphins.yaml"))
polbooks = Dataset(meta_path.joinpath("polbooks.yaml"))
netscience = Dataset(meta_path.joinpath("netscience.yaml"))
cyber = Dataset(meta_path.joinpath("cyber.yaml"))
small_line = Dataset(meta_path.joinpath("small_line.yaml"))
small_tree = Dataset(meta_path.joinpath("small_tree.yaml"))

# Groupings of the datasets above by datafile size.
# NOTE(review): karate_undirected and karate_asymmetric are not part of any
# grouping below - confirm this is intended.

# Large datasets
LARGE_DATASETS = [
    cyber,
]

# Datasets with fewer than 10,000 lines
MEDIUM_DATASETS = [
    netscience,
    polbooks,
]

# Datasets with fewer than 500 lines
SMALL_DATASETS = [
    karate,
    small_line,
    small_tree,
    dolphins,
]

# Every grouped dataset
ALL_DATASETS = [
    karate,
    dolphins,
    netscience,
    polbooks,
    cyber,
    small_line,
    small_tree,
]
207 changes: 207 additions & 0 deletions
207
python/cugraph/cugraph/experimental/datasets/dataset.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import cugraph | ||
import cudf | ||
import yaml | ||
import os | ||
from pathlib import Path | ||
|
||
|
||
class DefaultDownloadDir:
    """
    Holder for the download directory used by Dataset instances.

    A single instance is meant to be shared by several Dataset objects so
    that changing the directory in one place is seen by all of them.
    """
    def __init__(self):
        self._path = self._resolve_default()

    @staticmethod
    def _resolve_default():
        # The RAPIDS_DATASET_ROOT_DIR environment variable wins when set;
        # otherwise fall back to ".cugraph/datasets" under the user's home
        # directory.
        fallback = Path.home() / ".cugraph/datasets"
        return Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR", fallback))

    @property
    def path(self):
        """
        The current download directory as a Path.

        If the path has been cleared, it is re-resolved from the
        RAPIDS_DATASET_ROOT_DIR environment variable, or from
        ".cugraph/datasets" under the user's home directory when that
        variable is not set, and the result is remembered.
        """
        if self._path is not None:
            return self._path

        self._path = self._resolve_default()
        return self._path

    @path.setter
    def path(self, new):
        # Accept anything Path() accepts (e.g. str or Path).
        self._path = Path(new)

    def clear(self):
        """
        Forget the current path; the next read of `path` re-resolves it.
        """
        self._path = None


# Module-wide instance shared by the objects and helper functions in this
# module.
default_download_dir = DefaultDownloadDir()
|
||
|
||
class Dataset:
    """
    A Dataset Object, used to easily import edgelist data and cuGraph.Graph
    instances.

    Parameters
    ----------
    meta_data_file_name : yaml file
        The metadata file for the specific graph dataset, which includes
        information on the name, type, url link, data loading format, graph
        properties
    """
    def __init__(self, meta_data_file_name):
        with open(meta_data_file_name, 'r') as file:
            self.metadata = yaml.safe_load(file)

        # Shared download-directory holder: updating it elsewhere changes
        # where this Dataset looks for (and stores) its datafile.
        self._dl_path = default_download_dir
        self._edgelist = None
        self._graph = None
        self._path = None

    def __download_csv(self, url):
        """
        Read the CSV at `url` and write it into the download directory as
        <name><file_type>, using the values from the metadata.

        Raises
        ------
        RuntimeError
            If the download directory is not a directory after the attempt
            to create it.
        """
        dl_dir = self._dl_path.path
        dl_dir.mkdir(parents=True, exist_ok=True)

        filename = self.metadata['name'] + self.metadata['file_type']
        if not dl_dir.is_dir():
            # The leading space in the second literal matters: adjacent
            # string literals are concatenated without any separator.
            raise RuntimeError(f"The directory {dl_dir.absolute()}"
                               " does not exist")

        df = cudf.read_csv(url)
        df.to_csv(dl_dir / filename, index=False)

    def get_edgelist(self, fetch=False):
        """
        Return an Edgelist

        The edgelist is read from the datafile in the download directory the
        first time this is called and is cached for later calls.

        Parameters
        ----------
        fetch : Boolean (default=False)
            Automatically fetch for the dataset from the 'url' location within
            the YAML file.

        Raises
        ------
        RuntimeError
            If the datafile is not present and `fetch` is False.
        """
        if self._edgelist is None:
            full_path = self._dl_path.path / (self.metadata['name'] +
                                              self.metadata['file_type'])

            if not full_path.is_file():
                if fetch:
                    self.__download_csv(self.metadata['url'])
                else:
                    raise RuntimeError(f"The datafile {full_path} does not"
                                       " exist. Try get_edgelist(fetch=True)"
                                       " to download the datafile")

            self._edgelist = cudf.read_csv(full_path,
                                           delimiter=self.metadata['delim'],
                                           names=self.metadata['col_names'],
                                           dtype=self.metadata['col_types'])
            # Remember where the data came from so get_path() can report it.
            self._path = full_path

        return self._edgelist

    def get_graph(self, fetch=False):
        """
        Return a Graph object.

        A new Graph is built from the (cached) edgelist on every call, using
        the 'src' and 'dst' columns and the 'is_directed' metadata flag.

        Parameters
        ----------
        fetch : Boolean (default=False)
            Automatically fetch for the dataset from the 'url' location within
            the YAML file.
        """
        if self._edgelist is None:
            self.get_edgelist(fetch)

        self._graph = cugraph.Graph(directed=self.metadata['is_directed'])
        # Only source and destination columns are passed; no edge attribute
        # column is supplied here.
        self._graph.from_cudf_edgelist(self._edgelist, source='src',
                                       destination='dst')

        return self._graph

    def get_path(self):
        """
        Returns the location of the stored dataset file

        Raises
        ------
        RuntimeError
            If the datafile has not been loaded yet, i.e. neither
            get_edgelist nor get_graph has been called successfully.
        """
        if self._path is None:
            raise RuntimeError("Path to datafile has not been set." +
                               " Call get_edgelist or get_graph first")

        return self._path.absolute()
|
||
|
||
def load_all(force=False):
    """
    Looks in `metadata` directory and fetches all datafiles from the URLs
    provided in each YAML file.

    Each datafile is read with cudf.read_csv and written to the current
    default download directory as <name><file_type>.

    Parameters
    ----------
    force : Boolean (default=False)
        Overwrite any existing copies of datafiles.
    """
    download_dir = default_download_dir.path
    download_dir.mkdir(parents=True, exist_ok=True)

    meta_path = Path(__file__).parent.absolute() / "metadata"
    for file in meta_path.iterdir():
        # Only YAML metadata files are of interest; skip anything else that
        # lives in the directory.
        if file.suffix != '.yaml':
            continue

        # iterdir() already yields full paths, so `file` is opened directly.
        with open(file, 'r') as metafile:
            meta = yaml.safe_load(metafile)

        # safe_load returns None for an empty document; without this guard
        # the membership test below would raise TypeError.
        if not isinstance(meta, dict) or 'url' not in meta:
            continue

        filename = meta['name'] + meta['file_type']
        save_to = download_dir / filename
        if force or not save_to.is_file():
            df = cudf.read_csv(meta['url'])
            df.to_csv(save_to, index=False)
|
||
|
||
def set_config(cfgpath):
    """
    Load a custom YAML config file and apply its download directory.

    Parameters
    ----------
    cfgpath : String
        Path to the config file. Its 'download_dir' entry replaces the path
        held by the shared default download directory.
    """
    config_file = Path(cfgpath)
    with open(config_file, 'r') as stream:
        settings = yaml.safe_load(stream)

    default_download_dir.path = Path(settings['download_dir'])
|
||
|
||
def set_download_dir(path):
    """
    Change the directory that datasets are fetched into.

    Parameters
    ----------
    path : String
        Location used to store datafiles. Passing None clears the current
        setting instead of assigning a new one.
    """
    if path is not None:
        default_download_dir.path = path
        return

    default_download_dir.clear()
|
||
|
||
def get_download_dir():
    """
    Return the absolute path of the current default download directory.
    """
    current = default_download_dir.path
    return current.absolute()
5 changes: 5 additions & 0 deletions
5
python/cugraph/cugraph/experimental/datasets/datasets_config.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
fetch: "False" | ||
force: "False" | ||
# path where datasets will be downloaded to and stored | ||
download_dir: "datasets" |
21 changes: 21 additions & 0 deletions
21
python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
name: cyber | ||
file_type: .csv | ||
author: N/A | ||
url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cyber.csv | ||
refs: N/A | ||
col_names: | ||
- idx | ||
- src | ||
- dst | ||
col_types: | ||
- int32 | ||
- str | ||
- str | ||
delim: "," | ||
has_loop: true | ||
is_directed: true | ||
is_multigraph: false | ||
is_symmetric: false | ||
number_of_edges: 54 | ||
number_of_nodes: 314 | ||
number_of_lines: 2546576 |
24 changes: 24 additions & 0 deletions
24
python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: dolphins | ||
file_type: .csv | ||
author: D. Lusseau | ||
url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv | ||
refs: | ||
D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson, | ||
The bottlenose dolphin community of Doubtful Sound features a large proportion of | ||
long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003). | ||
col_names: | ||
- src | ||
- dst | ||
- wgt | ||
col_types: | ||
- int32 | ||
- int32 | ||
- float32 | ||
delim: " " | ||
has_loop: false | ||
is_directed: true | ||
is_multigraph: false | ||
is_symmetric: false | ||
number_of_edges: 318 | ||
number_of_nodes: 62 | ||
number_of_lines: 318 |
21 changes: 21 additions & 0 deletions
21
python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
name: karate-data | ||
file_type: .csv | ||
author: Zachary W. | ||
url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv | ||
refs: | ||
W. W. Zachary, An information flow model for conflict and fission in small groups, | ||
Journal of Anthropological Research 33, 452-473 (1977). | ||
delim: "\t" | ||
col_names: | ||
- src | ||
- dst | ||
col_types: | ||
- int32 | ||
- int32 | ||
has_loop: true | ||
is_directed: true | ||
is_multigraph: false | ||
is_symmetric: true | ||
number_of_edges: 156 | ||
number_of_nodes: 34 | ||
number_of_lines: 156 |
23 changes: 23 additions & 0 deletions
23
python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
name: karate-asymmetric | ||
file_type: .csv | ||
author: Zachary W. | ||
url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv | ||
refs: | ||
W. W. Zachary, An information flow model for conflict and fission in small groups, | ||
Journal of Anthropological Research 33, 452-473 (1977). | ||
delim: "\t" | ||
col_names: | ||
- src | ||
- dst | ||
- wgt | ||
col_types: | ||
- int32 | ||
- int32 | ||
- float32 | ||
has_loop: true | ||
is_directed: false | ||
is_multigraph: false | ||
is_symmetric: false | ||
number_of_edges: 78 | ||
number_of_nodes: 34 | ||
number_of_lines: 78 |
Oops, something went wrong.