diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py index fedf9d0dbda..5f355eb8cbc 100644 --- a/python/cugraph/cugraph/experimental/datasets/__init__.py +++ b/python/cugraph/cugraph/experimental/datasets/__init__.py @@ -27,6 +27,7 @@ meta_path = Path(__file__).parent / "metadata" karate = Dataset(meta_path / "karate.yaml") +karate_data = Dataset(meta_path / "karate_data.yaml") karate_undirected = Dataset(meta_path / "karate_undirected.yaml") karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml") dolphins = Dataset(meta_path / "dolphins.yaml") @@ -37,15 +38,20 @@ small_tree = Dataset(meta_path / "small_tree.yaml") -# LARGE DATASETS -LARGE_DATASETS = [cyber] +MEDIUM_DATASETS = [polbooks] -# <10,000 lines -MEDIUM_DATASETS = [netscience, polbooks] +SMALL_DATASETS = [karate, dolphins, netscience] -# <500 lines -SMALL_DATASETS = [karate, small_line, small_tree, dolphins] +RLY_SMALL_DATASETS = [small_line, small_tree] -# ALL -ALL_DATASETS = [karate, dolphins, netscience, polbooks, cyber, - small_line, small_tree] \ No newline at end of file +ALL_DATASETS = [karate, dolphins, netscience, polbooks, + small_line, small_tree] + +ALL_DATASETS_WGT = [karate, dolphins, netscience, polbooks, + small_line, small_tree] + +TEST_GROUP = [dolphins, netscience] + +DATASETS_KTRUSS = [polbooks] + +DATASETS_UNDIRECTED = [karate_undirected, small_line, karate_asymmetric] diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py index 3ae904904f6..f5595e1f354 100644 --- a/python/cugraph/cugraph/experimental/datasets/dataset.py +++ b/python/cugraph/cugraph/experimental/datasets/dataset.py @@ -11,11 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License.
-import cugraph import cudf import yaml import os from pathlib import Path +from cugraph.structure.graph_classes import Graph class DefaultDownloadDir: @@ -64,7 +64,6 @@ class Dataset: The metadata file for the specific graph dataset, which includes information on the name, type, url link, data loading format, graph properties - """ def __init__(self, meta_data_file_name): with open(meta_data_file_name, 'r') as file: @@ -118,22 +117,48 @@ def get_edgelist(self, fetch=False): return self._edgelist - def get_graph(self, fetch=False): + def get_graph(self, fetch=False, create_using=Graph, ignore_weights=False): """ Return a Graph object. Parameters ---------- fetch : Boolean (default=False) - Automatically fetch for the dataset from the 'url' location within - the YAML file. + Download the dataset from the 'url' location in its YAML metadata. + + create_using : cugraph.Graph (instance or class), optional + (default=Graph) + Specify the type of Graph to create. Can pass in an instance to + create a Graph instance with specified 'directed' attribute. + + ignore_weights : Boolean (default=False) + Ignores weights in the dataset if True, resulting in an + unweighted Graph. If False (the default), weights from the + dataset -if present- will be applied to the Graph. If the + dataset does not contain weights, the Graph returned will + be unweighted regardless of ignore_weights.
""" if self._edgelist is None: self.get_edgelist(fetch) - self._graph = cugraph.Graph(directed=self.metadata['is_directed']) - self._graph.from_cudf_edgelist(self._edgelist, source='src', - destination='dst') + if create_using is None: + self._graph = Graph() + elif isinstance(create_using, Graph): + attrs = {"directed": create_using.is_directed()} + self._graph = type(create_using)(**attrs) + elif isinstance(create_using, type): + self._graph = create_using() + else: + raise TypeError("create_using must be a cugraph.Graph " + "(or subclass) type or instance, got: " + f"{type(create_using)}") + + if len(self.metadata['col_names']) > 2 and not ignore_weights: + self._graph.from_cudf_edgelist(self._edgelist, source='src', + destination='dst', edge_attr='wgt') + else: + self._graph.from_cudf_edgelist(self._edgelist, source='src', + destination='dst') return self._graph diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml index d86c7b1a241..9b7ac679e96 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml @@ -1,17 +1,19 @@ -name: karate-data +name: karate file_type: .csv author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate.csv refs: W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977).
-delim: "\t" +delim: " " col_names: - src - dst + - wgt col_types: - int32 - int32 + - float32 has_loop: true is_directed: true is_multigraph: false diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml new file mode 100644 index 00000000000..d86c7b1a241 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml @@ -0,0 +1,21 @@ +name: karate-data +file_type: .csv +author: Zachary W. +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv +refs: + W. W. Zachary, An information flow model for conflict and fission in small groups, + Journal of Anthropological Research 33, 452-473 (1977). +delim: "\t" +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: true +number_of_edges: 156 +number_of_nodes: 34 +number_of_lines: 156 diff --git a/python/cugraph/cugraph/tests/test_dataset.py b/python/cugraph/cugraph/tests/test_dataset.py index 093f1382bcb..9d9078af9d1 100644 --- a/python/cugraph/cugraph/tests/test_dataset.py +++ b/python/cugraph/cugraph/tests/test_dataset.py @@ -17,13 +17,18 @@ import os from pathlib import Path from tempfile import NamedTemporaryFile, TemporaryDirectory -from cugraph.experimental.datasets import (ALL_DATASETS) +from cugraph.experimental.datasets import (ALL_DATASETS, ALL_DATASETS_WGT, + SMALL_DATASETS) +from cugraph.structure import Graph # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= +dataset_path = Path(__file__).parents[4] / "datasets" + + # Use this to simulate a fresh API import @pytest.fixture def datasets(): @@ -125,25 +130,19 @@ def test_fetch(dataset, datasets): @pytest.mark.parametrize("dataset", ALL_DATASETS) def
test_get_edgelist(dataset, datasets): - tmpd = TemporaryDirectory() - datasets.set_download_dir(tmpd.name) + datasets.set_download_dir(dataset_path) E = dataset.get_edgelist(fetch=True) assert E is not None - tmpd.cleanup() - @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_get_graph(dataset, datasets): - tmpd = TemporaryDirectory() - datasets.set_download_dir(tmpd.name) + datasets.set_download_dir(dataset_path) G = dataset.get_graph(fetch=True) assert G is not None - tmpd.cleanup() - @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_metadata(dataset): @@ -167,3 +166,27 @@ def test_get_path(dataset, datasets): def test_get_path_raises(dataset): with pytest.raises(RuntimeError): dataset.get_path() + + +@pytest.mark.parametrize("dataset", ALL_DATASETS_WGT) +def test_weights(dataset, datasets): + datasets.set_download_dir(dataset_path) + + G_w = dataset.get_graph(fetch=True) + G = dataset.get_graph(fetch=True, ignore_weights=True) + + assert G_w.is_weighted() + assert not G.is_weighted() + + +@pytest.mark.parametrize("dataset", SMALL_DATASETS) +def test_create_using(dataset, datasets): + datasets.set_download_dir(dataset_path) + + G_d = dataset.get_graph() + G_t = dataset.get_graph(create_using=Graph) + G = dataset.get_graph(create_using=Graph(directed=True)) + + assert not G_d.is_directed() + assert not G_t.is_directed() + assert G.is_directed()