Adds new dataset for benchmarking in the 100k node 1M edge range, adds additional k-values for BC benchmarks #4726

Merged
50 changes: 37 additions & 13 deletions benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -37,6 +37,40 @@
iterations = 1
warmup_rounds = 1

# FIXME: Add this to cugraph.datasets. This is done here so these benchmarks
# can be run without requiring an updated cugraph install. This temporarily
# adds a dataset based on an Amazon product co-purchasing network.
amazon0302_metadata = """
name: amazon0302
description:
  The network was collected by crawling the Amazon website. It is based on the Customers Who Bought This Item Also Bought feature of Amazon: if a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected on March 2, 2003.
author: J. Leskovec, L. Adamic and B. Huberman
refs: J. Leskovec, L. Adamic and B. Huberman. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
delim: "\t"
header: 3
col_names:
- FromNodeId
- ToNodeId
col_types:
- int32
- int32
has_loop: false
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 1234877
number_of_nodes: 262111
url: https://snap.stanford.edu/data/amazon0302.txt.gz
"""
amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"
if not amazon0302_metadata_file_name.exists():
    amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
    with open(amazon0302_metadata_file_name, "w") as f:
        f.write(amazon0302_metadata)

amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
amazon0302_dataset.metadata["file_type"] = ".gz"
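
# Illustrative sketch (assumption, not part of the original diff) of how the
# ad-hoc registration above can be exercised; it uses only Dataset calls that
# already appear elsewhere in this diff.
def _check_amazon0302_registration():
    edgelist = amazon0302_dataset.get_edgelist(download=True, reader="pandas")
    # The metadata block above records 262111 nodes and 1234877 edges.
    print(len(edgelist), amazon0302_dataset.metadata["number_of_edges"])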

dataset_param_values = [
    # name: karate, nodes: 34, edges: 156
    pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
@@ -46,6 +80,8 @@
    pytest.param(
        datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
    ),
    # name: amazon0302, nodes: 262111, edges: 1234877
    pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
    # name: cit-Patents, nodes: 3774768, edges: 16518948
    pytest.param(
        datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
@@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj):
"""
create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
names = dataset_obj.metadata["col_names"]
dtypes = dataset_obj.metadata["col_types"]
if isinstance(dataset_obj.metadata["header"], int):
header = dataset_obj.metadata["header"]
else:
header = None

pandas_edgelist = pd.read_csv(
dataset_obj.get_path(),
delimiter=dataset_obj.metadata["delim"],
names=names,
dtype=dict(zip(names, dtypes)),
header=header,
)
pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
G = nx.from_pandas_edgelist(
pandas_edgelist, source=names[0], target=names[1], create_using=create_using
)
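The hunk above replaces the manual pd.read_csv call with the Dataset.get_edgelist API. For context, a minimal usage sketch, assuming nx_graph_from_dataset returns the constructed graph (its return statement falls outside the visible hunk):

# Hypothetical call into the simplified loader using the dataset registered above.
G = nx_graph_from_dataset(amazon0302_dataset)
print(G.number_of_nodes(), G.number_of_edges())  # roughly 262111 nodes, 1234877 edges
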
35 changes: 0 additions & 35 deletions benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py

This file was deleted.
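
The deleted helper's contents are not shown in this view. Judging from how run-main-benchmarks.sh used to invoke it (python get_graph_bench_dataset.py $dataset, see the hunk below), it was a small download-by-name wrapper around cugraph.datasets. A rough reconstruction under that assumption (the original may have differed):

# Rough reconstruction (assumption) of the deleted download-by-name helper.
# With bench_algos.py now calling get_edgelist(download=True), this
# pre-download step is no longer needed.
import sys

from cugraph import datasets

if __name__ == "__main__":
    # Shell-side names use hyphens (e.g. "cit-patents"); the attributes on
    # cugraph.datasets use underscores, so normalize before the lookup.
    dataset = getattr(datasets, sys.argv[1].replace("-", "_"))
    # Reading the edgelist forces the download into the default location.
    dataset.get_edgelist(download=True)
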

3 changes: 1 addition & 2 deletions benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
@@ -30,6 +30,7 @@ algos="
datasets="
netscience
email_Eu_core
amazon0302
cit-patents
hollywood
soc-livejournal
@@ -58,8 +59,6 @@ fi

for algo in $algos; do
    for dataset in $datasets; do
        # this script can be used to download benchmarking datasets by name via cugraph.datasets
        python get_graph_bench_dataset.py $dataset
        for backend in $backends; do
            name="${backend}__${algo}__${dataset}"
            echo "Running: $backend, $dataset, bench_$algo"