From 8daec65e8afb88a7a2b902bc23eaa3517da8d00b Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Sat, 7 May 2022 08:39:34 +0800 Subject: [PATCH 1/9] adding wiki linkx dataset --- torch_geometric/data/download.py | 17 +++++-- torch_geometric/datasets/linkx_dataset.py | 57 ++++++++++++++++++----- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/torch_geometric/data/download.py b/torch_geometric/data/download.py index 6a4f524e54c5..2f7a783b709d 100644 --- a/torch_geometric/data/download.py +++ b/torch_geometric/data/download.py @@ -2,11 +2,13 @@ import ssl import sys import urllib +from typing import Optional from .makedirs import makedirs -def download_url(url: str, folder: str, log: bool = True): +def download_url(url: str, folder: str, log: bool = True, + filename: Optional[str] = None): r"""Downloads the content of an URL to a specific folder. Args: @@ -16,8 +18,10 @@ def download_url(url: str, folder: str, log: bool = True): console. (default: :obj:`True`) """ - filename = url.rpartition('/')[2] - filename = filename if filename[0] == '?' else filename.split('?')[0] + if filename is None: + filename = url.rpartition('/')[2] + filename = filename if filename[0] == '?' else filename.split('?')[0] + path = osp.join(folder, filename) if osp.exists(path): # pragma: no cover @@ -34,6 +38,11 @@ def download_url(url: str, folder: str, log: bool = True): data = urllib.request.urlopen(url, context=context) with open(path, 'wb') as f: - f.write(data.read()) + # workaround for https://bugs.python.org/issue42853 + while True: + chunk = data.read(6 * 1024) + if not chunk: + break + f.write(chunk) return path diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 84ae20f477ca..5fec0cd7d39a 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -32,23 +32,44 @@ class LINKXDataset(InMemoryDataset): being saved to disk. (default: :obj:`None`) """ - url = 'https://github.com/CUAI/Non-Homophily-Large-Scale/raw/master/data' + ghurl = 'https://github.com/CUAI/Non-Homophily-Large-Scale/raw/master/data' + gdriveurl = 'https://drive.google.com/uc?confirm=t&' facebook_datasets = [ 'penn94', 'reed98', 'amherst41', 'cornell5', 'johnshopkins55' ] datasets = { - 'penn94': f'{url}/facebook100/Penn94.mat', - 'reed98': f'{url}/facebook100/Reed98.mat', - 'amherst41': f'{url}/facebook100/Amherst41.mat', - 'cornell5': f'{url}/facebook100/Cornell5.mat', - 'johnshopkins55': f'{url}/facebook100/Johns%20Hopkins55.mat', - 'genius': f'{url}/genius.mat' + 'penn94': { + 'data.mat': f'{ghurl}/facebook100/Penn94.mat' + }, + 'reed98': { + 'data.mat': f'{ghurl}/facebook100/Reed98.mat' + }, + 'amherst41': { + 'data.mat': f'{ghurl}/facebook100/Amherst41.mat', + }, + 'cornell5': { + 'data.mat': f'{ghurl}/facebook100/Cornell5.mat' + }, + 'johnshopkins55': { + 'data.mat': f'{ghurl}/facebook100/Johns%20Hopkins55.mat' + }, + 'genius': { + 'data.mat': f'{ghurl}/genius.mat' + }, + 'wiki': { + 'wiki_views2M.pt': + f'{gdriveurl}id=1p5DlVHrnFgYm3VsNIzahSsvCD424AyvP', + 'wiki_edges2M.pt': + f'{gdriveurl}id=14X7FlkjrlUgmnsYtPwdh-gGuFla4yb5u', + 'wiki_features2M.pt': + f'{gdriveurl}id=1ySNspxbK-snNoAZM7oxiWGvOnTRdSyEK' + } } splits = { - 'penn94': f'{url}/splits/fb100-Penn94-splits.npy', + 'penn94': f'{ghurl}/splits/fb100-Penn94-splits.npy', } def __init__(self, root: str, name: str, @@ -69,7 +90,7 @@ def processed_dir(self) -> str: @property def raw_file_names(self) -> List[str]: - names = [self.datasets[self.name].split('/')[-1]] + names = list(self.datasets[self.name].keys()) if self.name in self.splits: names += [self.splits[self.name].split('/')[-1]] return names @@ -79,10 +100,21 @@ def processed_file_names(self) -> str: return 'data.pt' def download(self): - download_url(self.datasets[self.name], self.raw_dir) + for filename, path in self.datasets[self.name].items(): + download_url(path, self.raw_dir, filename=filename) if self.name in self.splits: download_url(self.splits[self.name], self.raw_dir) + def _process_wiki(self): + + paths = {x.split("/")[-1]: x for x in self.raw_paths} + print(paths) + x = torch.load(paths['wiki_features2M.pt']) + edge_index = torch.load(paths['wiki_edges2M.pt']).T + y = torch.load(paths['wiki_views2M.pt']) + + return Data(x=x, edge_index=edge_index, y=y) + def _process_facebook(self): from scipy.io import loadmat @@ -134,8 +166,11 @@ def process(self): data = self._process_facebook() elif self.name == 'genius': data = self._process_genius() + elif self.name == 'wiki': + data = self._process_wiki() else: - raise NotImplementedError + raise NotImplementedError( + f"chosen dataset '{self.name}' is not implemented") if self.pre_transform is not None: data = self.pre_transform(data) From 9c14e46e2a5f5a02180fc9f51a1028c50281939a Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Sat, 7 May 2022 08:40:04 +0800 Subject: [PATCH 2/9] adding wiki linkx dataset --- torch_geometric/datasets/linkx_dataset.py | 25 ++++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 5fec0cd7d39a..53aef78cdc6f 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -32,8 +32,9 @@ class LINKXDataset(InMemoryDataset): being saved to disk. (default: :obj:`None`) """ - ghurl = 'https://github.com/CUAI/Non-Homophily-Large-Scale/raw/master/data' - gdriveurl = 'https://drive.google.com/uc?confirm=t&' + github_url = 'https://github.com/CUAI/Non-Homophily-Large-Scale/' \ + 'raw/master/data' + gdrive_url = 'https://drive.google.com/uc?confirm=t&' facebook_datasets = [ 'penn94', 'reed98', 'amherst41', 'cornell5', 'johnshopkins55' @@ -41,35 +42,35 @@ class LINKXDataset(InMemoryDataset): datasets = { 'penn94': { - 'data.mat': f'{ghurl}/facebook100/Penn94.mat' + 'data.mat': f'{github_url}/facebook100/Penn94.mat' }, 'reed98': { - 'data.mat': f'{ghurl}/facebook100/Reed98.mat' + 'data.mat': f'{github_url}/facebook100/Reed98.mat' }, 'amherst41': { - 'data.mat': f'{ghurl}/facebook100/Amherst41.mat', + 'data.mat': f'{github_url}/facebook100/Amherst41.mat', }, 'cornell5': { - 'data.mat': f'{ghurl}/facebook100/Cornell5.mat' + 'data.mat': f'{github_url}/facebook100/Cornell5.mat' }, 'johnshopkins55': { - 'data.mat': f'{ghurl}/facebook100/Johns%20Hopkins55.mat' + 'data.mat': f'{github_url}/facebook100/Johns%20Hopkins55.mat' }, 'genius': { - 'data.mat': f'{ghurl}/genius.mat' + 'data.mat': f'{github_url}/genius.mat' }, 'wiki': { 'wiki_views2M.pt': - f'{gdriveurl}id=1p5DlVHrnFgYm3VsNIzahSsvCD424AyvP', + f'{gdrive_url}id=1p5DlVHrnFgYm3VsNIzahSsvCD424AyvP', 'wiki_edges2M.pt': - f'{gdriveurl}id=14X7FlkjrlUgmnsYtPwdh-gGuFla4yb5u', + f'{gdrive_url}id=14X7FlkjrlUgmnsYtPwdh-gGuFla4yb5u', 'wiki_features2M.pt': - f'{gdriveurl}id=1ySNspxbK-snNoAZM7oxiWGvOnTRdSyEK' + f'{gdrive_url}id=1ySNspxbK-snNoAZM7oxiWGvOnTRdSyEK' } } splits = { - 'penn94': f'{ghurl}/splits/fb100-Penn94-splits.npy', + 'penn94': f'{github_url}/splits/fb100-Penn94-splits.npy', } def __init__(self, root: str, name: str, From 922858da8dae7e1752400dd67460851618568a74 Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Sat, 7 May 2022 08:48:29 +0800 Subject: [PATCH 3/9] add changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79d8a8f17bbb..aa11b32c5ffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [2.0.5] - 2022-MM-DD ### Added -- Added the `Genius` datasets to `nn.datasets.LINKXDataset` ([#4570](https://github.com/pyg-team/pytorch_geometric/pull/4570)) +- Added the `Genius` and `Wiki` datasets to `nn.datasets.LINKXDataset` ([#4570](https://github.com/pyg-team/pytorch_geometric/pull/4570), [#4600](https://github.com/pyg-team/pytorch_geometric/pull/4600)) - Added `nn.glob.GlobalPooling` module with support for multiple aggregations ([#4582](https://github.com/pyg-team/pytorch_geometric/pull/4582)) - Added support for graph-level outputs in `to_hetero` ([#4582](https://github.com/pyg-team/pytorch_geometric/pull/4582)) - Added `CHANGELOG.md` ([#4581](https://github.com/pyg-team/pytorch_geometric/pull/4581)) From 74a13f83bf28883ba8c5463ba32055748f87892e Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Sat, 7 May 2022 09:07:57 +0800 Subject: [PATCH 4/9] increase chunk size --- torch_geometric/data/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/data/download.py b/torch_geometric/data/download.py index 2f7a783b709d..256610bab8a9 100644 --- a/torch_geometric/data/download.py +++ b/torch_geometric/data/download.py @@ -40,7 +40,7 @@ def download_url(url: str, folder: str, log: bool = True, with open(path, 'wb') as f: # workaround for https://bugs.python.org/issue42853 while True: - chunk = data.read(6 * 1024) + chunk = data.read(10 * 1024 * 1024) if not chunk: break f.write(chunk) From b3c414a063c0a47addef1e51b55d6b23faedfb21 Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Mon, 9 May 2022 18:29:24 +0800 Subject: [PATCH 5/9] Update torch_geometric/datasets/linkx_dataset.py Co-authored-by: Matthias Fey --- torch_geometric/datasets/linkx_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 53aef78cdc6f..0e80ab225022 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -111,7 +111,7 @@ def _process_wiki(self): paths = {x.split("/")[-1]: x for x in self.raw_paths} print(paths) x = torch.load(paths['wiki_features2M.pt']) - edge_index = torch.load(paths['wiki_edges2M.pt']).T + edge_index = torch.load(paths['wiki_edges2M.pt']).t().contiguous() y = torch.load(paths['wiki_views2M.pt']) return Data(x=x, edge_index=edge_index, y=y) From 5695c3c2bcb3deed6c63327386a9745cf20f1a2c Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Mon, 9 May 2022 18:29:37 +0800 Subject: [PATCH 6/9] Update torch_geometric/datasets/linkx_dataset.py Co-authored-by: Matthias Fey --- torch_geometric/datasets/linkx_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 0e80ab225022..ac1db6c6ebfd 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -108,7 +108,7 @@ def download(self): def _process_wiki(self): - paths = {x.split("/")[-1]: x for x in self.raw_paths} + paths = {x.split('/')[-1]: x for x in self.raw_paths} print(paths) x = torch.load(paths['wiki_features2M.pt']) edge_index = torch.load(paths['wiki_edges2M.pt']).t().contiguous() From f434e0e76e64673313426ec759cc8f0fa1109c77 Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Mon, 9 May 2022 18:29:44 +0800 Subject: [PATCH 7/9] Update torch_geometric/datasets/linkx_dataset.py Co-authored-by: Matthias Fey --- torch_geometric/datasets/linkx_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index ac1db6c6ebfd..93e303ba4ae6 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -32,7 +32,7 @@ class LINKXDataset(InMemoryDataset): being saved to disk. (default: :obj:`None`) """ - github_url = 'https://github.com/CUAI/Non-Homophily-Large-Scale/' \ + github_url = ('https://github.com/CUAI/Non-Homophily-Large-Scale/' 'raw/master/data' gdrive_url = 'https://drive.google.com/uc?confirm=t&' From 9bd80126835486c7e230953009abe41e7918f026 Mon Sep 17 00:00:00 2001 From: Padarn Wilson Date: Mon, 9 May 2022 18:29:50 +0800 Subject: [PATCH 8/9] Update torch_geometric/datasets/linkx_dataset.py Co-authored-by: Matthias Fey --- torch_geometric/datasets/linkx_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 93e303ba4ae6..445acab7996c 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -109,7 +109,6 @@ def download(self): def _process_wiki(self): paths = {x.split('/')[-1]: x for x in self.raw_paths} - print(paths) x = torch.load(paths['wiki_features2M.pt']) edge_index = torch.load(paths['wiki_edges2M.pt']).t().contiguous() y = torch.load(paths['wiki_views2M.pt']) From 8395cbaaa9b9824a1aab2cc7d19d6f50fe6bf304 Mon Sep 17 00:00:00 2001 From: Matthias Fey Date: Mon, 9 May 2022 03:49:03 -0700 Subject: [PATCH 9/9] Update torch_geometric/datasets/linkx_dataset.py --- torch_geometric/datasets/linkx_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/datasets/linkx_dataset.py b/torch_geometric/datasets/linkx_dataset.py index 445acab7996c..e0c7038f4d6b 100644 --- a/torch_geometric/datasets/linkx_dataset.py +++ b/torch_geometric/datasets/linkx_dataset.py @@ -33,7 +33,7 @@ class LINKXDataset(InMemoryDataset): """ github_url = ('https://github.com/CUAI/Non-Homophily-Large-Scale/' - 'raw/master/data' + 'raw/master/data') gdrive_url = 'https://drive.google.com/uc?confirm=t&' facebook_datasets = [