-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch2a.py
110 lines (95 loc) · 3.07 KB
/
fetch2a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
import threading
import os
import requests
import json
# import sys
# import time
from tqdm.contrib.concurrent import thread_map
# Hosts handed to seq_try by download_api, listed in the order they are tried.
# An entry may carry a path prefix after the host name
# (e.g. "hyperbrowser.uio.no/coloc-stats").
SERVERS = (
    # First group of usegalaxy.* hosts.
    [
        "usegalaxy.eu",
        "usegalaxy.fr",
        "usegalaxy.org",
        "usegalaxy.org.au",
    ]
    # Second group of usegalaxy.* hosts.
    + [
        "usegalaxy.be",
        "usegalaxy.cz",
        "usegalaxy.no",
        "usegalaxy.es",
    ]
    # The rest of the public servers list.
    + [
        "galaxy-web.ipk-gatersleben.de",
        "galaxy.bio.di.uminho.pt",
        "galaxy.hyphy.org",
        "galaxy.mesocentre.uca.fr",
        "galaxy.pasteur.fr",
        "galaxytrakr.org",
        "hyperbrowser.uio.no/coloc-stats",
        "iris.angers.inra.fr/galaxypub-cfbp",
        "mississippi.sorbonne-universite.fr",
        "neo.engr.uconn.edu",
        "palfinder.ls.manchester.ac.uk",
        "vm-chemflow-francegrille.eu",
        "www.immportgalaxy.org",
    ]
)

# HTTP headers attached to every request issued by seq_try.
headers = {}
headers["User-Agent"] = "tsvdb@1a (https://github.com/hexylena/toolshed-version-database/)"
def seq_try(path, servers, timeout=10):
    """Fetch ``path`` from each server in turn; return the first usable JSON object.

    Args:
        path: URL path (leading slash included, query string allowed) that is
            appended to ``https://{server}``.
        servers: Iterable of host names, optionally followed by a path prefix,
            tried in the given order.
        timeout: Timeout in seconds handed to ``requests.get`` for each attempt.

    Returns:
        The decoded JSON object (a ``dict``) from the first server whose
        response is a JSON object without an ``err_msg`` key, or ``None`` when
        no server produced one.
    """
    for server in servers:
        try:
            response = requests.get(
                f"https://{server}{path}", timeout=timeout, headers=headers
            )
            d = response.json()
        except Exception:
            # Deliberately best-effort: an unreachable host, a timeout or a
            # non-JSON body just means "try the next server".
            continue

        # Only a JSON object can be tool metadata. Checking the type first
        # keeps the ``err_msg`` test a real key lookup instead of a substring
        # or element test on a str/list, or a TypeError on null/number.
        if not isinstance(d, dict) or "err_msg" in d:
            continue
        return d

    # Every server failed or reported an error.
    return None
def download_api(tool_id):
    """Download the metadata of one tool and store it under ``api/tools/``.

    The tool is requested from the hosts in ``SERVERS`` through ``seq_try``.
    On success the JSON is written to ``api/tools/<tool_id>``. When no server
    returns metadata containing a ``name`` key, the id is appended to
    ``failed.<thread ident>.log`` instead.

    Args:
        tool_id: Tool identifier. Any ``/`` it contains becomes a directory
            separator below ``api/tools/``.

    Returns:
        Always ``None``; the function is used for its side effects.
    """
    target = "api/tools/" + tool_id

    # If exactly this tool id was already downloaded, skip
    if os.path.exists(target):
        return None

    meta = seq_try(f"/api/tools/{tool_id}?io_details=True&link_details=False", SERVERS)
    if meta is None or "name" not in meta:
        # The log file name includes the current thread's ident, so each
        # worker thread appends to its own file.
        tid = threading.current_thread().ident
        with open(f'failed.{tid}.log', 'a') as handle:
            handle.write(f"{tool_id}\n")
        return None

    # Always create the *parent* directory of the output file, never a
    # directory at the file's own path, so ids with any number of "/"
    # separators (including none) can be written.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w") as handle:
        json.dump(meta, handle)
    # Report success only once the file has actually been written.
    print(f"Downloaded {tool_id}")
    return None
import glob

# The keys of guid-rev.json are the tool ids that should be fetched.
with open("guid-rev.json") as guid_file:
    guid_rev = json.load(guid_file)
tool_ids = set(guid_rev.keys())
print(f"Found {len(tool_ids)} tools")

# Existing: every regular file below api/, with the leading "api/tools/"
# characters cut off so the remainder can be compared against tool ids.
prefix_len = len("api/tools/")
existing_tool_ids = {
    found[prefix_len:]
    for found in glob.glob("api/**/*", recursive=True)
    if os.path.isfile(found)
}

# Recently failed: one id per line in every file matching failed.*
recently_failed_ids = set()
for log_name in glob.glob("failed.*"):
    with open(log_name, 'r') as log:
        recently_failed_ids.update(line.strip() for line in log)

# Drop what is already on disk, then what has already failed, reporting the
# remaining count after each step.
remaining = tool_ids - existing_tool_ids
print(f"Reduced to {len(remaining)} tools")
remaining -= recently_failed_ids
tool_ids = list(remaining)
print(f"Reduced to {len(tool_ids)} tools")

# Fetch the remaining tools with ten worker threads.
thread_map(download_api, tool_ids, max_workers=10)