import argparse
import json
import os
from multiprocessing import Pool
from pathlib import Path
from urllib.parse import unquote

from tqdm import tqdm

import paths
import utils

# Defaults; overridable via command-line arguments.
batch_size = 100
workers = 8

commons_prefix = utils.commons_prefix
commons_prefix_len = len(commons_prefix)
def process_candidate_batch(candidate_files):
    """Collect Commons file names from a batch of candidate JSON files."""
    folder = paths.CANDIDATES_FOLDER_PATH
    image_urls = set()
    for candidate_file in candidate_files:
        candidate_path = folder / candidate_file
        with open(candidate_path, 'r') as f:
            data = json.load(f)
        images = data["images"]
        for image in images:
            try:
                commons_url = image
                # e.g. http://commons.wikimedia.org/wiki/Special:FilePath/T%C3%B6%C3%B6l%C3%B6nlahti1.jpg
                if commons_url.startswith(commons_prefix):  # otherwise the URL is from a different domain; skip it
                    file_name = commons_url[commons_prefix_len:]
                    image_urls.add(file_name)
            except AttributeError:
                # Skip entries that are not strings.
                pass
    return image_urls
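

# Illustrative sketch (not executed): assuming utils.commons_prefix is the
# Special:FilePath prefix shown in the comment above, an entry maps to a bare,
# still percent-encoded file name:
#
#   >>> url = "http://commons.wikimedia.org/wiki/Special:FilePath/T%C3%B6%C3%B6l%C3%B6nlahti1.jpg"
#   >>> url[len("http://commons.wikimedia.org/wiki/Special:FilePath/"):]
#   'T%C3%B6%C3%B6l%C3%B6nlahti1.jpg'
#
# The percent-encoding is only resolved later, in main(), via urllib.parse.unquote.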
def main(args):
    batch_size = args.batch_size
    workers = args.workers
    image_urls = []
    candidates_folder = paths.CANDIDATES_FOLDER_PATH
    json_file_names = [f for f in os.listdir(candidates_folder) if f.endswith('.json')]
    # Split the list of JSON files into batches.
    json_file_names = [json_file_names[i:i + batch_size] for i in range(0, len(json_file_names), batch_size)]
    with Pool(workers) as p:
        for urls in tqdm(p.imap_unordered(process_candidate_batch, json_file_names), total=len(json_file_names), desc="Processing candidates in batches"):
            image_urls.extend(urls)
    artpedia_matches_path = paths.ARTPEDIA2WIKI_MATCHED_PATH
    with open(artpedia_matches_path, 'r') as f:
        artpedia_matches = json.load(f)
    for qid, obj in artpedia_matches.items():
        img_url = obj.get('img_url')
        # Get the file name from the URL; guard against missing entries.
        if img_url and img_url.startswith(utils.commons_prefix):
            img_url = Path(img_url).name
            image_urls.append(img_url)
    # Deduplicate.
    image_urls = set(image_urls)
    # URL-decode the names (for instance Albert%20II%20of%20Austria.jpg -> Albert II of Austria.jpg).
    image_urls = {unquote(url) for url in image_urls}
    # Sort for a deterministic output file.
    image_urls = sorted(image_urls)
    with open(paths.IMAGES_TXT_PATH, 'w') as f:
        for item in image_urls:
            f.write("%s\n" % item)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get the candidates from Wikidata, and build a file with all the image URLs')
    parser.add_argument('--batch_size', type=int, default=batch_size, help='Batch size for processing candidates')
    parser.add_argument('--workers', type=int, default=workers, help='Number of worker processes to use')
    args = parser.parse_args()
    main(args)
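
# Example invocation (a sketch; the actual inputs depend on the locations
# configured in paths.py, which is not shown here):
#
#   python get_img_urls.py --batch_size 100 --workers 8
#
# The result is a sorted, deduplicated list of Commons file names written to
# paths.IMAGES_TXT_PATH, one per line.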