loadwfc-en.py
#!/usr/bin/env python3
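'''
Download and load the wikifactcheck-english dataset: fetch the
train/test/full .jsonl files from the project's GitHub repository and
iterate over their entries, either programmatically or via the
command-line options defined at the bottom of the file.
'''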
import json
from urllib.request import urlopen
from glob import glob
from tqdm import tqdm
from pathlib import Path
from functools import partial
from argparse import ArgumentParser

PROJECT = 'wikifactcheck-english'
BASEDIR = '~/.{}'.format(PROJECT)
REPOURL = 'https://rawcdn.githack.com/{prj}/{prj}/master/'.format(prj=PROJECT)
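# BASEDIR is the per-user cache directory for the downloaded files;
# REPOURL serves the repository's raw files through the githack CDN
# ({prj} fills in both the GitHub user and the repository name).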

def download(full=True, dest=BASEDIR, force=False):
    '''
    Download the wikifactcheck-english .jsonl files into `dest`.

    Fetches the train and test splits and, if `full` is True (default),
    also the five full{0..4} part files, concatenating them into a single
    wikifactcheck-english_full.jsonl. Existing files are skipped unless
    `force` is set.
    '''
    parent = Path(dest).expanduser()
    parent.mkdir(exist_ok=True)
    filename = 'wikifactcheck-english_{}.jsonl'
    url = REPOURL + filename
    for part in tqdm(['train', 'test'], desc='train and test'):
        filepath = parent / filename.format(part)
        if filepath.exists() and not force:
            print(filepath, 'already exists')
            continue
        with filepath.open('wb+') as fp, urlopen(url.format(part)) as web:
            for line in web:
                fp.write(line)
    # if the full data is requested (default), also stream the five
    # full{0..4} part files and combine them into a single file
    if full:
        filename = 'wikifactcheck-english_full{}.jsonl'
        url = REPOURL + filename
        filepath = parent / filename.format('')
        if filepath.exists() and not force:
            print(filepath, 'already exists')
            return
        with filepath.open('wb+') as combined:
            for part in tqdm(range(5), desc='full'):
                with urlopen(url.format(part)) as web:
                    for line in web:
                        combined.write(line)
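
# Example (programmatic use): download just the train/test splits into
# the default cache directory, ~/.wikifactcheck-english:
#   download(full=False)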

def load_(pattern, lines=None, path=BASEDIR):
    '''
    Load entries from all .jsonl files whose names contain `pattern`.

    Returns a generator over parsed entries of the appropriate kind;
    if `lines` is given, stops after that many entries in total.
    '''
    parent = Path(path).expanduser()
    if not parent.exists():
        path = '.'
        parent = Path(path)
    pat = str(parent / '*{}*.jsonl'.format(pattern))
    filenames = sorted(glob(pat))
    if not filenames:
        inp = input('Data not found at {}. Download? [Y/n] '.format(path))
        if inp.lower() in ('', 'y', 'yes', 'yep', 'yeah'):
            download(dest=path, full='full' in pattern)
            # pick up the files that were just downloaded
            filenames = sorted(glob(pat))
        else:
            # a bare return ends the generator; PEP 479 forbids
            # raising StopIteration inside a generator body
            return
    ctr = 0
    for fname in filenames:
        with Path(fname).open('r') as f:
            for line in f:
                ctr += 1
                yield json.loads(line)
                if lines and ctr >= lines:
                    return

# Convenience aliases, each binding `pattern` to one split.
load = load_
load_train = partial(load_, 'train')  # training set
load_test = partial(load_, 'test')    # held-out test set
load_full = partial(load_, 'full')    # full dataset, including non-annotated examples
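
# Example: iterate over the first three test entries.
#   for example in load_test(lines=3):
#       print(example)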

if __name__ == '__main__':
    parser = ArgumentParser('wikifactcheck-english')
    parser.add_argument('-d', '--download', help='download dataset',
                        action='store_true', default=False)
    parser.add_argument('-f', '--force', help='force re-download?',
                        action='store_true', default=False)
    datasets = ['train', 'test', 'full']
    parser.add_argument('-r', '--read', type=str, nargs='*',
                        choices=datasets,
                        help='read from particular datasets (default: all)')
    parser.add_argument('-n', '--numlines', type=int, default=None,
                        help='number of lines to read from each one')
    parser.add_argument('-t', '--fmt', help='output format for --read option',
                        default='json', choices=['json', 'python'])
    args = parser.parse_args()
    if args.download:
        download(force=args.force)
    if args.read:
        for name in args.read:
            ctr = 0
            for item in load(name):
                if args.numlines and ctr >= args.numlines:
                    break
                if args.fmt == 'json':
                    print(json.dumps(item))
                else:
                    print(item)
                ctr += 1
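
# Example CLI invocations:
#   ./loadwfc-en.py --download
#   ./loadwfc-en.py --read train --numlines 5
#   ./loadwfc-en.py --read test full --fmt python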