-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_corpus.py
96 lines (74 loc) · 2.75 KB
/
create_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""self."""
import xml.etree.ElementTree as ET
from os.path import expanduser, join
from os import makedirs
import re
from main import load_configuration, QAPair, load_qas
def answers_wikia_iter():
    """
    Iterate over question/answer pairs from the answers.wikia.com
    'Current pages dump'.

    link: http://s3.amazonaws.com/wikia_xml_dumps/a/an/answers_pages_current.xml.7z
    link on site: http://answers.wikia.com/wiki/Special:Statistics

    The extracted dump is read from 'answers_pages_current.xml', resolved
    relative to the current working directory. For every <page> element the
    title is used as the question and the revision text as the answer.

    A page is skipped when:
      * it has no revision text,
      * its text does not contain '[Category:Answered questions]',
      * nothing is left of the text after markup has been removed.

    Yields:
        QAPair(question, answer) for every page that passes the filters.
    """
    xml_filepath = expanduser('answers_pages_current.xml')
    tree = ET.parse(xml_filepath)
    # NOTE(review): this only matches un-namespaced <page> elements that are
    # direct children of the root -- confirm against the actual dump format.
    pages = tree.getroot().findall('page')

    # NOTE: filtering of non-question pages by title is currently disabled.
    # Candidate title prefixes were: 'User talk:', 'Forum:', 'File:', 'Talk:',
    # 'User:', 'Category:', 'Template', 'MediaWiki:', 'Help:'; a candidate
    # cutout phrase was 'wikianswers'.

    # Markup to cut out: [[links]], <tags> and {{templates}}.
    # The quantifiers are non-greedy on purpose: with re.DOTALL a greedy '.*'
    # would delete everything between the first opening and the last closing
    # bracket of the text, including the answer itself.
    bracket_patterns = [
        re.compile(pat, flags=re.DOTALL)
        for pat in (
            r'\[\[.*?\]\]',
            r'<.*?>',
            r'\{\{.*?\}\}',
        )
    ]

    for p in pages:
        question = p.find('title').text
        # findtext() returns None when <revision>/<text> is missing and ''
        # when the element is empty, so both cases are skipped below instead
        # of raising AttributeError.
        answer = p.findtext('revision/text')

        # skip empty answers
        if not answer:
            continue

        # skip questions not marked as answered
        if '[Category:Answered questions]' not in answer:
            continue

        # cut certain bracket contents
        for pattern in bracket_patterns:
            answer = pattern.sub(' ', answer)

        # remove unnecessary blanks
        answer = answer.strip()

        # if the answer has been stripped empty, skip it
        if not answer:
            continue

        yield QAPair(question, answer)
def save_to_pickle(filename, qas):
    """
    Pickle `qas` into the file at `filename`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    from pickle import Pickler

    with open(filename, 'wb') as sink:
        Pickler(sink).dump(qas)
if __name__ == '__main__':
    # Command line usage: any combination of the words 'load', 'save' and
    # 'loop' may be passed as arguments.
    #   load - read the corpus via load_qas() instead of parsing the xml dump
    #   save - pickle the corpus to '<corpus_path>/answers.wikia.p'
    #   loop - print the pairs in random order, one per press of <Enter>
    import sys

    args = sys.argv[1:]
    conf = load_configuration()

    if 'load' in args:
        qas = load_qas(conf['QuestionAnswering']['corpus_path'])
    else:
        print('read xml file ...')
        qas = list(answers_wikia_iter())

    if 'save' in args:
        print('pickle ...')
        # The configuration loaded above is reused; it does not need to be
        # read a second time.
        corpus_path = conf['QuestionAnswering']['corpus_path']
        makedirs(corpus_path, exist_ok=True)
        filepath = join(corpus_path, 'answers.wikia.p')
        save_to_pickle(filepath, qas)

    if 'loop' in args:
        print('loop over qa_pairs\n')
        from random import shuffle

        shuffle(qas)
        for q, a in qas:
            print('Q:', q)
            print('A:', a, '\n')
            # wait for <Enter> before showing the next pair
            input()