-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrunparser.py
80 lines (68 loc) · 2.23 KB
/
runparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Given one or more CoNLL files, parse sentences with Alpino.
Assumes that Alpino is installed and available in PATH."""
import os
import re
import subprocess
import tempfile
from glob import glob

from coref import readconll
def escapebrackets(word):
    """Escape square brackets for Alpino.

    Every '[' and ']' in `word` is prefixed with a backslash; all other
    characters are returned unchanged.
    """
    # One pass over the string, mapping each bracket to its escaped form.
    table = {ord('['): '\\[', ord(']'): '\\]'}
    return word.translate(table)
def parse(conlldata, docname, tokenidx):
    """Parse a single document in a CoNLL file.

    Writes every sentence of the document to a temporary file as one line
    of space-separated tokens followed by an empty line, creates the
    directory `docname` in the current working directory, and runs Alpino
    on that text with its `treebank` flag set to `docname`.

    :param conlldata: iterable of sentences; each sentence is an iterable
        of token rows that are indexed with `tokenidx`.
    :param docname: name of the directory to create and hand to Alpino.
    :param tokenidx: index of the word form within a token row.
    :raises FileExistsError: if the directory `docname` already exists.
    :raises FileNotFoundError: if no `Alpino` executable is found in PATH.
    """
    # Opened read/write so the same handle can be rewound and given to
    # Alpino as its standard input; no extra `cat` process is needed.
    with tempfile.NamedTemporaryFile(mode='w+t', encoding='utf8') as out:
        for sent in conlldata:
            out.write(' '.join(
                    escapebrackets(fields[tokenidx])
                    for fields in sent))
            out.write('\n\n')
        out.flush()
        out.seek(0)
        os.mkdir(docname)
        # An argument list instead of a shell string: docname reaches
        # Alpino verbatim, so spaces or shell metacharacters in it cannot
        # break or alter the command.
        result = subprocess.run(
                ['Alpino', 'number_analyses=1', 'end_hook=xml',
                    '-flag', 'treebank', docname, '-parse'],
                stdin=out, check=False)
    # Keep going after a failed parse (as before), but do not hide it.
    if result.returncode != 0:
        print('Warning: Alpino exited with status %d while parsing %s' % (
                result.returncode, docname))
def parseclindata(pattern, outdir):
    """Parse the CLIN dataset.

    For every file matching `pattern`, the first document that `readconll`
    returns for that file is parsed into a directory named after the part
    of the file name before the first underscore. These directories are
    created inside `outdir`, which this function creates as well.

    :param pattern: glob pattern selecting the CoNLL files to parse; it is
        resolved relative to the working directory at the time of the call.
    :param outdir: directory to create for the parses.
    :raises FileExistsError: if `outdir` already exists.
    :raises ValueError: if a matching file name contains no underscore.
    """
    origdir = os.getcwd()
    # Expand the pattern before changing directory so that a relative
    # pattern still refers to the caller's working directory; sort the
    # result because glob returns files in arbitrary order.
    filenames = sorted(glob(os.path.abspath(pattern)))
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        for n, conllfile in enumerate(filenames, 1):
            # Only the first document found in the file is used.
            data = next(iter(readconll(conllfile).values()))
            fname = os.path.basename(conllfile)
            docname = fname[:fname.index('_')]
            tokenidx = 3
            print('Parsing %d/%d: %s' % (n, len(filenames), docname))
            parse(data, docname, tokenidx)
    finally:
        # Restore the working directory even when a file fails to parse.
        os.chdir(origdir)
def parsesemeval(path, outdir):
    """Parse the SemEval dataset.

    The file at `path` holds several documents. Their names are taken from
    its '#begin document' lines, and each document is parsed into a
    directory of that name inside `outdir`, which this function creates.

    :param path: CoNLL file with one or more documents.
    :param outdir: directory to create for the parses.
    :raises FileExistsError: if `outdir` already exists.
    :raises KeyError: if a document name found in the file is not among
        the documents returned by `readconll`.
    """
    # Make the path absolute first: the working directory changes below.
    path = os.path.abspath(path)
    origdir = os.getcwd()
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        # Explicit encoding so that decoding does not depend on the locale.
        # NOTE(review): assumes the corpus is UTF-8, like the text that
        # parse() writes for Alpino -- confirm.
        with open(path, encoding='utf8') as inp:
            text = inp.read()
        # Document names in the order in which they occur in the file.
        docnames = re.findall(r'#begin document ([\w_]+)', text)
        docs = readconll(path)
        for n, docname in enumerate(docnames, 1):
            data = docs[docname]
            tokenidx = 2
            print('Parsing %d/%d: %s' % (n, len(docnames), docname))
            parse(data, docname, tokenidx)
    finally:
        # Restore the working directory even when reading or parsing fails.
        os.chdir(origdir)
if __name__ == '__main__':
    # SemEval test set: a single CoNLL file holding multiple documents;
    # a directory is created for the parses of each of them.
    semevalfile = (
            'data/semeval2010/task01.posttask.v1.0/corpora/test/'
            'nl.test.txt.fixed')
    parsesemeval(semevalfile, 'data/semeval2010NLtestparses')

    # CLIN test set: separate .coref_ne files; the parses end up in
    # directories named after the numeric prefix before the '_'.
    os.mkdir('data/clinTestData/')
    for subset in ('boeing', 'gm', 'stock'):
        pattern = (
                '../groref/clin26-eval-master/eval_corpora/%s/coref_ne/'
                '*.coref_ne' % subset)
        target = 'data/clinTestData/%s' % subset
        parseclindata(pattern, target)