-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse.py
executable file
·79 lines (65 loc) · 2.4 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python
# -*- coding: utf-8 -*-
#### Script that runs MXPOST and MaltParser to annotate sentences read from standard input or from files
####
#### Some code borrowed from http://www.nltk.org/_modules/nltk/parse/malt.html
# Author: Pedro Balage ([email protected])
# Date: 25/05/2015
# Version: 1.0
# Python 3 compatibility
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import with_statement
from __future__ import print_function
# imports
from subprocess import Popen, PIPE
import tempfile
import fileinput
import codecs
import os
import sys
# Read the sentences to annotate, one per line, from the files named on the
# command line (or from stdin when none are given) and POS-tag each line with
# MXPOST. `sentences` collects one list of (word, tag) tuples per input line.
sentences = []
for line in fileinput.input(openhook=fileinput.hook_encoded('utf-8')):
    # The tagger's stdin is a byte pipe. Text decoded by the open hook must be
    # encoded again explicitly: handing it over as-is raises TypeError on
    # Python 3 and triggers an implicit ASCII conversion on Python 2, which
    # fails on accented characters. Lines that are already bytes pass through.
    if not isinstance(line, bytes):
        line = line.encode('utf-8')

    # MXPOST (a new JVM is started for every input line)
    p = Popen(['java',
               '-mx30m',
               '-cp',
               'mxpost/mxpost.jar',
               'tagger.TestTagger',
               'pt-br-universal-tagger.project'],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate(input=line)
    sentence = stdout.decode('utf8')

    # Each whitespace-separated token is split into (word, tag) at its LAST
    # underscore, so a word that itself contains '_' still yields exactly the
    # two fields the parsing step unpacks.
    tokens = [tuple(w.rsplit('_', 1)) for w in sentence.split()]
    sentences.append(tokens)
# MALT Parser
# Write the tagged sentences to a temporary input file, run MaltParser on it
# and print the parser's output file. Both temporary files are created with
# delete=False, so they are removed explicitly in the `finally` clause.
input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                         dir=tempfile.gettempdir(),
                                         delete=False)
output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                          dir=tempfile.gettempdir(),
                                          delete=False)
try:
    # Only the output file's *name* is used below (it is passed to MaltParser
    # via -o and reopened for reading), so release our own handle right away
    # instead of keeping it open while another process writes to that path.
    output_file.close()

    for sentence in sentences:
        for (i, (word, tag)) in enumerate(sentence, start=1):
            # One line per token with ten tab-separated columns: the 1-based
            # token index, the word, the tag (written twice) and fixed
            # placeholder values for the remaining columns.
            input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
                (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
            input_file.write(input_str.encode("utf8"))
        # Blank separator written after every sentence.
        input_file.write(b'\n\n')
    # Flush and close before the parser reads the file by name.
    input_file.close()

    cmd = ['java',
           '-jar', 'maltparser-1.8.1/maltparser-1.8.1.jar',
           '-c', 'uni-dep-tb-ptbr',
           '-i', input_file.name,
           '-o', output_file.name,
           '-m', 'parse']
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes while waiting for the process to exit;
    # a bare wait() can deadlock once the child fills a pipe buffer.
    p.communicate()

    # Read the parser output through a context manager so the handle is
    # closed before the file is removed below.
    with codecs.open(output_file.name, 'r', encoding='utf8') as parsed:
        print(parsed.read())
finally:
    # close() is a no-op on an already-closed file, so this is safe whether or
    # not the `try` body ran to completion.
    input_file.close()
    os.remove(input_file.name)
    output_file.close()
    os.remove(output_file.name)