-
Notifications
You must be signed in to change notification settings - Fork 0
/
Model.py
64 lines (51 loc) · 2.32 KB
/
Model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import ufal.udpipe
class Model:
    """Convenience wrapper around a loaded ``ufal.udpipe.Model``.

    The wrapped model is stored in ``self.model``. The methods cover reading
    or tokenizing text into ``ufal.udpipe.Sentence`` objects, tagging and
    parsing those sentences in place, and serializing them back to text.
    """

    def __init__(self, path):
        """Load the UDPipe model stored in the file ``path``.

        Raises:
            Exception: if ``ufal.udpipe.Model.load`` returns a falsy value.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize ``text`` and return a list of ufal.udpipe.Sentence-s.

        The tokenizer is created from the loaded model with the model's
        ``DEFAULT`` options.

        Raises:
            Exception: if the model yields no tokenizer, or if reading the
                sentences reports a processing error.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load ``text`` given in ``in_format`` (conllu|horizontal|vertical).

        Returns:
            A list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if no input format can be created for ``in_format``,
                or if reading the sentences reports a processing error.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        ``input_format`` is whatever ``tokenize`` or ``read`` created; it only
        needs to provide ``setText`` and ``nextSentence``.

        Raises:
            Exception: carrying ``error.message`` if the processing error
                object reports that an error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            # The appended object is now owned by the result list, so the
            # next call must fill a fresh Sentence instead of reusing it.
            sentence = ufal.udpipe.Sentence()

        if error.occurred():
            raise Exception(error.message)
        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Serialize ``sentences`` in ``out_format`` (conllu|horizontal|vertical).

        Args:
            sentences: iterable of ufal.udpipe.Sentence-s to write.
            out_format: name of the output format.

        Returns:
            The text of all sentences followed by whatever the output format
            emits when the document is finished.

        Raises:
            Exception: if no output format can be created for ``out_format``.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        # Same guard as in read(): fail with a clear message rather than an
        # AttributeError on a null object when the format name is unsupported.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        # Collect the pieces and join once instead of repeated `+=`.
        pieces = [output_format.writeSentence(sentence) for sentence in sentences]
        pieces.append(output_format.finishDocument())
        return ''.join(pieces)
# Example usage:
#
#   model = Model('french-spoken-ud-2.3-181115.udpipe')
#   sentences = model.tokenize("Bonjour. Comment allez vous?")
#   for s in sentences:
#       model.tag(s)
#       model.parse(s)
#   conllu = model.write(sentences, "conllu")