-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest.py
122 lines (108 loc) · 3.09 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from nltk.corpus import cess_esp as cess
from nltk import RegexpTokenizer
import nltk
import pickle
# My sentences
# Sample sentence to tag; RegexpTokenizer(r'\w+') keeps word characters only,
# so punctuation such as "¿" and "?" is dropped before tagging.
sentence = "hola, hola, soy Pedro ¿como te llamas?."
tokenizer = RegexpTokenizer(r'\w+')
tokenized_words = tokenizer.tokenize(sentence)

# Train/test split over the CESS-ESP tagged corpus. The split point `div`
# is cached in a pickle so repeated runs reuse the same partition.
train = None
test = None
cess_sents = cess.tagged_sents()
try:
    with open('test_pickles/test_data.pickle', 'rb') as fa:
        div = pickle.load(fa)
    train = cess_sents[:div]
    test = cess_sents[div:]  # was [div+1:]: the sentence at index `div` was silently dropped
except FileNotFoundError:
    # First run: compute a 90/10 split and cache the split point.
    print("dumping train/test")
    div = len(cess_sents) * 90 // 100
    train = cess_sents[:div]
    test = cess_sents[div:]  # was [div+1:]: off-by-one lost one sentence
    import os
    os.makedirs('test_pickles', exist_ok=True)  # 'wb' open below fails if the dir is missing
    with open('test_pickles/test_data.pickle', 'wb') as fb:
        pickle.dump(div, fb)
#####
#
# 1 ubt tagger
#
#####
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    """Chain tagger classes into a backoff sequence.

    Each class in *tagger_classes* is instantiated on *tagged_sents* with the
    previously built tagger as its ``backoff``, so later classes take
    precedence and fall back through earlier ones.

    Args:
        tagged_sents: training data passed to each tagger constructor.
        tagger_classes: tagger classes, applied in order. No longer mutated:
            the original implementation did ``del tagger_classes[0]``, which
            destructively emptied the caller's list.
        backoff: optional initial backoff tagger.

    Returns:
        The last tagger constructed (head of the backoff chain).
    """
    remaining = tagger_classes
    if not backoff:
        # Build the first tagger without a backoff, then continue with a
        # slice instead of del-ing from the caller's list.
        backoff = remaining[0](tagged_sents)
        remaining = remaining[1:]
    for cls in remaining:
        backoff = cls(tagged_sents, backoff=backoff)
    return backoff
print("started ubt")
from nltk import UnigramTagger, TrigramTagger, RegexpTokenizer

# Unigram -> Bigram -> Trigram backoff tagger, cached on disk after the
# first training run so later executions just unpickle it.
ubt_tagger = None
try:
    with open('test_pickles/ubt.pickle', 'rb') as cache_in:
        ubt_tagger = pickle.load(cache_in)
except FileNotFoundError as a:
    # Cache miss: train the backoff chain and persist it.
    print("dumping ubt")
    ubt_chain = [nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger]
    ubt_tagger = backoff_tagger(train, ubt_chain)
    with open('test_pickles/ubt.pickle', 'wb') as cache_out:
        pickle.dump(ubt_tagger, cache_out)
#print(ubt_tagger.evaluate(test))
print(ubt_tagger.tag(tokenized_words))
#####
#
# 2 brill tagger
#
#####
# from nltk.tag_util import train_brill_tagger
# brill_tagger = None
# try:
# with open('test_pickles/brill.pickle', 'rb') as fa:
# brill_tagger = pickle.load(fa)
# except FileNotFoundError as a:
# # training data
# print("dumping brill")
# brill_tagger = train_brill_tagger(ubt_tagger, train)
# with open('test_pickles/brill.pickle', 'wb') as fb:
# pickle.dump(brill_tagger, fb)
# print(brill_tagger.evaluate(test))
#####
#
# 3 classified tagger
#
#####
from nltk.tag.sequential import ClassifierBasedPOSTagger
print("started classified")

# Classifier-based POS tagger, cached on disk after the first training run.
class_tagger = None
try:
    with open('test_pickles/class.pickle', 'rb') as cache_in:
        class_tagger = pickle.load(cache_in)
except FileNotFoundError as a:
    # Cache miss: train on the CESS split and persist the result.
    print("dumping class")
    class_tagger = ClassifierBasedPOSTagger(train=train)
    with open('test_pickles/class.pickle', 'wb') as cache_out:
        pickle.dump(class_tagger, cache_out)
#print(class_tagger.evaluate(test))
print(class_tagger.tag(tokenized_words))
####
#
# 4 TnT
#
####
print("started tnt")
from nltk.tag import tnt

# TnT (Trigrams'n'Tags) tagger, cached on disk after the first training run.
tnt_tagger = None
try:
    with open('test_pickles/tnt.pickle', 'rb') as fa:
        tnt_tagger = pickle.load(fa)
except FileNotFoundError:
    # Cache miss: train on the CESS split and persist the result.
    print("dumping tnt")
    tnt_tagger = tnt.TnT()  # was `tnt_tagger = tnt_tagger = tnt.TnT()` — duplicated assignment
    tnt_tagger.train(train)
    with open('test_pickles/tnt.pickle', 'wb') as fb:
        pickle.dump(tnt_tagger, fb)
#print(tnt_tagger.evaluate(test))
print(tnt_tagger.tag(tokenized_words))