-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_twitter_hmm_laplace.py
82 lines (70 loc) · 2.4 KB
/
build_twitter_hmm_laplace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
def _iter_sentences(filename):
    """Yield every non-empty sentence stored in a JSON-lines file.

    Blank lines and sentences that decode to an empty list are skipped so
    that callers can safely index the first and last element.
    """
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            sentence = json.loads(line)
            if sentence:
                yield sentence


def load_parameters(train_filename, test_filename):
    """Estimate HMM parameters with add-one (Laplace) smoothed emissions.

    Both files are read line by line; each non-blank line must be a JSON
    array of ``[observation, state]`` pairs describing one sentence.

    Counts (observation frequencies, emissions, transitions) come from the
    training file only. The test file is used solely to extend the
    vocabulary: every observation in it is added to the observation list,
    and every ``(observation, state)`` pair in it receives a zero emission
    count so that it obtains a smoothed, non-zero probability.

    Args:
        train_filename: Path of the JSON-lines training file.
        test_filename: Path of the JSON-lines test file.

    Returns:
        A tuple ``(s, o, trans, e)``:

        * ``s`` -- the fixed list of state labels, including the ``'<S>'``
          and ``'<E>'`` boundary markers.
        * ``o`` -- all observations, most frequent in training first.
        * ``trans`` -- ``trans[state][next_state]``: transition probability
          (``0`` for successors in ``s`` that were never observed;
          ``'<S>'`` is never listed as a successor).
        * ``e`` -- ``e[state][observation]``: smoothed emission probability
          for the pairs seen in either file.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    s = ['<S>', 'N', 'O', 'S', '^', 'Z', 'L', 'M', 'V', 'A', 'R', '!', 'D',
         'P', '&', 'T', 'X', 'Y', '#', '@', '~', 'U', 'E', '$', ',', 'G', '<E>']
    o = {}
    e = {state: {} for state in s}
    trans = {state: {} for state in s}

    for sentence in _iter_sentences(train_filename):
        # Treating '<S>' as the predecessor of the first token folds the
        # start transition into the same code path as every other one.
        prev_tag = '<S>'
        for word, tag in sentence:
            o[word] = o.get(word, 0) + 1
            if tag not in e:
                # Labels missing from `s` are still tracked so that their
                # counts are not lost.
                e[tag] = {}
                trans[tag] = {}
            e[tag][word] = e[tag].get(word, 0) + 1
            trans[prev_tag][tag] = trans[prev_tag].get(tag, 0.0) + 1.0
            prev_tag = tag
        # The sentence is non-empty, so prev_tag is its last label here.
        trans[prev_tag]['<E>'] = trans[prev_tag].get('<E>', 0.0) + 1.0

    # laplace smoothing
    af = 1.0
    for sentence in _iter_sentences(test_filename):
        for word, tag in sentence:
            o.setdefault(word, af)
            if tag not in e:
                # Mirror the training loop: a label first met in the test
                # file is registered instead of raising KeyError.
                e[tag] = {}
                trans[tag] = {}
            e[tag].setdefault(word, 0.0)

    # Stable sort: observations with equal counts keep first-seen order.
    o = sorted(o, key=o.get, reverse=True)

    for emissions in e.values():
        n = sum(emissions.values()) + af * len(o)
        for word in emissions:
            emissions[word] = (emissions[word] + af) / n

    for successors in trans.values():
        n = sum(successors.values())
        # Normalize every recorded successor, including labels outside `s`;
        # a non-empty dict always has n >= 1, so the division is safe.
        for next_state in successors:
            successors[next_state] /= n
        for next_state in s:
            if next_state != '<S>' and next_state not in successors:
                successors[next_state] = 0
    return s, o, trans, e
def write_parameters(filename, s, o, trans, e):
    """Serialize the model parameters to ``filename`` as indented JSON.

    The output is a single JSON object with the keys ``'S'`` (states),
    ``'O'`` (observations), ``'P_trans'`` (transition table) and
    ``'P_emission'`` (emission table). An existing file is overwritten.
    """
    model = {
        'S': s,
        'O': o,
        'P_trans': trans,
        'P_emission': e,
    }
    serialized = json.dumps(model, indent=4, separators=(',', ': '))
    with open(filename, 'w') as out:
        out.write(serialized)
if __name__ == "__main__":
    # Estimate the parameters from the two data files, then persist them.
    parameters = load_parameters('twt.train.json', 'twt.test.json')
    write_parameters('twitter_pos_hmm_laplace.json', *parameters)