extract_vocab.py
# -*- coding: utf-8 -*-
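"""Build a word-frequency vocabulary for DSTC2: seed it with special
tokens and all ontology values, count the words appearing in the
training logs, and pickle the resulting word->index dict."""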
import pickle
import time
import os
import json
#from collections import OrderedDict
import dataset_walker
# location of the ontology file
cur_dir = os.path.dirname(os.path.abspath(__file__))
ontology_path = os.path.join(cur_dir, 'config/ontology_dstc2.json')
ontology = json.load(open(ontology_path, 'r'))
dataset_name = 'dstc2_train'
dataroot = 'dstc2_traindev/data'
start_time = time.time()
# vocab to be generated
vocab_dict = {}
# minimum corpus frequency a word needs in order to be kept in the vocab
oov_threshold = 20
# raw frequency counts of every word seen
word_dict = {}
# pre-set some special tokens
vocab_dict["<oov>"] = len(vocab_dict)
vocab_dict["</s>"] = len(vocab_dict)
vocab_dict["#turn#"] = len(vocab_dict)
# TODO
vocab_dict["#food#"] = len(vocab_dict)
# vocab_dict["#food2#"] = len(vocab_dict)
# vocab_dict["#food3#"] = len(vocab_dict)
# vocab_dict["#food4#"] = len(vocab_dict)
# vocab_dict["#food5#"] = len(vocab_dict)
vocab_dict["#name#"] = len(vocab_dict)
vocab_dict["#slot#"] = len(vocab_dict)
# vocab_dict["#name2#"] = len(vocab_dict)
# vocab_dict["#name3#"] = len(vocab_dict)
# vocab_dict["#name4#"] = len(vocab_dict)
# vocab_dict["#name5#"] = len(vocab_dict)
# TODO
# vocab_dict["<addr>"] = len(vocab_dict)
# vocab_dict["<phone>"] = len(vocab_dict)
# vocab_dict["<postcode>"] = len(vocab_dict)
def add_word(word):
    """Add a single word occurrence to word_dict."""
    # store as a UTF-8 byte string (a Python 2 str), matching the literal keys above
    word = word.encode('utf-8')
    word_dict[word] = word_dict.get(word, 0) + 1
def add_words(words):
    """Add every whitespace-separated token in words to word_dict."""
    word_list = words.split()
    # add 1-gram words
    for word in word_list:
        add_word(word)
    # TODO add 2-gram words
    #for word in [' '.join(word_list[i:i+2]) for i in xrange(len(word_list)-1)]:
    #    add_word(word)
    # TODO add 3-gram words
    #for word in [' '.join(word_list[i:i+3]) for i in xrange(len(word_list)-2)]:
    #    add_word(word)
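# Example: add_words("north part of town") bumps the counts of "north",
# "part", "of", and "town" by one each; with the commented 2-gram/3-gram
# lines enabled it would also count "north part", "part of", and so on.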
# include all ontology values in the vocab; the 59 repetitions push their
# counts well above oov_threshold, so they always survive the frequency cut
# TODO
add_words("none")
add_words("dontcare")
for i in range(59):
    for key in ontology:
        add_words(key)
        if key in ["requestable", "method"]:
            for val in ontology[key]:
                add_words(val)
        elif key == "informable":
            for slot in ["area", "pricerange"]:
                add_words(slot)
                for val in ontology[key][slot]:
                    add_words(val)
            # TODO
            for slot in ["food"]:
                add_words(slot)
                for val in ontology[key][slot]:
                    add_words(val)
            # TODO
            for slot in ["name"]:
                add_words(slot)
                for val in ontology[key][slot]:
                    add_words(val)
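# For reference, the ontology JSON read above is assumed to look roughly
# like the following (value lists truncated; exact contents come from
# config/ontology_dstc2.json):
#
# {
#   "requestable": ["food", "pricerange", "area", "name", "addr", ...],
#   "method": ["none", "byconstraints", "byname", "finished", ...],
#   "informable": {
#     "area": ["north", "south", ...],
#     "pricerange": ["cheap", "moderate", "expensive"],
#     "food": ["italian", "chinese", ...],
#     "name": [...]
#   }
# }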
# include ASR words and SLU words that appear in the data set
dataset = dataset_walker.dataset_walker(dataset_name, dataroot=dataroot, labels=True)
add_words("asr")
add_words("slots")
add_words("act")
for call in dataset:
    for turn, labelJson in call:
        asrs = turn["input"]["live"]["asr-hyps"]
        # 1-best hypothesis
        add_words(asrs[0]["asr-hyp"])
        # 2-best through n-best hypotheses
        # TODO
        for asr in asrs[1:]:
            add_words(asr["asr-hyp"])
        # machine dialog acts
        machine_act_words = []
        for act_item in turn["output"]["dialog-acts"]:
            if "act" in act_item:
                machine_act_words.append(act_item["act"])
            if "slots" in act_item:
                for item in act_item["slots"]:
                    for item_val in item:
                        machine_act_words.append(item_val)
        machine_act = ' '.join(machine_act_words)
        add_words(machine_act)
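# The fields accessed above assume the DSTC2 log format, roughly:
#
# turn["input"]["live"]["asr-hyps"] = [
#     {"asr-hyp": "cheap restaurant in the north", "score": -0.1}, ...]
# turn["output"]["dialog-acts"] = [
#     {"act": "request", "slots": [["slot", "area"]]}, ...]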
# save vocab to file
# TODO modify file name if needed
with open('vocab_matNN.dict', 'wb') as f:
    # promote every word that meets the frequency threshold into the vocab
    for word, freq in word_dict.items():
        if freq >= oov_threshold:
            vocab_dict[word] = len(vocab_dict)
    pickle.dump(vocab_dict, f)
end_time = time.time()
print "vocab size:", len(vocab_dict)
#print vocab_dict
print "cost time: ", end_time - start_time, 's'