This repository has been archived by the owner on Oct 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
ngram.py
95 lines (82 loc) · 3.75 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Usage: python ngram.py   (all input/output/model paths are hard-coded in main(); command-line arguments are ignored)
# coding='utf8'
import os
import sys
import re
# Extracts the value following "logprob= " from the scorer's report.
_LOGPROB_PATTERN = re.compile(r'logprob= (.*?) ')


def _open_text(path):
    """Open *path* for reading as text with universal-newline handling."""
    # Python 2 needs the explicit 'U' flag for newline translation; Python 3
    # translates by default and rejects 'U' from 3.11 onwards.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    return open(path, mode)


def _run_shell(command):
    """Run *command* through the shell; raise RuntimeError on non-zero status."""
    status = os.system(command)
    if status != 0:
        raise RuntimeError('command failed (status %d): %s' % (status, command))


def _remove_if_exists(path):
    """Delete the regular file at *path* if there is one."""
    if os.path.isfile(path):
        os.remove(path)


def _load_chmap(path):
    """Return a dict built from the tab-separated pairs in the file at *path*.

    For each line, only the text before the first space is considered; it is
    split on tabs and the first field maps to the second.  Lines that do not
    yield two fields (e.g. blank lines) are skipped instead of raising.
    """
    mapping = {}
    with open(path, 'r') as chmap:
        for line in chmap:
            fields = line.strip().split(' ')[0].split('\t')
            if len(fields) < 2:
                continue
            mapping[fields[0]] = fields[1]
    return mapping


def _score_chunk(chunk_file, scores_file, model_file):
    """Sort/dedupe *chunk_file* in place, score it, and return the logprobs.

    The scorer's report is written to *scores_file*; the extracted logprob
    strings are also written one per line to ``scores_file + '_parsed.txt'``.
    The last match in the report is dropped before returning (presumably the
    whole-file summary line -- TODO confirm against the scorer's output).
    """
    sorted_file = chunk_file + '_sort.txt'
    _run_shell("sort " + chunk_file + " | uniq > " + sorted_file)
    _run_shell('mv ' + sorted_file + ' ' + chunk_file)
    _run_shell("~/Deep/final/srilm-1.5.10/i686-m64/ngram -lm " + model_file
               + " -ppl " + chunk_file + " -debug 1 > " + scores_file)

    with _open_text(scores_file) as report:
        logprobs = _LOGPROB_PATTERN.findall(report.read())[:-1]

    with open(scores_file + '_parsed.txt', 'w') as parsed:
        parsed.writelines(prob + '\n' for prob in logprobs)
    return logprobs


def _pick_best(chunk_file, logprobs):
    """Return the words of the line in *chunk_file* with the highest logprob.

    Lines and scores are paired positionally; on ties the earliest line wins.
    Returns an empty list when there is nothing to pair.
    """
    best_score = float('-inf')
    best_words = []
    with _open_text(chunk_file) as candidates:
        for sentence_line, logprob in zip(candidates, logprobs):
            score = float(logprob)
            if score > best_score:
                best_score = score
                best_words = sentence_line.strip().split(' ')
    return best_words


def main(argv):
    """Pick the best-scoring candidate of every chunk and write its mapping.

    The hard-coded sentences file is read line by line.  Lines are collected
    into a temporary chunk file until a line equal to ``'<s>\\n'`` is seen; the
    chunk is then sorted, de-duplicated and scored with the external ``ngram``
    tool, the candidate with the highest logprob is selected, each of its
    words is translated through the chmap dictionary (words missing from the
    dictionary are skipped), and the concatenated result is written as one
    line of the output file.  A progress line with the running chunk count
    and the chosen words is printed for every chunk.

    Args:
        argv: Command-line arguments; accepted for interface compatibility
            but not used.

    Raises:
        RuntimeError: If one of the external shell commands exits non-zero.
    """
    sentences_file = '/tmp4/eric11220/sentences_100_no_smooth_500.txt'
    output_file = "middle/output_without_id_100_no_smooth_500ngram.txt"
    model_file = '/tmp4/eric11220/rnn/simple-examples/models/swb.ngram.model'
    chmap_file = '/tmp4/eric11220/MLDS_Final/conf/timit.chmap'

    # Use the first index whose temporary chunk file does not exist yet.
    idx = 0
    while os.path.isfile(sentences_file + str(idx) + '_tmp_n_sentences.txt'):
        idx += 1
    chunk_file = sentences_file + str(idx) + '_tmp_n_sentences.txt'
    scores_file = sentences_file + str(idx) + '_tmp_n_scores.txt'
    # A scores file with the same index may be left over from an earlier run.
    _remove_if_exists(scores_file)

    # Collapse runs of consecutive newlines in the input.  This must happen
    # BEFORE the file is opened below: in-place sed replaces the file, so a
    # handle opened earlier would keep reading the old, un-normalised content.
    _run_shell("sed -i -- ':a;N;$!ba;s/\\n\\{2,\\}/\\n/g' " + sentences_file)

    en_ch_dict = _load_chmap(chmap_file)

    count = 0
    chunk = open(chunk_file, 'w')
    try:
        with _open_text(sentences_file) as sentences, \
                open(output_file, 'w') as outfile:
            for line in sentences:
                if line != '<s>\n':
                    chunk.write(line)
                    continue

                # Chunk delimiter reached: flush the chunk so the external
                # tools see all of it, then score it and emit the winner.
                chunk.close()
                logprobs = _score_chunk(chunk_file, scores_file, model_file)
                best_words = _pick_best(chunk_file, logprobs)

                count += 1
                print('%d %s' % (count, best_words))

                # Words absent from the dictionary contribute nothing.
                mapped = ''.join(en_ch_dict.get(word, '') for word in best_words)
                outfile.write(mapped + '\n')

                chunk = open(chunk_file, 'w')
    finally:
        # NOTE(review): lines after the last '<s>' marker end up in the chunk
        # file but are never scored -- confirm that the input always ends
        # with a '<s>' line.
        chunk.close()
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main(sys.argv)