-
Notifications
You must be signed in to change notification settings - Fork 17
/
trans_utils.py
82 lines (73 loc) · 2.48 KB
/
trans_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Punctuation marks that pre_proc removes from the text.
PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    """Strip punctuation and space-separate CJK characters.

    Characters listed in ``PUNC_LIST`` are dropped. Every character in the
    range U+4E00..U+9FFF is surrounded by single spaces so that it becomes
    its own space-separated token; all other characters are copied through
    unchanged. One trailing space, if present, is removed from the result.

    Args:
        text: The string to normalize. May be empty.

    Returns:
        The normalized string. An empty input, or an input made only of
        dropped punctuation, yields ``''``.
    """
    parts = []
    # Nothing emitted yet behaves like "already separated": the first CJK
    # character must not get a leading space.
    ends_with_space = True
    for ch in text:
        if ch in PUNC_LIST:
            continue
        if '\u4e00' <= ch <= '\u9fff':
            if not ends_with_space:
                parts.append(' ')
            parts.append(ch + ' ')
            ends_with_space = True
        else:
            parts.append(ch)
            ends_with_space = ch == ' '
    res = ''.join(parts)
    # endswith() is safe on an empty result, unlike indexing res[-1].
    if res.endswith(' '):
        res = res[:-1]
    return res
def proc(raw_text, timestamp, dest_text):
    """Locate every occurrence of ``dest_text`` in ``raw_text`` and return its time span.

    Matching is a plain substring search. The index of the first matched
    token is obtained by counting the spaces that precede the hit, so
    ``raw_text`` is expected to be a space-separated token string whose
    tokens line up one-to-one with ``timestamp``.

    Args:
        raw_text: The full text to search in.
        timestamp: Sequence indexed by token position; element ``[0]`` of an
            entry is used as the start and element ``[1]`` as the end.
        dest_text: The text to look for.

    Returns:
        A list of ``[start * 16, end * 16]`` pairs, one per occurrence
        (the factor 16 presumably converts milliseconds to samples at
        16 kHz -- confirm against the caller). Empty when there is no
        match or when ``dest_text`` contains no tokens.

    Raises:
        IndexError: If a match refers to a token position that
            ``timestamp`` does not contain.
    """
    token_count = len(dest_text.split())
    if token_count == 0:
        # An empty or whitespace-only query would never advance the search
        # offset below, so the loop would spin forever.
        return []

    spans = []
    offset = 0
    while True:
        hit = raw_text.find(dest_text, offset)
        if hit == -1:
            break
        first_token = raw_text[:hit].count(' ')
        last_token = first_token + token_count - 1
        spans.append([timestamp[first_token][0] * 16,
                      timestamp[last_token][1] * 16])
        # NOTE(review): the search resumes token_count characters (not
        # len(dest_text)) past the hit, so a self-overlapping query can
        # match twice. Kept for backward compatibility.
        offset = hit + token_count
    return spans
def proc_spk(dest_spk, sd_sentences):
    """Collect the time spans of the sentences spoken by one speaker.

    Args:
        dest_spk: Speaker label. Its first three characters are dropped
            (presumably a ``spk`` prefix -- confirm against the caller) and
            the remainder is compared with ``str(sentence['spk'])``.
        sd_sentences: Iterable of dicts providing the keys ``'ts_list'``,
            ``'spk'``, ``'start'`` and ``'end'``.

    Returns:
        A list of ``[start * 16, end * 16]`` pairs for every sentence of the
        requested speaker whose ``ts_list`` covers more than 999 units from
        the first entry's start to the last entry's end.
    """
    # The wanted id does not depend on the sentence, so slice it only once.
    spkid = dest_spk[3:]
    ts = []
    for sentence in sd_sentences:
        stamps = sentence['ts_list']
        covered = stamps[-1][1] - stamps[0][0]
        if str(sentence['spk']) == spkid and covered > 999:
            ts.append([sentence['start'] * 16, sentence['end'] * 16])
    return ts
def generate_vad_data(data, sd_sentences, sr=16000):
    """Cut ``data`` into one segment per sentence.

    Args:
        data: One-dimensional, sliceable signal exposing ``.shape``.
        sd_sentences: Iterable of dicts with a ``'ts_list'`` entry; the
            start of its first item and the end of its last item bound the
            segment. Both values are divided by 1000 and rounded to two
            decimals (i.e. treated as milliseconds and turned into seconds).
        sr: Factor that turns the rounded boundaries into slice indices.

    Returns:
        A list of ``[start, end, data[start_index:end_index]]`` entries in
        the order of ``sd_sentences``.
    """
    assert len(data.shape) == 1

    def _cut(sentence):
        # Build the [start, end, samples] entry for a single sentence.
        stamps = sentence['ts_list']
        begin = round(stamps[0][0] / 1000, 2)
        finish = round(stamps[-1][1] / 1000, 2)
        return [begin, finish, data[int(begin * sr):int(finish * sr)]]

    return [_cut(sentence) for sentence in sd_sentences]
def write_state(output_dir, state):
    """Write the entries of ``state`` to files inside ``output_dir``.

    Each value is stored as ``str(value)`` in a file named after its key.
    ``recog_res_raw``, ``timestamp`` and ``sentences`` are required;
    ``sd_sentences`` is written only when the key is present, so a state
    without it no longer fails and leaves no empty file behind.

    Args:
        output_dir: Existing directory that receives the files.
        state: Mapping holding the entries listed above.

    Raises:
        KeyError: If one of the required keys is missing from ``state``.
        OSError: If a file cannot be opened for writing.
    """
    for name in ('recog_res_raw', 'timestamp', 'sentences'):
        with open(output_dir + '/' + name, 'w') as fout:
            fout.write(str(state[name]))
    if 'sd_sentences' in state:
        with open(output_dir + '/sd_sentences', 'w') as fout:
            fout.write(str(state['sd_sentences']))
import ast
import os
def load_state(output_dir):
    """Read back the state files stored in ``output_dir``.

    ``recog_res_raw`` is returned as the raw file content. ``timestamp``,
    ``sentences`` and, when the file exists, ``sd_sentences`` are parsed as
    Python literals.

    Args:
        output_dir: Directory containing the state files.

    Returns:
        A dict with the keys ``'recog_res_raw'``, ``'timestamp'`` and
        ``'sentences'``, plus ``'sd_sentences'`` if that file is present.

    Raises:
        OSError: If a required file cannot be opened.
        ValueError, SyntaxError: If a file does not hold a Python literal.
    """
    state = {}
    with open(output_dir + '/recog_res_raw') as fin:
        state['recog_res_raw'] = fin.read()

    # ast.literal_eval only accepts literals (lists, dicts, numbers,
    # strings, ...), so a tampered state file cannot execute code the way
    # eval() would allow.
    for name in ('timestamp', 'sentences'):
        with open(output_dir + '/' + name) as fin:
            state[name] = ast.literal_eval(fin.read())

    sd_path = output_dir + '/sd_sentences'
    if os.path.exists(sd_path):
        with open(sd_path) as fin:
            state['sd_sentences'] = ast.literal_eval(fin.read())
    return state