-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpredicted_segment_transcript.py
executable file
·26 lines (25 loc) · 1.25 KB
/
predicted_segment_transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
from transcribe import transcribe
from normalize import normalize
from smooth import smooth
from extremize import extremize
from mask_boundaries import mask_boundaries
from allocate_pred_to_speech_segments import allocate_pred_to_speech_segments
def predicted_segment_transcript(C, model, audio, start, end, s_dB_mean, samples_per_spect, dt_S):
    """Transcribe one audio clip and align the predicted words to speech segments.

    The clip ``audio[start:end]`` is passed to ``transcribe``. The matching
    window of ``s_dB_mean`` is normalized, smoothed, and thresholded into a
    speech mask; the mask boundaries are then handed, together with the
    prediction, to ``allocate_pred_to_speech_segments``.

    Args:
        C: Configuration object; ``C.sample_rate`` is read here and ``C`` is
            forwarded to ``transcribe``.
        model: Model forwarded to ``transcribe``.
        audio: Sliceable audio signal indexed by sample.
        start: Index of the first audio sample of the clip.
        end: Index one past the last audio sample of the clip.
        s_dB_mean: Sliceable per-spectrogram-frame values (presumably mean
            power in dB -- confirm against the caller).
        samples_per_spect: Number of audio samples per spectrogram frame,
            used to convert sample indices to frame indices.
        dt_S: Multiplier converting a spectrogram frame index to time
            (presumably seconds per frame -- confirm against the caller).

    Returns:
        ``None`` when no words were allocated to any speech segment.
        Otherwise a 5-tuple ``(segment_transcript, timeline,
        normalized_power, speech_mask, clip_audio)`` where
        ``segment_transcript`` is a list of ``(start_time, duration, word)``
        tuples and ``timeline`` holds the time of every frame in the clip.
    """
    clip_audio = audio[start:end]
    prediction = transcribe(C, model, clip_audio)
    # ".2f" prints the clip start with two decimals. The original spec "2f"
    # only set a minimum width of 2, which never affects a float rendered
    # with the default six decimals.
    print(f"PRED {start/C.sample_rate:.2f} {prediction}")

    # Convert the audio-sample window into spectrogram-frame indices.
    spec_start = int(start / samples_per_spect)
    spec_end = int(end / samples_per_spect)

    clip_power = s_dB_mean[spec_start:spec_end]
    # Work on a copy so the caller's s_dB_mean is untouched even if
    # normalize() operates in place (not visible here).
    normalized_power = normalize(np.copy(clip_power))
    timeline = np.arange(spec_start, spec_end) * dt_S

    # Cap the smoothing window at the clip length for very short clips.
    w = min(30, normalized_power.shape[0])
    smoothed_normalized_power = normalize(smooth(normalized_power, w))
    speech_mask = extremize(smoothed_normalized_power, 0.2)

    # Shift clip-relative boundaries back to absolute frame indices.
    # NOTE(review): assumes mask_boundaries returns a NumPy array, so that
    # adding an int offsets every boundary -- confirm.
    speech_segments = mask_boundaries(speech_mask) + spec_start
    spec_to_words = allocate_pred_to_speech_segments(prediction, speech_segments)

    # len() rather than truthiness: the helper's return type is not visible
    # here and could be an array, whose truth value would be ambiguous.
    if len(spec_to_words) == 0:
        return None

    # Convert (first_frame, last_frame, word) into (start_time, duration, word).
    segment_transcript = [
        (spec1 * dt_S, (spec2 - spec1) * dt_S, word)
        for spec1, spec2, word in spec_to_words
    ]
    return segment_transcript, timeline, normalized_power, speech_mask, clip_audio