forked from huggingface/transformers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
whisper-online-demo.py
66 lines (57 loc) · 2.68 KB
/
whisper-online-demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import argparse
import torch
import torchaudio
from transformers.generation_utils import EvaluatedHypotheses
from transformers import WhisperProcessor, WhisperForConditionalGeneration
def longest_common_prefix(o,t, eos):
pref = []
for p, d in zip(o, t):
if p == d and p != eos:
pref.append(p)
else:
break
return pref
def main(args):
# load model and processor
processor = WhisperProcessor.from_pretrained(args.model)
model = WhisperForConditionalGeneration.from_pretrained(args.model)
model.to(args.device)
model.config.forced_decoder_ids = None
samples, sr = torchaudio.load(args.audio_path)
input_features = processor(samples[0], sampling_rate=sr, return_tensors="pt").input_features.to(args.device)
predicted_ids = model.generate(input_features)
transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
print('Offline:', transcription)
eos = predicted_ids[0,-1]
eh = EvaluatedHypotheses(repetition_detection=args.repetition_detection)
unstable = predicted_ids[0,:4] # check out the Whisper's documentation
stable = predicted_ids[0,:4]
STEP_SIZE = int(sr * args.chunk_size)
idx = 0
while idx < samples.shape[-1]:
idx += STEP_SIZE
input_features = processor(samples[0,:idx], sampling_rate=sr, return_tensors="pt").input_features
input_features = input_features.to(args.device)
decoder_input_ids = stable.unsqueeze(0).to(args.device)
if idx >= samples.shape[-1]:
eh = None
out = model.generate(input_features, decoder_input_ids=decoder_input_ids, evaluated_hypotheses=eh, num_return_sequences=1, num_beams=6)
if eh: eh.finish_step()
h = out[0].cpu()
stable = h
#do LCP if not finished
if idx < samples.shape[1]:
stable = torch.LongTensor(longest_common_prefix(unstable, h[:-2], eos)) # -2 remove EOS and one more token before
print(f'Input {idx/sr}s out of {samples.shape[-1]/sr}s')
print('Hypo: ', processor.decode(h, skip_special_tokens=True))
print('Stable:', processor.decode(stable, skip_special_tokens=True))
print('*'*50,'\n')
unstable = h
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('audio_path', type=str)
parser.add_argument('--device', choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('--chunk-size', type=float, default=0.3, help='Chunk size in seconds')
parser.add_argument('--model', type=str, default='openai/whisper-base')
parser.add_argument('--repetition-detection', action='store_true')
main(parser.parse_args())