-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo.py
88 lines (68 loc) · 2.74 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import torchaudio
import torch
import os
from collections import defaultdict
# from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment
from pydub.playback import play
from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover
# Speech to Speech Translation
def s2st_inference(
    in_file="",
    play_input=False,
    play_output=False,
    tgt_langs=("spa", "fra", "deu", "ita", "hin", "cmn"),
    out_dir="/content",
):
    """Translate one speech recording into several languages and save the audio.

    Calls the module-level ``translator`` in speech-to-speech (``"s2st"``) mode
    once per target language, prints the translated text, and writes the
    generated speech to ``<out_dir>/translated_LJ_<lang>.wav``.

    README:
    https://github.com/facebookresearch/seamless_communication/tree/main/src/seamless_communication/cli/m4t/predict
    Use audio shorter than 20 seconds for optimal performance, and resample to
    16 kHz first if the sample rate differs, e.g.
    ``torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)``.

    Args:
        in_file: Path to the input WAV file. An empty value selects the default
            sample path ``content/LJ037-0171_sr16k.wav``.
        play_input: If true, play the input recording before translating.
        play_output: If true, play each translated recording after saving it.
        tgt_langs: Iterable of target language codes passed to the translator
            as ``tgt_lang``.
        out_dir: Directory that receives the translated WAV files. It is
            created if it does not exist.

    Raises:
        FileNotFoundError: If ``in_file`` does not exist.
    """
    if not in_file:
        in_file = "content/LJ037-0171_sr16k.wav"

    # Fail before any inference runs. Previously the check only happened when
    # playback was requested, and even then execution continued into the
    # translator with a path already known to be missing.
    if not os.path.exists(in_file):
        raise FileNotFoundError(f"File not found: {in_file}")

    if play_input:
        play(AudioSegment.from_wav(in_file))

    # Create the destination up front so a missing directory is reported
    # before the first translation rather than at save time.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for tgt_lang in tgt_langs:
        text_output, speech_output = translator.predict(
            input=in_file,
            task_str="s2st",
            tgt_lang=tgt_lang,
        )
        print(f"Translated text in {tgt_lang}: {text_output[0]}")
        print()

        out_file = os.path.join(out_dir, f"translated_LJ_{tgt_lang}.wav")
        # Cast to float32 and move to the CPU before handing the waveform to
        # torchaudio for writing.
        waveform = speech_output.audio_wavs[0][0].to(torch.float32).cpu()
        torchaudio.save(out_file, waveform, speech_output.sample_rate)

        print(f"Translated audio in {tgt_lang}:")
        if play_output:
            play(AudioSegment.from_wav(out_file))
if __name__ == "__main__":
    # Report the runtime environment; each availability probe is evaluated
    # once and reused for device selection below.
    print("Pytorch Version: ", torch.__version__)
    has_cuda = torch.cuda.is_available()
    print("CUDA Available: ", has_cuda)  # This should be False on M1 Macs
    has_mps = torch.backends.mps.is_available()
    print("MPS Available: ", has_mps)  # This should be True if using macOS 12+ and PyTorch 1.12+

    # Choose the multitask model and the vocoder that goes with it.
    model_name = "seamlessM4T_v2_large"
    if model_name == "seamlessM4T_v2_large":
        vocoder_name = "vocoder_v2"
    else:
        vocoder_name = "vocoder_36langs"

    # Prefer CUDA (half precision), then Apple MPS, then CPU (both float32).
    if has_cuda:
        device, dtype = torch.device("cuda:0"), torch.float16
    elif has_mps:
        device, dtype = torch.device("mps"), torch.float32
    else:
        device, dtype = torch.device("cpu"), torch.float32

    # `translator` must stay a module-level name: s2st_inference reads it as a
    # global.
    translator = Translator(model_name, vocoder_name, device=device, dtype=dtype)

    s2st_inference(play_output=True)