forked from DigitalPhonetics/IMS-Toucan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_text_to_file_reader.py
165 lines (152 loc) · 6.24 KB
/
run_text_to_file_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
import argparse
import torch
from Utility.storage_config import MODELS_DIR
from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
def read_texts(
    model_id,
    sentence,
    filename,
    device="cpu",
    language="en",
    lang_emb=None,
    speaker_reference=None,
    input_is_phones=False,
):
    """Read one or more sentences to a file through a PortaSpeechInterface.

    Args:
        model_id: Forwarded to ``PortaSpeechInterface`` as ``tts_model_path``.
        sentence: Either a single string or a list of strings. A single
            string is wrapped into a one-element list before reading.
        filename: Forwarded to ``read_to_file`` as ``file_location``.
        device: Forwarded to ``PortaSpeechInterface`` as ``device``.
        language: Language code handed to the interface constructor and to
            ``set_language``.
        lang_emb: When not ``None``, forwarded to ``set_language_embedding``.
        speaker_reference: When not ``None``, forwarded to
            ``set_utterance_embedding``.
        input_is_phones: Forwarded unchanged to ``read_to_file``.
    """
    tts = PortaSpeechInterface(
        device=device, tts_model_path=model_id, language=language
    )
    print(f"Instantiated a PortaSpeechInterface object with language {language}.")
    tts.set_language(language)
    if lang_emb is not None:
        tts.set_language_embedding(lang_emb)
    if speaker_reference is not None:
        tts.set_utterance_embedding(speaker_reference)
    # isinstance instead of an exact type comparison, so that str subclasses
    # are also treated as a single sentence rather than iterated per character.
    if isinstance(sentence, str):
        sentence = [sentence]
    tts.read_to_file(
        text_list=sentence, file_location=filename, input_is_phones=input_is_phones
    )
    # Drop the local reference to the interface explicitly once reading is done.
    del tts
def le_corbeau_et_le_renard(version, model_id="Meta", exec_device="cpu"):
    """Read the hard-coded French lines of "Le corbeau et le renard" to a file.

    The result is written to ``audios/Le_corbeau_et_le_renard_<version>.wav``;
    the ``audios`` directory is created if it does not exist yet.

    Args:
        version: Inserted into the output file name.
        model_id: Forwarded to ``read_texts`` as ``model_id``.
        exec_device: Forwarded to ``read_texts`` as ``device``.
    """
    fable_lines = [
        "Maître Corbeau, sur un arbre perché, tenait en son bec un fromage.",
        "Maître Renard, par l’odeur alléché, lui tint à peu près ce langage:",
        "Et bonjour, Monsieur du Corbeau, que vous êtes joli! que vous me semblez beau!",
        "Sans mentir, si votre ramage se rapporte à votre plumage, vous êtes le Phénix des hôtes de ces bois.",
        "À ces mots le Corbeau ne se sent pas de joie, et pour montrer sa belle voix, il ouvre un large bec, laisse tomber sa proie.",
        "Le Renard s’en saisit, et dit: Mon bon Monsieur, apprenez que tout flatteur vit aux dépens de celui qui l’écoute.",
        "Cette leçon vaut bien un fromage sans doute.",
        "Le Corbeau honteux et confus jura, mais un peu tard, qu’on ne l’y prendrait plus.",
    ]

    os.makedirs("audios", exist_ok=True)
    output_path = f"audios/Le_corbeau_et_le_renard_{version}.wav"

    read_texts(
        model_id=model_id,
        sentence=fable_lines,
        filename=output_path,
        device=exec_device,
        language="fr",
        speaker_reference=None,
    )
def the_raven(version, model_id="Meta", exec_device="cpu"):
    """Read the hard-coded English verses of "The Raven" to a file.

    The result is written to ``audios/the_raven_<version>.wav``; the
    ``audios`` directory is created if it does not exist yet.

    Args:
        version: Inserted into the output file name.
        model_id: Forwarded to ``read_texts`` as ``model_id``.
        exec_device: Forwarded to ``read_texts`` as ``device``.
    """
    verses = [
        "Once upon a midnight dreary, while I pondered, weak, and weary,",
        "Over many a quaint, and curious volume of forgotten lore,",
        "While I nodded, nearly napping, suddenly, there came a tapping,",
        "As of someone gently rapping, rapping at my chamber door.",
        "Tis some visitor, I muttered, tapping at my chamber door,",
        "Only this, and nothing more.",
        "Ah, distinctly, I remember, it was in, the bleak December,",
        "And each separate dying ember, wrought its ghost upon the floor.",
        "Eagerly, I wished the morrow, vainly, I had sought to borrow",
        "From my books surcease of sorrow, sorrow, for the lost Lenore,",
        "For the rare and radiant maiden, whom the angels name Lenore,",
        "Nameless here, for evermore.",
        "And the silken, sad, uncertain, rustling of each purple curtain",
        "Thrilled me, filled me, with fantastic terrors, never felt before",
    ]

    os.makedirs("audios", exist_ok=True)
    output_path = f"audios/the_raven_{version}.wav"

    read_texts(
        model_id=model_id,
        sentence=verses,
        filename=output_path,
        device=exec_device,
        language="en",
        speaker_reference=None,
    )
# Command-line entry point: parse the options, pick the text to read, derive
# an output file name from the options and hand everything to read_texts.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang_emb",
        help="Language embedding to use. Choices are accent-specific (e.g. 'en-gb') or just languages (e.g. 'en')",
    )
    parser.add_argument("--language", help="Language to use (e.g. 'en')")
    parser.add_argument("--model_id", help="Name of the model to use (e.g. 'Meta')")
    parser.add_argument(
        "--input_is_phones",
        help="Bool whether the input is phones",
        action="store_true",
    )
    parser.add_argument(
        "--speaker_reference", help="path of sample audio from reference speaker"
    )
    parser.add_argument("--text", help="text that should be synthesized to speech")
    args = parser.parse_args()

    exec_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"running on {exec_device}")

    # `or` keeps the original truthiness semantics: a missing OR empty option
    # falls back to the default.
    language = args.language or "en"
    lang_emb = args.lang_emb or "en"
    model_id = args.model_id or "Meta"

    # Built-in example sentences, used only when --text is not given.
    sentence_dict = {
        "de": [
            "Heute ist schönes Frühlingswetter.",
            "Die Sonne lacht.",
            "Am blauen Himmel ziehen die Wolken.",
            "Über die Felder weht ein Wind.",
            "Gestern stürmte es noch.",
            "Montag war es uns zu regnerisch.",
        ],
        "en": [
            "It is important to distinguish between dialect and accent.",
            "A dialect includes a certain variety of pronunciation, vocabulary, and grammar.",
            "In contrast, an accent only refers to the aspect of pronunciation.",
        ],
        "es": [
            "Hay gemas de gran valor en la tienda.",
            # NOTE(review): "unevo" looks like a typo for "nuevo"; left as-is
            # because it is runtime text -- confirm before changing.
            "Tienen un unevo puerto para botar barcos.",
            "Para hacer hielo, echa más agua.",
            "Mandó la postal en un sobre de papel grueso.",
        ],
    }

    if args.text:
        sentence = args.text
    elif language in sentence_dict:
        sentence = sentence_dict[language]
    elif language in ("en-gb", "en-us"):
        sentence = sentence_dict["en"]
    elif language[-2:] in sentence_dict:
        # works for language at-de, hi-en, eu-es...
        sentence = sentence_dict[language[-2:]]
    else:
        # Previously this case surfaced as a bare KeyError; report it as a
        # usage error that tells the user how to proceed instead.
        parser.error(
            f"no built-in example sentences for language '{language}'; "
            "supply the text to read with --text"
        )

    # set accent-specific language
    if language == "en" and lang_emb[:2] == "en":
        language = lang_emb

    # create output directory
    os.makedirs(f"audios/{model_id}", exist_ok=True)

    # The speaker tag is the name of the folder containing the reference
    # audio; os.path handles the platform's path separator.
    if args.speaker_reference:
        speaker = os.path.basename(os.path.dirname(args.speaker_reference)) + "-ref"
    else:
        speaker = ""

    filename = f"audios/{model_id}/{language}-lang_{lang_emb}-emb-{speaker}.wav"
    print(f"Writing file to path: {filename}")

    read_texts(
        model_id=model_id,
        sentence=sentence,
        filename=filename,
        language=language,
        lang_emb=lang_emb,
        input_is_phones=args.input_is_phones,
        speaker_reference=args.speaker_reference,
        device=exec_device,
    )