tts.py
import sys
import numpy as np  # this goes into the vocaliser file
import torch  # same
from scipy.io.wavfile import write  # same
# import soundfile as sf  # ogg codec crashes on long files
# set up Tacotron 2 and WaveGlow from torch.hub
def initializeTTSEngine():
    tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
    # modified this to allow longer sentences, unsure if it did anything at all
    tacotron2.max_decoder_steps = 3000
    tacotron2 = tacotron2.to('cuda')
    tacotron2.eval()
    waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = waveglow.to('cuda')
    waveglow.eval()
    return tacotron2, waveglow
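# A minimal smoke test for the engine above -- hypothetical helper, not part
# of the original script. It assumes a CUDA-capable GPU and network access for
# the first torch.hub download, and reuses only calls that appear in this file.
def _smokeTestTTSEngine():
    tacotron2, waveglow = initializeTTSEngine()
    # same preprocessing as vocalise() below
    sequence = np.array(tacotron2.text_to_sequence("Hello world.", ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)
    with torch.no_grad():
        _, mel, _, _ = tacotron2.infer(sequence)
        audio = waveglow.infer(mel)
    write("smoke_test.wav", 22050, audio[0].data.cpu().numpy())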
# just stuff I found here https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/
# This does the actual text to speech
def vocalise(subsentences, outputFile, outputFormat, speed, intermediaryFormat, tacotron2, waveglow):
    print("[+] Starting TTS on " + outputFile.split("/")[-1])
    rate = 22050
    # load a saved state if any
    audio_numpy, currentSubsentenceIndex = loadVocaliseState(outputFile)
    try:  # this is here so we can save before Ctrl+C
        # process each subsentence as tacotron2 -> waveglow -> raw audio
        from tqdm import tqdm
        for text in tqdm(subsentences[currentSubsentenceIndex:],
                         dynamic_ncols=True,
                         initial=currentSubsentenceIndex,
                         total=len(subsentences)):
            # preprocessing
            sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]
            sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)
            # run the models
            with torch.no_grad():
                _, mel, _, _ = tacotron2.infer(sequence)
                audio = waveglow.infer(mel)
            audio_numpy = np.concatenate((audio_numpy, audio[0].data.cpu().numpy()))
            # save current position in case TTS is interrupted
            currentSubsentenceIndex = currentSubsentenceIndex + 1
        # save resulting wav file
        write(outputFile + intermediaryFormat, rate, audio_numpy)
        # these are just alternative ways to write files, none seemed too good
        # sf.write(outputFile+".wav", audio_numpy, rate, format="ogg")
        # writeAudio(outputFile+".wav", audio_numpy, rate)
        convertFormat(outputFile + intermediaryFormat, outputFormat, speed)
        cleanSaves(outputFile, intermediaryFormat)
    except KeyboardInterrupt:
        saveVocaliseState(outputFile, intermediaryFormat, audio_numpy, currentSubsentenceIndex, rate)
        sys.exit()
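# The caller is expected to pass short chunks in `subsentences`, since
# Tacotron 2 degrades on long inputs (hence max_decoder_steps above). A
# minimal sketch of one way to build them -- hypothetical helper, not part
# of the original script:
import re
def _splitIntoSubsentences(text, maxChars=128):
    # split on sentence-ending punctuation, then greedily pack chunks
    parts = re.split(r'(?<=[.!?;])\s+', text)
    chunks, current = [], ""
    for part in parts:
        if len(current) + len(part) + 1 > maxChars and current:
            chunks.append(current)
            current = part
        else:
            current = (current + " " + part).strip()
    if current:
        chunks.append(current)
    return chunks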
# Saving and loading progress to disk
import io
import json, zlib, base64
import os
def saveVocaliseState(outputFile, intermediaryFormat, audio_numpy, currentSubsentenceIndex, rate):
    # save the binary array as zlib-compressed, base64-encoded JSON
    memfile = io.BytesIO()
    np.save(memfile, audio_numpy)
    memfile.seek(0)
    saveDict = {
        "subsentenceIndex": currentSubsentenceIndex,
        "binaryAudio_numpy": base64.b64encode(zlib.compress(memfile.read())).decode('ascii')
    }
    with open(outputFile + '.sav', 'w') as f:
        json.dump(saveDict, f)
    # this means there was a partial TTS, so we also report where we left off
    print("\n[!] TTS interrupted, progress saved to " + outputFile + ".sav")
    print("[?] To continue from here, set the --output to the folder containing the .sav file")
    return
def loadVocaliseState(loadPath):
    loadPath = loadPath + ".sav"
    if os.path.exists(loadPath):
        with open(loadPath, 'r') as f:
            saveDict = json.load(f)
        memfile = io.BytesIO()
        memfile.write(zlib.decompress(base64.b64decode(saveDict["binaryAudio_numpy"])))
        memfile.seek(0)
        audio_numpy = np.load(memfile)
        print("[+] Found saved state at " + loadPath + ", continuing from subsentence " + str(saveDict["subsentenceIndex"]))
        return audio_numpy, saveDict["subsentenceIndex"]
    else:
        print("[+] Saved state not found at " + loadPath + ", starting from the beginning")
        # np.ndarray(1) would start the output with one uninitialized sample;
        # an empty array avoids a garbage click at the beginning of the audio
        audio_numpy = np.zeros(0, dtype=np.float32)
        currentSubsentenceIndex = 0
        return audio_numpy, currentSubsentenceIndex
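# A quick round-trip check of the .sav format above -- hypothetical test
# code, not part of the original script; it only calls the two functions
# defined in this file:
def _testSaveLoadRoundTrip():
    dummyAudio = np.zeros(22050, dtype=np.float32)  # one second of silence
    saveVocaliseState("roundtrip_test", ".wav", dummyAudio, 3, 22050)
    loadedAudio, loadedIndex = loadVocaliseState("roundtrip_test")
    assert loadedIndex == 3
    assert np.array_equal(loadedAudio, dummyAudio)
    os.remove("roundtrip_test.sav")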
def cleanSaves(outputFile, intermediaryFormat):
    if os.path.exists(outputFile + ".sav"):
        os.remove(outputFile + ".sav")
    if os.path.exists(outputFile + intermediaryFormat + ".part"):
        os.remove(outputFile + intermediaryFormat + ".part")
    print("[+] Partial files " + outputFile + ".sav, " + outputFile + intermediaryFormat + ".part and " + outputFile + intermediaryFormat + " removed")
import ffmpeg
def convertFormat(sourceFile, format, speed):
    sourceSize = os.stat(sourceFile).st_size
    stream = ffmpeg.input(sourceFile)
    stream = stream.audio.filter("atempo", speed)  # e.g. speed=0.9 slows it down by 10%
    stream = ffmpeg.output(stream, os.path.splitext(sourceFile)[0] + format)
    stream = stream.global_args('-loglevel', 'quiet')
    stream = stream.global_args('-y')
    ffmpeg.run(stream)
    convertedSize = os.stat(os.path.splitext(sourceFile)[0] + format).st_size
    os.remove(sourceFile)
    print("[+] Converted " + sourceFile + " (" + str(round(sourceSize/1024**2, 2)) + "MB) to " +
          os.path.splitext(sourceFile)[0] + format + " (" + str(round(convertedSize/1024**2, 2)) +
          "MB) and removed source file")