Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[server] add stream tts server #1652

Merged
merged 3 commits into from
Apr 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions paddlespeech/server/bin/paddlespeech_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
from ..util import stats_wrapper
from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import init_engine_pool
from paddlespeech.server.restful.api import setup_router
from paddlespeech.server.restful.api import setup_router as setup_http_router
from paddlespeech.server.utils.config import get_config
from paddlespeech.server.ws.api import setup_router as setup_ws_router

__all__ = ['ServerExecutor', 'ServerStatsExecutor']

Expand Down Expand Up @@ -63,7 +64,12 @@ def init(self, config) -> bool:
"""
# init api
api_list = list(engine.split("_")[0] for engine in config.engine_list)
api_router = setup_router(api_list)
if config.protocol == "websocket":
api_router = setup_ws_router(api_list)
elif config.protocol == "http":
api_router = setup_http_router(api_list)
else:
raise Exception("unsupported protocol")
app.include_router(api_router)

if not init_engine_pool(config):
Expand Down
46 changes: 46 additions & 0 deletions paddlespeech/server/conf/tts_online_application.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# This is the parameter configuration file for PaddleSpeech Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 127.0.0.1
port: 8092

# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_online', 'tts_online']
# protocol = ['websocket', 'http'] (only one can be selected).
protocol: 'http'
engine_list: ['tts_online']


#################################################################################
# ENGINE CONFIG #
#################################################################################

################################### TTS #########################################
################### speech task: tts; engine_type: online #######################
tts_online:
# am (acoustic model) choices=['fastspeech2_csmsc']
am: 'fastspeech2_csmsc'
am_config:
am_ckpt:
am_stat:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0

# voc (vocoder) choices=['mb_melgan_csmsc']
voc: 'mb_melgan_csmsc'
voc_config:
voc_ckpt:
voc_stat:

# others
lang: 'zh'
device: # set 'gpu:id' or 'cpu'
am_block: 42
am_pad: 12
voc_block: 14
voc_pad: 14

18 changes: 2 additions & 16 deletions paddlespeech/server/engine/asr/online/asr_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import pcm2float
from paddlespeech.server.utils.paddle_predictor import init_predictor

__all__ = ['ASREngine']
Expand Down Expand Up @@ -222,21 +223,6 @@ def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str):
else:
raise Exception("invalid model name")

def _pcm16to32(self, audio):
"""pcm int16 to float32

Args:
audio(numpy.array): numpy.int16

Returns:
audio(numpy.array): numpy.float32
"""
if audio.dtype == np.int16:
audio = audio.astype("float32")
bits = np.iinfo(np.int16).bits
audio = audio / (2**(bits - 1))
return audio

def extract_feat(self, samples, sample_rate):
"""extract feat

Expand All @@ -249,7 +235,7 @@ def extract_feat(self, samples, sample_rate):
x_chunk_lens (numpy.array): shape[B]
"""
# pcm16 -> pcm 32
samples = self._pcm16to32(samples)
samples = pcm2float(samples)

# read audio
speech_segment = SpeechSegment.from_pcm(
Expand Down
3 changes: 3 additions & 0 deletions paddlespeech/server/engine/engine_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def get_engine(engine_name: Text, engine_type: Text):
elif engine_name == 'tts' and engine_type == 'python':
from paddlespeech.server.engine.tts.python.tts_engine import TTSEngine
return TTSEngine()
elif engine_name == 'tts' and engine_type == 'online':
from paddlespeech.server.engine.tts.online.tts_engine import TTSEngine
return TTSEngine()
elif engine_name == 'cls' and engine_type == 'inference':
from paddlespeech.server.engine.cls.paddleinference.cls_engine import CLSEngine
return CLSEngine()
Expand Down
13 changes: 13 additions & 0 deletions paddlespeech/server/engine/tts/online/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
220 changes: 220 additions & 0 deletions paddlespeech/server/engine/tts/online/tts_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import time

import numpy as np
import paddle

from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import get_chunks

__all__ = ['TTSEngine']


class TTSServerExecutor(TTSExecutor):
def __init__(self):
super().__init__()
pass

@paddle.no_grad()
def infer(
self,
text: str,
lang: str='zh',
am: str='fastspeech2_csmsc',
spk_id: int=0,
am_block: int=42,
am_pad: int=12,
voc_block: int=14,
voc_pad: int=14, ):
"""
Model inference and result stored in self.output.
"""
am_name = am[:am.rindex('_')]
am_dataset = am[am.rindex('_') + 1:]
get_tone_ids = False
merge_sentences = False
frontend_st = time.time()
if lang == 'zh':
input_ids = self.frontend.get_input_ids(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
elif lang == 'en':
input_ids = self.frontend.get_input_ids(
text, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in {'zh', 'en'}!")
self.frontend_time = time.time() - frontend_st

for i in range(len(phone_ids)):
am_st = time.time()
part_phone_ids = phone_ids[i]
# am
if am_name == 'speedyspeech':
part_tone_ids = tone_ids[i]
zh794390558 marked this conversation as resolved.
Show resolved Hide resolved
mel = self.am_inference(part_phone_ids, part_tone_ids)
# fastspeech2
else:
# multi speaker
if am_dataset in {"aishell3", "vctk"}:
mel = self.am_inference(
part_phone_ids, spk_id=paddle.to_tensor(spk_id))
else:
mel = self.am_inference(part_phone_ids)
am_et = time.time()

# voc streaming
voc_upsample = self.voc_config.n_shift
mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc")
chunk_num = len(mel_chunks)
voc_st = time.time()
for i, mel_chunk in enumerate(mel_chunks):
sub_wav = self.voc_inference(mel_chunk)
front_pad = min(i * voc_block, voc_pad)

if i == 0:
sub_wav = sub_wav[:voc_block * voc_upsample]
elif i == chunk_num - 1:
sub_wav = sub_wav[front_pad * voc_upsample:]
else:
sub_wav = sub_wav[front_pad * voc_upsample:(
front_pad + voc_block) * voc_upsample]

yield sub_wav


class TTSEngine(BaseEngine):
"""TTS server engine

Args:
metaclass: Defaults to Singleton.
"""

def __init__(self, name=None):
"""Initialize TTS server engine
"""
super(TTSEngine, self).__init__()

def init(self, config: dict) -> bool:
self.executor = TTSServerExecutor()
self.config = config
assert "fastspeech2_csmsc" in config.am and (
config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc"
), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.'
try:
if self.config.device:
self.device = self.config.device
else:
self.device = paddle.get_device()
paddle.set_device(self.device)
except Exception as e:
logger.error(
"Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
)
logger.error("Initialize TTS server engine Failed on device: %s." %
(self.device))
return False

try:
self.executor._init_from_path(
am=self.config.am,
am_config=self.config.am_config,
am_ckpt=self.config.am_ckpt,
am_stat=self.config.am_stat,
phones_dict=self.config.phones_dict,
tones_dict=self.config.tones_dict,
speaker_dict=self.config.speaker_dict,
voc=self.config.voc,
voc_config=self.config.voc_config,
voc_ckpt=self.config.voc_ckpt,
voc_stat=self.config.voc_stat,
lang=self.config.lang)
except Exception as e:
logger.error("Failed to get model related files.")
logger.error("Initialize TTS server engine Failed on device: %s." %
(self.device))
return False

self.am_block = self.config.am_block
self.am_pad = self.config.am_pad
self.voc_block = self.config.voc_block
self.voc_pad = self.config.voc_pad

logger.info("Initialize TTS server engine successfully on device: %s." %
(self.device))
return True

def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
# Convert byte to text
if text_bese64:
text_bytes = base64.b64decode(text_bese64) # base64 to bytes
text = text_bytes.decode('utf-8') # bytes to text

return text

def run(self,
sentence: str,
spk_id: int=0,
speed: float=1.0,
volume: float=1.0,
sample_rate: int=0,
save_path: str=None):
""" run include inference and postprocess.

Args:
sentence (str): text to be synthesized
spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
speed (float, optional): speed. Defaults to 1.0.
volume (float, optional): volume. Defaults to 1.0.
sample_rate (int, optional): target sample rate for synthesized audio,
0 means the same as the model sampling rate. Defaults to 0.
save_path (str, optional): The save path of the synthesized audio.
None means do not save audio. Defaults to None.

Returns:
wav_base64: The base64 format of the synthesized audio.
"""

lang = self.config.lang
wav_list = []

for wav in self.executor.infer(
text=sentence,
lang=lang,
am=self.config.am,
spk_id=spk_id,
am_block=self.am_block,
am_pad=self.am_pad,
voc_block=self.voc_block,
voc_pad=self.voc_pad):
# wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
wav = float2pcm(wav) # float32 to int16
wav_bytes = wav.tobytes() # to bytes
wav_base64 = base64.b64encode(wav_bytes).decode('utf8') # to base64
wav_list.append(wav)

yield wav_base64

wav_all = np.concatenate(wav_list, axis=0)
logger.info("The durations of audio is: {} s".format(
len(wav_all) / self.executor.am_config.fs))
12 changes: 12 additions & 0 deletions paddlespeech/server/restful/tts_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import Union

from fastapi import APIRouter
from fastapi.responses import StreamingResponse

from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import get_engine_pool
Expand Down Expand Up @@ -125,3 +126,14 @@ def tts(request_body: TTSRequest):
traceback.print_exc()

return response


@router.post("/paddlespeech/streaming/tts")
async def stream_tts(request_body: TTSRequest):
text = request_body.text

engine_pool = get_engine_pool()
tts_engine = engine_pool['tts']
logger.info("Get tts engine successfully.")

return StreamingResponse(tts_engine.run(sentence=text))
Loading