From 6da7db9ea4f3b362841489145b914b5105c0a4d0 Mon Sep 17 00:00:00 2001 From: Sihan Chen <39623753+Spycsh@users.noreply.github.com> Date: Sat, 12 Oct 2024 10:44:46 +0800 Subject: [PATCH] Add GPT-SoVITS microservice (#784) * add gpt-sovits microservice * add readme * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix eol --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../workflows/docker/compose/tts-compose.yaml | 4 ++ comps/tts/gpt-sovits/Dockerfile | 32 +++++++++++ comps/tts/gpt-sovits/README.md | 56 +++++++++++++++++++ comps/tts/gpt-sovits/__init__.py | 2 + 4 files changed, 94 insertions(+) create mode 100644 comps/tts/gpt-sovits/Dockerfile create mode 100644 comps/tts/gpt-sovits/README.md create mode 100644 comps/tts/gpt-sovits/__init__.py diff --git a/.github/workflows/docker/compose/tts-compose.yaml b/.github/workflows/docker/compose/tts-compose.yaml index 07a8b48692..d43fcbd790 100644 --- a/.github/workflows/docker/compose/tts-compose.yaml +++ b/.github/workflows/docker/compose/tts-compose.yaml @@ -16,3 +16,7 @@ services: build: dockerfile: comps/tts/speecht5/dependency/Dockerfile.intel_hpu image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest} + gpt-sovits: + build: + dockerfile: comps/tts/gpt-sovits/Dockerfile + image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest} diff --git a/comps/tts/gpt-sovits/Dockerfile b/comps/tts/gpt-sovits/Dockerfile new file mode 100644 index 0000000000..3f2ca72dbe --- /dev/null +++ b/comps/tts/gpt-sovits/Dockerfile @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.10-slim +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +# Install system dependencies; apt lists and the pip cache are dropped in this same layer so they never reach the image +RUN apt-get update && \ + apt-get install -y ffmpeg git-lfs git wget vim build-essential && \ + rm -rf /var/lib/apt/lists/* && pip install --no-cache-dir --upgrade pip + +# Clone source repo +RUN git clone 
https://github.com/RVC-Boss/GPT-SoVITS.git +# Download pre-trained models and prepare env in ONE layer: files removed or moved in a later layer still occupy the earlier one +RUN git clone https://huggingface.co/lj1995/GPT-SoVITS pretrained_models && \ + mv pretrained_models/* GPT-SoVITS/GPT_SoVITS/pretrained_models/ && \ + rm -rf pretrained_models && \ + pip install --no-cache-dir -r GPT-SoVITS/requirements.txt && \ + python -m nltk.downloader averaged_perceptron_tagger_eng cmudict && \ + mv GPT-SoVITS /home/user/ + +# USER user +# ENV LANG=C.UTF-8 + +WORKDIR /home/user/GPT-SoVITS + +RUN wget "https://github.com/intel/intel-extension-for-transformers/raw/refs/heads/main/intel_extension_for_transformers/neural_chat/assets/audio/welcome_cn.wav" + +EXPOSE 9880 +ENTRYPOINT ["python", "api.py", "--default_refer_path", "./welcome_cn.wav", "--default_refer_text", "欢迎使用", "--default_refer_language", "zh"] \ No newline at end of file diff --git a/comps/tts/gpt-sovits/README.md b/comps/tts/gpt-sovits/README.md new file mode 100644 index 0000000000..0823da5e5b --- /dev/null +++ b/comps/tts/gpt-sovits/README.md @@ -0,0 +1,56 @@ +# GPT-SoVITS Microservice + +[GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) allows you to do zero-shot voice cloning and text-to-speech in multiple languages, such as English, Japanese, Korean, Cantonese and Chinese. + +This microservice is validated on Xeon/CUDA. HPU support is under development. + +## Build the Image + +```bash +docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/gpt-sovits/Dockerfile . 
+``` + +## Start the Service + +```bash +docker run -itd -p 9880:9880 -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/gpt-sovits:latest +``` + +## Test + +- Chinese only + +```bash +curl localhost:9880/ -XPOST -d '{ + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh" +}' --output out.wav +``` + +- English only + +```bash +curl localhost:9880/ -XPOST -d '{ + "text": "Discuss the evolution of text-to-speech (TTS) technology from its early beginnings to the present day. Highlight the advancements in natural language processing that have contributed to more realistic and human-like speech synthesis. Also, explore the various applications of TTS in education, accessibility, and customer service, and predict future trends in this field. Write a comprehensive overview of text-to-speech (TTS) technology.", + "text_language": "en" +}' --output out.wav +``` + +- Auto detection of languages + +```bash +curl localhost:9880/ -XPOST -d '{ + "text": "Hi 你好,这里是一个 cross-lingual 的例子。", + "text_language": "auto" +}' --output out.wav +``` + +- Change reference audio + +```bash +curl localhost:9880/change_refer -d '{ + "refer_wav_path": "path_to_your_audio.wav", + "prompt_text": "transcription_of_your_audio", + "prompt_language": "language_of_your_audio" +}' +``` diff --git a/comps/tts/gpt-sovits/__init__.py b/comps/tts/gpt-sovits/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/tts/gpt-sovits/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0