From 6da7db9ea4f3b362841489145b914b5105c0a4d0 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Sat, 12 Oct 2024 10:44:46 +0800
Subject: [PATCH] Add GPT-SoVITS microservice (#784)

* add gpt-sovits microservice

* add readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix eol

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../workflows/docker/compose/tts-compose.yaml |  4 ++
 comps/tts/gpt-sovits/Dockerfile               | 32 +++++++++++
 comps/tts/gpt-sovits/README.md                | 56 +++++++++++++++++++
 comps/tts/gpt-sovits/__init__.py              |  2 +
 4 files changed, 94 insertions(+)
 create mode 100644 comps/tts/gpt-sovits/Dockerfile
 create mode 100644 comps/tts/gpt-sovits/README.md
 create mode 100644 comps/tts/gpt-sovits/__init__.py

diff --git a/.github/workflows/docker/compose/tts-compose.yaml b/.github/workflows/docker/compose/tts-compose.yaml
index 07a8b48692..d43fcbd790 100644
--- a/.github/workflows/docker/compose/tts-compose.yaml
+++ b/.github/workflows/docker/compose/tts-compose.yaml
@@ -16,3 +16,7 @@ services:
     build:
       dockerfile: comps/tts/speecht5/dependency/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
+  gpt-sovits:  # GPT-SoVITS TTS microservice image
+    build:
+      dockerfile: comps/tts/gpt-sovits/Dockerfile
+    image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
diff --git a/comps/tts/gpt-sovits/Dockerfile b/comps/tts/gpt-sovits/Dockerfile
new file mode 100644
index 0000000000..3f2ca72dbe
--- /dev/null
+++ b/comps/tts/gpt-sovits/Dockerfile
@@ -0,0 +1,32 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.10-slim
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+# Install system dependencies (the apt package index and pip cache are not kept in the layer)
+RUN apt-get update && \
+    apt-get install -y ffmpeg git-lfs git wget vim build-essential && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --no-cache-dir --upgrade pip
+
+# Clone the source repo and pre-trained models directly under /home/user and prepare the env in a
+# single layer: files moved or deleted by a later RUN would still be stored in the earlier layers.
+WORKDIR /home/user
+RUN git clone https://github.com/RVC-Boss/GPT-SoVITS.git && \
+    git clone https://huggingface.co/lj1995/GPT-SoVITS pretrained_models && \
+    mv pretrained_models/*  GPT-SoVITS/GPT_SoVITS/pretrained_models/ && \
+    rm -rf pretrained_models && \
+    pip install --no-cache-dir -r GPT-SoVITS/requirements.txt && \
+    python -m nltk.downloader averaged_perceptron_tagger_eng cmudict
+
+# USER user
+# ENV LANG=C.UTF-8
+
+WORKDIR /home/user/GPT-SoVITS
+
+RUN wget "https://github.com/intel/intel-extension-for-transformers/raw/refs/heads/main/intel_extension_for_transformers/neural_chat/assets/audio/welcome_cn.wav"
+
+ENTRYPOINT ["python", "api.py", "--default_refer_path", "./welcome_cn.wav", "--default_refer_text", "欢迎使用", "--default_refer_language", "zh"]
\ No newline at end of file
diff --git a/comps/tts/gpt-sovits/README.md b/comps/tts/gpt-sovits/README.md
new file mode 100644
index 0000000000..0823da5e5b
--- /dev/null
+++ b/comps/tts/gpt-sovits/README.md
@@ -0,0 +1,56 @@
+# GPT-SoVITS Microservice
+
+[GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) allows you to do zero-shot voice cloning and text-to-speech in multiple languages such as English, Japanese, Korean, Cantonese and Chinese.
+
+This microservice is validated on Xeon/CUDA. HPU support is under development.
+
+## Build the Image
+
+```bash
+docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/gpt-sovits/Dockerfile .
+```
+
+## Start the Service
+
+```bash
+docker run  -itd -p 9880:9880 -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/gpt-sovits:latest
+```
+
+## Test
+
+- Chinese only
+
+```bash
+curl localhost:9880/ -XPOST -d '{
+    "text": "先帝创业未半而中道崩殂，今天下三分，益州疲弊，此诚危急存亡之秋也。",
+    "text_language": "zh"
+}' --output out.wav
+```
+
+- English only
+
+```bash
+curl localhost:9880/ -XPOST -d '{
+    "text": "Discuss the evolution of text-to-speech (TTS) technology from its early beginnings to the present day. Highlight the advancements in natural language processing that have contributed to more realistic and human-like speech synthesis. Also, explore the various applications of TTS in education, accessibility, and customer service, and predict future trends in this field. Write a comprehensive overview of text-to-speech (TTS) technology.",
+    "text_language": "en"
+}' --output out.wav
+```
+
+- Auto detection of languages
+
+```bash
+curl localhost:9880/ -XPOST -d '{
+    "text": "Hi 你好，这里是一个 cross-lingual 的例子。",
+    "text_language": "auto"
+}' --output out.wav
+```
+
+- Change reference audio
+
+```bash
+curl localhost:9880/change_refer -d '{
+    "refer_wav_path": "path_to_your_audio.wav",
+    "prompt_text": "transcription_of_your_audio",
+    "prompt_language": "language_of_your_audio"
+}'
+```
diff --git a/comps/tts/gpt-sovits/__init__.py b/comps/tts/gpt-sovits/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/comps/tts/gpt-sovits/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0