Merge pull request #248 from 2DIPW/dev-vits

feat: 增加VITS文本转语音的支持
wzpan · Apr 24, 2023 · e18bc67 · e18bc67
2 parents 57a7518 + ca3566f
commit e18bc67
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 1 deletion.
diff --git a/robot/TTS.py b/robot/TTS.py
@@ -16,7 +16,7 @@
 from pypinyin import lazy_pinyin
 from pydub import AudioSegment
 from abc import ABCMeta, abstractmethod
-from .sdk import TencentSpeech, AliSpeech, XunfeiSpeech, atc
+from .sdk import TencentSpeech, AliSpeech, XunfeiSpeech, atc, VITSClient
 import requests
 from xml.etree import ElementTree
 
@@ -408,6 +408,37 @@ def get_speech(self, phrase):
         else:
             logger.critical(f"{self.SLUG} 合成失败！", stack_info=True)
 
+class VITS(AbstractTTS):
+    """
+    VITS 语音合成
+    需要自行搭建vits-simple-api服务器：https://github.com/Artrajz/vits-simple-api
+    server_url : 服务器url，如http://127.0.0.1:23456
+    api_key : 若服务器配置了API Key，在此填入
+    speaker_id : 说话人ID，由所使用的模型决定
+    length : 调节语音长度，相当于调节语速，该数值越大语速越慢。
+    noise : 噪声
+    noisew : 噪声偏差
+    max : 分段阈值，按标点符号分段，加起来大于max时为一段文本。max<=0表示不分段。
+    timeout: 响应超时时间，根据vits-simple-api服务器性能不同配置合理的超时时间。
+    """
+
+    SLUG = "VITS"
+
+    def __init__(self, server_url, api_key, speaker_id, length, noise, noisew, max, timeout, **args):
+        super(self.__class__, self).__init__()
+        self.server_url, self.api_key, self.speaker_id, self.length, self.noise, self.noisew, self.max, self.timeout = (
+            server_url, api_key, speaker_id, length, noise, noisew, max, timeout)
+
+    @classmethod
+    def get_config(cls):
+        return config.get("VITS", {})
+
+    def get_speech(self, phrase):
+        result = VITSClient.tts(phrase, self.server_url, self.api_key, self.speaker_id, self.length, self.noise,
+                                self.noisew, self.max, self.timeout)
+        tmpfile = utils.write_temp_file(result, ".wav")
+        logger.info(f"{self.SLUG} 语音合成成功，合成路径：{tmpfile}")
+        return tmpfile
 
 def get_engine_by_slug(slug=None):
     """

diff --git a/robot/sdk/VITSClient.py b/robot/sdk/VITSClient.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+# !/usr/bin/env python3
+
+"""VITS TTS API"""
+
+import requests
+
+
+def tts(text, server_url, api_key, speaker_id, length, noise, noisew, max, timeout):
+    data = {
+        "text": text,
+        "id": speaker_id,
+        "format": "wav",
+        "lang": "auto",
+        "length": length,
+        "noise": noise,
+        "noisew": noisew,
+        "max": max
+    }
+    headers = {"X-API-KEY": api_key}
+    url = f"{server_url}/voice"
+    res = requests.post(url=url, data=data, headers=headers, timeout=timeout)
+    res.raise_for_status()
+    return res.content
diff --git a/static/default.yml b/static/default.yml
@@ -97,6 +97,7 @@ lru_cache:
 # azure-tts     - 微软语音合成
 # mac-tts       - macOS 系统自带TTS（mac 系统推荐）
 # edge-tts      - 基于 Edge 的 TTS（推荐）
+# VITS          - 基于 VITS 的AI语音合成
 tts_engine: edge-tts
 
 # 语音识别服务配置
@@ -179,6 +180,26 @@ edge-tts:
     # 中文推荐 `zh` 开头的音色
     voice: zh-CN-XiaoxiaoNeural
 
+# 基于 VITS 的AI语音合成
+VITS:
+    # 需要自行搭建vits-simple-api服务器：https://github.com/Artrajz/vits-simple-api
+    #    server_url: 服务器url（格式为http://{IP地址}:{端口}，不带最后的斜杠），如http://127.0.0.1:23456
+    #    api_key: 若服务器配置了API Key，在此填入
+    #    speaker_id: 说话人ID，由所使用的模型决定
+    #    length: 调节语音长度，相当于调节语速，该数值越大语速越慢。
+    #    noise: 噪声
+    #    noisew: 噪声偏差
+    #    max: 分段阈值，按标点符号分段，加起来大于max时为一段文本。max<=0表示不分段。
+    #    timeout: 响应超时时间（秒），根据vits-simple-api服务器性能不同配置合理的超时时间。
+    server_url: "http://127.0.0.1:23456"
+    api_key: "api_key"
+    speaker_id: 0
+    length: 1.0
+    noise: 0.667
+    noisew: 0.8
+    max: 50
+    timeout: 60
+
 # NLU 引擎
 # 可选值：
 # unit      - 百度 UNIT