diff --git a/.github/workflows/apk-tts-engine.yaml b/.github/workflows/apk-tts-engine.yaml index b8614cb76..68fdaa05d 100644 --- a/.github/workflows/apk-tts-engine.yaml +++ b/.github/workflows/apk-tts-engine.yaml @@ -26,6 +26,7 @@ jobs: total: ["40"] index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39"] + steps: - uses: actions/checkout@v4 with: diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index b95ad7d78..5119a50b2 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -183,6 +183,8 @@ class MainActivity : AppCompatActivity() { private fun initTts() { var modelDir: String? var modelName: String? + var acousticModelName: String? + var vocoder: String? var ruleFsts: String? var ruleFars: String? var lexicon: String? 
@@ -193,8 +195,18 @@ class MainActivity : AppCompatActivity() { // The purpose of such a design is to make the CI test easier // Please see // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py - modelDir = null + + // VITS -- begin modelName = null + // VITS -- end + + // Matcha -- begin + acousticModelName = null + vocoder = null + // Matcha -- end + + + modelDir = null ruleFsts = null ruleFars = null lexicon = null @@ -217,7 +229,6 @@ class MainActivity : AppCompatActivity() { // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 // modelDir = "vits-icefall-zh-aishell3" // modelName = "model.onnx" - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" // ruleFars = "vits-icefall-zh-aishell3/rule.far" // lexicon = "lexicon.txt" @@ -233,24 +244,47 @@ class MainActivity : AppCompatActivity() { // modelDir = "vits-coqui-de-css10" // modelName = "model.onnx" + // Example 6 + // vits-melo-tts-zh_en + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker + // modelDir = "vits-melo-tts-zh_en" + // modelName = "model.onnx" + // lexicon = "lexicon.txt" + // dictDir = "vits-melo-tts-zh_en/dict" + + // Example 7 + // matcha-icefall-zh-baker + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker + // modelDir = "matcha-icefall-zh-baker" + // acousticModelName = "model-steps-3.onnx" + // vocoder = "hifigan_v2.onnx" + // lexicon = "lexicon.txt" + // dictDir = "matcha-icefall-zh-baker/dict" + + // Example 8 + // matcha-icefall-en_US-ljspeech + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker + // modelDir = "matcha-icefall-en_US-ljspeech" + // acousticModelName = 
"model-steps-3.onnx" + // vocoder = "hifigan_v2.onnx" + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" + if (dataDir != null) { - val newDir = copyDataDir(modelDir!!) - modelDir = newDir + "/" + modelDir - dataDir = newDir + "/" + dataDir - assets = null + val newDir = copyDataDir(dataDir!!) + dataDir = "$newDir/$dataDir" } if (dictDir != null) { - val newDir = copyDataDir(modelDir!!) - modelDir = newDir + "/" + modelDir - dictDir = modelDir + "/" + "dict" + val newDir = copyDataDir(dictDir!!) + dictDir = "$newDir/$dictDir" ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" - assets = null } val config = getOfflineTtsConfig( modelDir = modelDir!!, - modelName = modelName!!, + modelName = modelName ?: "", + acousticModelName = acousticModelName ?: "", + vocoder = vocoder ?: "", lexicon = lexicon ?: "", dataDir = dataDir ?: "", dictDir = dictDir ?: "", diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt index 9a6bd47ab..f7e34c5dd 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt @@ -57,7 +57,7 @@ class MainActivity : ComponentActivity() { color = MaterialTheme.colorScheme.background ) { Scaffold(topBar = { - TopAppBar(title = { Text("Next-gen Kaldi: TTS") }) + TopAppBar(title = { Text("Next-gen Kaldi: TTS Engine") }) }) { Box(modifier = Modifier.padding(it)) { Column(modifier = Modifier.padding(16.dp)) { @@ -65,8 +65,8 @@ class MainActivity : ComponentActivity() { Text("Speed " + String.format("%.1f", TtsEngine.speed)) Slider( value = TtsEngine.speedState.value, - onValueChange = { - TtsEngine.speed = it + onValueChange = { + TtsEngine.speed = it preferenceHelper.setSpeed(it) }, valueRange = 0.2F..3.0F, @@ -138,7 +138,9 @@ 
class MainActivity : ComponentActivity() { val filename = application.filesDir.absolutePath + "/generated.wav" val ok = - audio.samples.isNotEmpty() && audio.save(filename) + audio.samples.isNotEmpty() && audio.save( + filename + ) if (ok) { stopMediaPlayer() diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt index 480f8a384..cec07ffd5 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt @@ -1,5 +1,6 @@ package com.k2fsa.sherpa.onnx.tts.engine +import PreferenceHelper import android.content.Context import android.content.res.AssetManager import android.util.Log @@ -11,7 +12,6 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig import java.io.File import java.io.FileOutputStream import java.io.IOException -import PreferenceHelper object TtsEngine { var tts: OfflineTts? = null @@ -41,6 +41,8 @@ object TtsEngine { private var modelDir: String? = null private var modelName: String? = null + private var acousticModelName: String? = null + private var vocoder: String? = null private var ruleFsts: String? = null private var ruleFars: String? = null private var lexicon: String? 
= null @@ -52,8 +54,17 @@ object TtsEngine { // The purpose of such a design is to make the CI test easier // Please see // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py - modelDir = null + // + // For VITS -- begin modelName = null + // For VITS -- end + + // For Matcha -- begin + acousticModelName = null + vocoder = null + // For Matcha -- end + + modelDir = null ruleFsts = null ruleFars = null lexicon = null @@ -82,7 +93,6 @@ object TtsEngine { // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 // modelDir = "vits-icefall-zh-aishell3" // modelName = "model.onnx" - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" // ruleFars = "vits-icefall-zh-aishell3/rule.far" // lexicon = "lexicon.txt" // lang = "zho" @@ -101,8 +111,35 @@ object TtsEngine { // modelDir = "vits-coqui-de-css10" // modelName = "model.onnx" // lang = "deu" - } + // Example 6 + // vits-melo-tts-zh_en + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker + // modelDir = "vits-melo-tts-zh_en" + // modelName = "model.onnx" + // lexicon = "lexicon.txt" + // dictDir = "vits-melo-tts-zh_en/dict" + // lang = "zho" + + // Example 7 + // matcha-icefall-zh-baker + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker + // modelDir = "matcha-icefall-zh-baker" + // acousticModelName = "model-steps-3.onnx" + // vocoder = "hifigan_v2.onnx" + // lexicon = "lexicon.txt" + // dictDir = "matcha-icefall-zh-baker/dict" + // lang = "zho" + + // Example 8 + // matcha-icefall-en_US-ljspeech + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker + // modelDir = "matcha-icefall-en_US-ljspeech" + // 
acousticModelName = "model-steps-3.onnx" + // vocoder = "hifigan_v2.onnx" + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" + // lang = "eng" + } fun createTts(context: Context) { Log.i(TAG, "Init Next-gen Kaldi TTS") @@ -115,22 +152,22 @@ object TtsEngine { assets = context.assets if (dataDir != null) { - val newDir = copyDataDir(context, modelDir!!) - modelDir = "$newDir/$modelDir" + val newDir = copyDataDir(context, dataDir!!) dataDir = "$newDir/$dataDir" - assets = null } if (dictDir != null) { - val newDir = copyDataDir(context, modelDir!!) - modelDir = "$newDir/$modelDir" - dictDir = "$modelDir/dict" + val newDir = copyDataDir(context, dictDir!!) + dictDir = "$newDir/$dictDir" ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" - assets = null } val config = getOfflineTtsConfig( - modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", + modelDir = modelDir!!, + modelName = modelName ?: "", + acousticModelName = acousticModelName ?: "", + vocoder = vocoder ?: "", + lexicon = lexicon ?: "", dataDir = dataDir ?: "", dictDir = dictDir ?: "", ruleFsts = ruleFsts ?: "", diff --git a/scripts/apk/build-apk-tts-engine.sh.in b/scripts/apk/build-apk-tts-engine.sh.in index c611c061b..69933d2fc 100644 --- a/scripts/apk/build-apk-tts-engine.sh.in +++ b/scripts/apk/build-apk-tts-engine.sh.in @@ -37,6 +37,8 @@ mkdir -p apks pushd ./android/SherpaOnnxTtsEngine/app/src/main/assets/ model_dir={{ tts_model.model_dir }} model_name={{ tts_model.model_name }} +acoustic_model_name={{ tts_model.acoustic_model_name }} +vocoder={{ tts_model.vocoder }} lang={{ tts_model.lang }} lang_iso_639_3={{ tts_model.lang_iso_639_3 }} @@ -44,15 +46,30 @@ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$mod tar xf $model_dir.tar.bz2 rm $model_dir.tar.bz2 +{% if tts_model.vocoder %} + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder +{% endif %} + popd # Now we are at the project 
root directory git checkout . pushd android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./TtsEngine.kt -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt +{% if tts_model.model_name %} + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt +{% endif %} + +{% if tts_model.acoustic_model_name %} + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./TtsEngine.kt +{% endif %} + +{% if tts_model.vocoder %} + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./TtsEngine.kt +{% endif %} + {% if tts_model.rule_fsts %} rule_fsts={{ tts_model.rule_fsts }} sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt @@ -109,6 +126,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do done rm -rf ./android/SherpaOnnxTtsEngine/app/src/main/assets/$model_dir +rm -fv ./android/SherpaOnnxTtsEngine/app/src/main/assets/*.onnx {% endfor %} git checkout . diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in index 2e62ad636..34135f1a1 100644 --- a/scripts/apk/build-apk-tts.sh.in +++ b/scripts/apk/build-apk-tts.sh.in @@ -37,19 +37,38 @@ mkdir -p apks pushd ./android/SherpaOnnxTts/app/src/main/assets/ model_dir={{ tts_model.model_dir }} model_name={{ tts_model.model_name }} +acoustic_model_name={{ tts_model.acoustic_model_name }} +vocoder={{ tts_model.vocoder }} lang={{ tts_model.lang }} wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 tar xf $model_dir.tar.bz2 rm $model_dir.tar.bz2 +{% if tts_model.vocoder %} + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder +{% endif %} + popd # Now we are at the project root directory git checkout . 
pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt + + +{% if tts_model.model_name %} + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt +{% endif %} + +{% if tts_model.acoustic_model_name %} + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./MainActivity.kt +{% endif %} + +{% if tts_model.vocoder %} + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./MainActivity.kt +{% endif %} + {% if tts_model.rule_fsts %} rule_fsts={{ tts_model.rule_fsts }} @@ -107,6 +126,8 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do done rm -rf ./android/SherpaOnnxTts/app/src/main/assets/$model_dir +rm -fv ./android/SherpaOnnxTts/app/src/main/assets/*.onnx + {% endfor %} git checkout . diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 1aa034945..1d804ecf9 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -30,7 +30,9 @@ def get_args(): @dataclass class TtsModel: model_dir: str - model_name: str = "" + model_name: str = "" # for vits + acoustic_model_name: str = "" # for matcha + vocoder: str = "" # for matcha lang: str = "" # en, zh, fr, de, etc. 
rule_fsts: Optional[List[str]] = None rule_fars: Optional[List[str]] = None @@ -378,6 +380,35 @@ def get_vits_models() -> List[TtsModel]: return all_models +def get_matcha_models() -> List[TtsModel]: + chinese_models = [ + TtsModel( + model_dir="matcha-icefall-zh-baker", + acoustic_model_name="model-steps-3.onnx", + lang="zh", + ) + ] + rule_fsts = ["phone.fst", "date.fst", "number.fst"] + for m in chinese_models: + s = [f"{m.model_dir}/{r}" for r in rule_fsts] + m.rule_fsts = ",".join(s) + m.dict_dir = m.model_dir + "/dict" + m.vocoder = "hifigan_v2.onnx" + + english_models = [ + TtsModel( + model_dir="matcha-icefall-en_US-ljspeech", + acoustic_model_name="model-steps-3.onnx", + lang="en", + ) + ] + for m in english_models: + m.data_dir = f"{m.model_dir}/espeak-ng-data" + m.vocoder = "hifigan_v2.onnx" + + return chinese_models + english_models + + def main(): args = get_args() index = args.index @@ -389,7 +420,10 @@ def main(): all_model_list += get_piper_models() all_model_list += get_mimic3_models() all_model_list += get_coqui_models() + all_model_list += get_matcha_models() + convert_lang_to_iso_639_3(all_model_list) + print(all_model_list) num_models = len(all_model_list) diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 1cc8d5f95..72146b02c 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -348,6 +348,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { mgr, config_.model.vits.lexicon, config_.model.vits.tokens, config_.model.vits.dict_dir, model_->GetMetaData(), config_.model.debug); + } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { + frontend_ = std::make_unique<JiebaLexicon>( + mgr, config_.model.vits.lexicon, config_.model.vits.tokens, + config_.model.vits.dict_dir, config_.model.debug); } else if (meta_data.is_melo_tts && meta_data.language == "English") { frontend_ = std::make_unique<MeloTtsLexicon>( mgr, config_.model.vits.lexicon, 
config_.model.vits.tokens, diff --git a/sherpa-onnx/kotlin-api/Tts.kt b/sherpa-onnx/kotlin-api/Tts.kt index 231b87d81..98efe6644 100644 --- a/sherpa-onnx/kotlin-api/Tts.kt +++ b/sherpa-onnx/kotlin-api/Tts.kt @@ -173,22 +173,55 @@ class OfflineTts( // to download models fun getOfflineTtsConfig( modelDir: String, - modelName: String, + modelName: String, // for VITS + acousticModelName: String, // for Matcha + vocoder: String, // for Matcha lexicon: String, dataDir: String, dictDir: String, ruleFsts: String, ruleFars: String ): OfflineTtsConfig { + if (modelName.isEmpty() && acousticModelName.isEmpty()) { + throw IllegalArgumentException("Please specify a TTS model") + } + + if (modelName.isNotEmpty() && acousticModelName.isNotEmpty()) { + throw IllegalArgumentException("Please specify either a VITS or a Matcha model, but not both") + } + + if (acousticModelName.isNotEmpty() && vocoder.isEmpty()) { + throw IllegalArgumentException("Please provide vocoder for Matcha TTS") + } + val vits = if (modelName.isNotEmpty()) { + OfflineTtsVitsModelConfig( + model = "$modelDir/$modelName", + lexicon = "$modelDir/$lexicon", + tokens = "$modelDir/tokens.txt", + dataDir = dataDir, + dictDir = dictDir, + ) + } else { + OfflineTtsVitsModelConfig() + } + + val matcha = if (acousticModelName.isNotEmpty()) { + OfflineTtsMatchaModelConfig( + acousticModel = "$modelDir/$acousticModelName", + vocoder = vocoder, + lexicon = "$modelDir/$lexicon", + tokens = "$modelDir/tokens.txt", + dictDir = dictDir, + dataDir = dataDir, + ) + } else { + OfflineTtsMatchaModelConfig() + } + return OfflineTtsConfig( model = OfflineTtsModelConfig( - vits = OfflineTtsVitsModelConfig( - model = "$modelDir/$modelName", - lexicon = "$modelDir/$lexicon", - tokens = "$modelDir/tokens.txt", - dataDir = dataDir, - dictDir = dictDir, - ), + vits = vits, + matcha = matcha, numThreads = 2, debug = true, provider = "cpu",