Skip to content

Commit

Permalink
Support Matcha-TTS models using espeak-ng (#1672)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 2, 2025
1 parent 3422b93 commit f457bae
Show file tree
Hide file tree
Showing 10 changed files with 288 additions and 57 deletions.
22 changes: 22 additions & 0 deletions .github/scripts/test-offline-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,28 @@ which $EXE
# test waves are saved in ./tts
mkdir ./tts

# Test section for the English Matcha-TTS model (matcha-icefall-en_US-ljspeech),
# which is run with an espeak-ng data directory (--matcha-data-dir).
log "------------------------------------------------------------"
log "matcha-icefall-en_US-ljspeech"
log "------------------------------------------------------------"
# Download the model archive, unpack it, and delete the archive afterwards.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

# Download the vocoder model that is passed to --matcha-vocoder below.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

# Run the executable under test on one English sentence; the output file is
# requested under ./tts (created earlier in this script).
$EXE \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--num-threads=2 \
--output-filename=./tts/matcha-ljspeech-1.wav \
--debug=1 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

# Clean up the downloaded vocoder and the unpacked model directory.
rm hifigan_v2.onnx
rm -rf matcha-icefall-en_US-ljspeech

log "------------------------------------------------------------"
log "matcha-icefall-zh-baker"
log "------------------------------------------------------------"
Expand Down
25 changes: 23 additions & 2 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,27 @@ log "Offline TTS test"
# test waves are saved in ./tts
mkdir ./tts

log "vits-ljs test"
# Python API test for the English Matcha-TTS model
# (matcha-icefall-en_US-ljspeech) with an espeak-ng data directory.
log "matcha-ljspeech-en test"

# Download the model archive, unpack it, and delete the archive afterwards.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

# Download the vocoder model that is passed to --matcha-vocoder below.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

# Run the Python example script on one English sentence; the output file is
# requested under ./tts (created earlier in this script).
python3 ./python-api-examples/offline-tts.py \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--output-filename=./tts/test-matcha-ljspeech-en.wav \
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."

# Clean up the downloaded vocoder and the unpacked model directory.
rm hifigan_v2.onnx
rm -rf matcha-icefall-en_US-ljspeech

log "matcha-baker-zh test"

curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -282,12 +302,13 @@ python3 ./python-api-examples/offline-tts.py \
--matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
--matcha-dict-dir=./matcha-icefall-zh-baker/dict \
--output-filename=./tts/test-matcha.wav \
--output-filename=./tts/test-matcha-baker-zh.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

rm -rf matcha-icefall-zh-baker
rm hifigan_v2.onnx

log "vits-ljs test"

curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
Expand Down
25 changes: 21 additions & 4 deletions python-api-examples/offline-tts-play.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
Usage:
Example (1/4)
Example (1/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -23,7 +23,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (2/4)
Example (2/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2
Expand All @@ -37,7 +37,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
Example (3/4)
Example (3/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -53,7 +53,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
Example (4/4)
Example (4/5)
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -71,6 +71,23 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Example (5/5)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
python3 ./python-api-examples/offline-tts-play.py \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--output-filename=./test-matcha-ljspeech-en.wav \
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Expand Down
25 changes: 21 additions & 4 deletions python-api-examples/offline-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Usage:
Example (1/4)
Example (1/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -24,7 +24,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (2/4)
Example (2/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
Expand All @@ -38,7 +38,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
Example (3/4)
Example (3/5)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -54,7 +54,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
Example (4/4)
Example (4/5)
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -72,6 +72,23 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Example (5/5)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
python3 ./python-api-examples/offline-tts.py \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--output-filename=./test-matcha-ljspeech-en.wav \
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Expand Down
32 changes: 16 additions & 16 deletions sherpa-onnx/csrc/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,21 @@
} while (0)
#endif

// Terminates the process with the given status by calling exit(code).
// The metadata-reading macros in this header invoke it after logging an error.
#define SHERPA_ONNX_EXIT(code) exit(code)

// Read a required, non-negative integer from the model metadata.
//
// Expands to a statement that looks up `src_key` with
// LookupCustomModelMetaData(meta_data, src_key, allocator), converts the
// returned string with atoi() and assigns the result to `dst`.
// `meta_data` and `allocator` are not macro parameters; they must be visible
// at the place where the macro is expanded.
//
// If the lookup returns an empty string, or the converted value is negative,
// an error is logged and SHERPA_ONNX_EXIT(-1) is invoked. All failure paths go
// through SHERPA_ONNX_EXIT only (no direct exit() call), so the exit behaviour
// is controlled from a single place.
//
// Note: atoi() yields 0 for text that does not start with a number, so such
// values are accepted as 0 rather than rejected.
#define SHERPA_ONNX_READ_META_DATA(dst, src_key)                           \
  do {                                                                     \
    auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
    if (value.empty()) {                                                   \
      SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key);    \
      SHERPA_ONNX_EXIT(-1);                                                \
    }                                                                      \
                                                                           \
    dst = atoi(value.c_str());                                             \
    if (dst < 0) {                                                         \
      SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key);         \
      SHERPA_ONNX_EXIT(-1);                                                \
    }                                                                      \
  } while (0)

Expand All @@ -74,7 +76,7 @@
dst = atoi(value.c_str()); \
if (dst < 0) { \
SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} \
} while (0)
Expand All @@ -85,13 +87,13 @@
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
\
bool ret = SplitStringToIntegers(value.c_str(), ",", true, &dst); \
if (!ret) { \
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} while (0)

Expand All @@ -101,13 +103,13 @@
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
\
bool ret = SplitStringToFloats(value.c_str(), ",", true, &dst); \
if (!ret) { \
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} while (0)

Expand All @@ -117,14 +119,14 @@
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
SplitStringToVector(value.c_str(), ",", false, &dst); \
\
if (dst.empty()) { \
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
value.c_str(), src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} while (0)

Expand All @@ -134,14 +136,14 @@
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
SplitStringToVector(value.c_str(), sep, false, &dst); \
\
if (dst.empty()) { \
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
value.c_str(), src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} while (0)

Expand All @@ -151,13 +153,13 @@
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
\
dst = std::move(value); \
if (dst.empty()) { \
SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} while (0)

Expand All @@ -178,11 +180,9 @@
dst = std::move(value); \
if (dst.empty()) { \
SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
exit(-1); \
SHERPA_ONNX_EXIT(-1); \
} \
} \
} while (0)

// NOTE(review): SHERPA_ONNX_EXIT is also defined, with the same body, ahead of
// the metadata-reading macros earlier in this header -- confirm whether this
// second definition is still needed.
#define SHERPA_ONNX_EXIT(code) exit(code)

#endif // SHERPA_ONNX_CSRC_MACROS_H_
41 changes: 37 additions & 4 deletions sherpa-onnx/csrc/offline-tts-matcha-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,12 +321,45 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {

private:
template <typename Manager>
void InitFrontend(Manager *mgr) {}
void InitFrontend(Manager *mgr) {
// for piper phonemizer
// we require that you copy espeak_ng_data
// from assets to disk
//
// for jieba
// we require that you copy tokens.txt, lexicon.txt and dict
// from assets to disk
const auto &meta_data = model_->GetMetaData();

if (meta_data.jieba && !meta_data.has_espeak) {
frontend_ = std::make_unique<JiebaLexicon>(
config_.model.matcha.lexicon, config_.model.matcha.tokens,
config_.model.matcha.dict_dir, config_.model.debug);
} else if (meta_data.has_espeak && !meta_data.jieba) {
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
mgr, config_.model.matcha.tokens, config_.model.matcha.data_dir,
meta_data);
} else {
SHERPA_ONNX_LOGE("jieba + espeaker-ng is not supported yet");
SHERPA_ONNX_EXIT(-1);
}
}

// Creates the text frontend and stores it in frontend_, using only the paths
// given in config_ (no asset manager).
//
// The frontend is chosen from the acoustic model's metadata:
//   - jieba only       -> JiebaLexicon
//   - has_espeak only  -> PiperPhonemizeLexicon
// Any other combination is an error: a message is logged and
// SHERPA_ONNX_EXIT(-1) is invoked.
//
// The frontend is constructed exactly once, after the metadata has been
// inspected; no JiebaLexicon is built for models that do not request jieba.
void InitFrontend() {
  const auto &meta_data = model_->GetMetaData();

  const bool use_jieba = meta_data.jieba != 0;
  const bool use_espeak = meta_data.has_espeak != 0;

  if (use_jieba && !use_espeak) {
    frontend_ = std::make_unique<JiebaLexicon>(
        config_.model.matcha.lexicon, config_.model.matcha.tokens,
        config_.model.matcha.dict_dir, config_.model.debug);
  } else if (use_espeak && !use_jieba) {
    frontend_ = std::make_unique<PiperPhonemizeLexicon>(
        config_.model.matcha.tokens, config_.model.matcha.data_dir,
        meta_data);
  } else if (use_jieba && use_espeak) {
    // Both frontends requested by the model: no combined frontend exists.
    SHERPA_ONNX_LOGE("jieba + espeak-ng is not supported yet");
    SHERPA_ONNX_EXIT(-1);
  } else {
    // Neither flag is set, so there is nothing to build a frontend from.
    SHERPA_ONNX_LOGE(
        "The model metadata enables neither jieba nor espeak-ng. "
        "Cannot create a frontend");
    SHERPA_ONNX_EXIT(-1);
  }
}

GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ struct OfflineTtsMatchaModelMetaData {
int32_t num_speakers = 0;
int32_t version = 1;
int32_t jieba = 0;
int32_t espeak = 0;
int32_t has_espeak = 0;
int32_t use_eos_bos = 0;
int32_t pad_id = 0;
};
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-matcha-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ class OfflineTtsMatchaModel::Impl {
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba");
SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak");
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
}
Expand Down
Loading

0 comments on commit f457bae

Please sign in to comment.