[tts] add zh_en mix example #2234

Merged · 4 commits · Aug 13, 2022
306 changes: 289 additions & 17 deletions examples/zh_en_tts/tts3/README.md

Large diffs are not rendered by default.

104 changes: 104 additions & 0 deletions examples/zh_en_tts/tts3/conf/default.yaml
@@ -0,0 +1,104 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################

fs: 24000                # sampling rate (Hz)
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.

# Only used for feats_type != raw

fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.


###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2


###########################################################
# MODEL SETTING #
###########################################################
model:
    adim: 384                                    # attention dimension
    aheads: 2                                    # number of attention heads
    elayers: 4                                   # number of encoder layers
    eunits: 1536                                 # number of encoder ff units
    dlayers: 4                                   # number of decoder layers
    dunits: 1536                                 # number of decoder ff units
    positionwise_layer_type: conv1d              # type of position-wise layer
    positionwise_conv_kernel_size: 3             # kernel size of position-wise conv layer
    duration_predictor_layers: 2                 # number of layers of duration predictor
    duration_predictor_chans: 256                # number of channels of duration predictor
    duration_predictor_kernel_size: 3            # filter size of duration predictor
    postnet_layers: 5                            # number of layers of postnet
    postnet_filts: 5                             # filter size of conv layers in postnet
    postnet_chans: 256                           # number of channels of conv layers in postnet
    use_scaled_pos_enc: True                     # whether to use scaled positional encoding
    encoder_normalize_before: True               # whether to perform layer normalization before the encoder input
    decoder_normalize_before: True               # whether to perform layer normalization before the decoder input
    reduction_factor: 1                          # reduction factor
    init_type: xavier_uniform                    # initialization type
    init_enc_alpha: 1.0                          # initial value of alpha of encoder scaled position encoding
    init_dec_alpha: 1.0                          # initial value of alpha of decoder scaled position encoding
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
    pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: True     # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2                   # number of conv layers in energy predictor
    energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5                # dropout rate in energy predictor
    energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: False   # whether to stop the gradient from energy predictor to encoder
    spk_embed_dim: 256                           # speaker embedding dimension
    spk_embed_integration_type: concat           # speaker embedding integration type



###########################################################
# UPDATER SETTING #
###########################################################
updater:
    use_masking: True    # whether to apply masking for padded part in loss calculation


###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
    optim: adam            # optimizer type
    learning_rate: 0.001   # learning rate

###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5


###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
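
A quick sanity check of the frame timings noted in the comments above (a sketch using only values from this file): with fs=24000, n_shift=300 and win_length=1200 do correspond to a 12.5 ms hop and a 50 ms window:

    # hop = n_shift / fs, window = win_length / fs (in ms)
    awk 'BEGIN { printf "%.1f ms  %.1f ms\n", 300/24000*1000, 1200/24000*1000 }'   # 12.5 ms  50.0 ms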
54 changes: 54 additions & 0 deletions examples/zh_en_tts/tts3/local/inference.sh
@@ -0,0 +1,54 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# voc: pwgan_aishell3
# spk_id=174 selects the baker speaker (default); spk_id=175 selects the ljspeech speaker
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=pwgan_aishell3 \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=mix \
        --spk_id=174
fi


# voc: hifigan_aishell3
# spk_id=174 selects the baker speaker (default); spk_id=175 selects the ljspeech speaker
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=hifigan_aishell3 \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=mix \
        --spk_id=174
fi

# voc: hifigan_csmsc
# when the speaker is 174 (csmsc / baker), a csmsc vocoder works better than an aishell3 one
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=mix \
        --spk_id=174
fi
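
A usage sketch for this script (the output directory exp/default is hypothetical; BIN_DIR is assumed to be exported by the example's path.sh, which these scripts rely on):

    # train_output_path is the script's only positional argument ($1)
    ./local/inference.sh exp/default
    # to use the English (ljspeech) voice, change --spk_id=174 to --spk_id=175
    # in the stage you run, or adjust stage/stop_stage to pick a vocoder block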
54 changes: 54 additions & 0 deletions examples/zh_en_tts/tts3/local/ort_predict.sh
@@ -0,0 +1,54 @@
train_output_path=$1

stage=0
stop_stage=0

# e2e, synthesize from text
# voc: pwgan_aishell3
# spk_id=174 selects the baker speaker (default); spk_id=175 selects the ljspeech speaker
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../ort_predict_e2e.py \
        --inference_dir=${train_output_path}/inference_onnx \
        --am=fastspeech2_mix \
        --voc=pwgan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
        --lang=mix \
        --spk_id=174
fi


# voc: hifigan_aishell3
# spk_id=174 selects the baker speaker (default); spk_id=175 selects the ljspeech speaker
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../ort_predict_e2e.py \
        --inference_dir=${train_output_path}/inference_onnx \
        --am=fastspeech2_mix \
        --voc=hifigan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
        --lang=mix \
        --spk_id=174
fi

# voc: hifigan_csmsc
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../ort_predict_e2e.py \
        --inference_dir=${train_output_path}/inference_onnx \
        --am=fastspeech2_mix \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
        --lang=mix \
        --spk_id=174
fi
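
A usage sketch under the same assumptions as for inference.sh above; since --inference_dir points at ${train_output_path}/inference_onnx, the ONNX models are expected to have been exported first (e.g. via local/paddle2onnx.sh below):

    ./local/ort_predict.sh exp/default   # exp/default is a hypothetical train_output_path
    # stage/stop_stage at the top of the script select which vocoder block runs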
1 change: 1 addition & 0 deletions examples/zh_en_tts/tts3/local/paddle2onnx.sh
149 changes: 149 additions & 0 deletions examples/zh_en_tts/tts3/local/preprocess.sh
@@ -0,0 +1,149 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1
datasets_root_dir=$2
mfa_root_dir=$3

# 1. get durations from MFA results
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "Generate durations_baker.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=${mfa_root_dir}/baker_alignment_tone \
        --output durations_baker.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Generate durations_ljspeech.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=${mfa_root_dir}/ljspeech_alignment \
        --output durations_ljspeech.txt \
        --config=${config_path}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "Generate durations_aishell3.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=${mfa_root_dir}/aishell3_alignment_tone \
        --output durations_aishell3.txt \
        --config=${config_path}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "Generate durations_vctk.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=${mfa_root_dir}/vctk_alignment \
        --output durations_vctk.txt \
        --config=${config_path}
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # concatenate the per-dataset duration files
    echo "Concatenate durations_baker.txt, durations_ljspeech.txt, durations_aishell3.txt and durations_vctk.txt into durations.txt"
    cat durations_baker.txt durations_ljspeech.txt durations_aishell3.txt durations_vctk.txt > durations.txt
fi

# 2. extract features
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "Extract baker features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=baker \
        --rootdir=${datasets_root_dir}/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --write_metadata_method=a
fi

if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    echo "Extract ljspeech features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=ljspeech \
        --rootdir=${datasets_root_dir}/LJSpeech-1.1/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --write_metadata_method=a
fi

if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
    echo "Extract aishell3 features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=${datasets_root_dir}/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --write_metadata_method=a
fi

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
    echo "Extract vctk features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=vctk \
        --rootdir=${datasets_root_dir}/VCTK-Corpus-0.92/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --write_metadata_method=a
fi


# 3. get features' stats (mean and std)
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="pitch"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="energy"
fi


# 4. normalize and convert phone/speaker to id; dev and test should use train's stats
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
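
A usage sketch with the three positional arguments the script reads (config_path, datasets_root_dir, mfa_root_dir); the root directories here are hypothetical, while the subdirectory names come from the paths hard-coded above:

    ./local/preprocess.sh conf/default.yaml ~/datasets ~/mfa_results
    # expects ~/datasets/{BZNSYP,LJSpeech-1.1,data_aishell3,VCTK-Corpus-0.92}
    # and ~/mfa_results/{baker_alignment_tone,ljspeech_alignment,aishell3_alignment_tone,vctk_alignment}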