From 993f730804d5bb98a70c90f9d8921ae0c63bd0d7 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Mon, 14 Nov 2022 11:50:21 -0800 Subject: [PATCH] [TTS] replace pitch normalization params with ??? (#5392) * replace pitch normalization params with ??? * add pitch mean and std overrides in CI Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- Jenkinsfile | 4 ++++ examples/tts/conf/de/fastpitch_align_22050.yaml | 8 ++++---- examples/tts/conf/de/fastpitch_align_44100.yaml | 9 ++++----- examples/tts/conf/es/fastpitch_align_44100.yaml | 7 ++++--- examples/tts/conf/es/fastpitch_align_44100_ipa.yaml | 7 ++++--- .../tts/conf/es/fastpitch_align_44100_ipa_multi.yaml | 7 ++++--- examples/tts/conf/fastpitch_align_44100.yaml | 8 ++++---- examples/tts/conf/fastpitch_align_ipa.yaml | 7 ++++--- examples/tts/conf/fastpitch_align_v1.05.yaml | 7 ++++--- examples/tts/conf/mixer-tts-x.yaml | 7 ++++--- examples/tts/conf/mixer-tts.yaml | 7 ++++--- examples/tts/conf/rad-tts_dec.yaml | 7 ++++--- examples/tts/conf/rad-tts_feature_pred.yaml | 7 ++++--- examples/tts/conf/tacotron2.yaml | 5 ----- examples/tts/conf/zh/fastpitch_align_22050.yaml | 8 ++++---- 15 files changed, 56 insertions(+), 49 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 87dc4782f6e9..9d9bea8871bb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4164,6 +4164,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.devices="[0]" \ +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ trainer.strategy=null \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ model.train_ds.dataloader_params.batch_size=4 \ model.train_ds.dataloader_params.num_workers=0 \ model.validation_ds.dataloader_params.batch_size=4 \ @@ -4187,6 +4189,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.devices="[0]" \ +trainer.limit_train_batches=1 
+trainer.limit_val_batches=1 trainer.max_epochs=1 \ trainer.strategy=null \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ model.train_ds.dataloader_params.batch_size=4 \ model.train_ds.dataloader_params.num_workers=0 \ model.validation_ds.dataloader_params.batch_size=4 \ diff --git a/examples/tts/conf/de/fastpitch_align_22050.yaml b/examples/tts/conf/de/fastpitch_align_22050.yaml index 20f9011287c7..62c8543b98a4 100644 --- a/examples/tts/conf/de/fastpitch_align_22050.yaml +++ b/examples/tts/conf/de/fastpitch_align_22050.yaml @@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -# For example, on Thorsten Muller (German Neutral-TTS dataset): http://www.openslr.org/95/ -pitch_mean: 132.524658203125 -pitch_std: 37.389366149902344 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 132.524658203125 for http://www.openslr.org/95/ +pitch_std: ??? # e.g. 37.389366149902 for http://www.openslr.org/95/ # Default values for dataset with sample_rate=22050 sample_rate: 22050 diff --git a/examples/tts/conf/de/fastpitch_align_44100.yaml b/examples/tts/conf/de/fastpitch_align_44100.yaml index 231311782264..832a37505ca4 100644 --- a/examples/tts/conf/de/fastpitch_align_44100.yaml +++ b/examples/tts/conf/de/fastpitch_align_44100.yaml @@ -2,7 +2,6 @@ # rate. If you want to train model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. - name: FastPitch train_dataset: ??? 
@@ -14,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -# For example, on HUI-Audio-Corpus-German (speaker: Friedrich): https://github.com/iisys-hof/HUI-Audio-Corpus-German -pitch_mean: 119.04859924316406 -pitch_std: 95.5344009399414 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German +pitch_std: ??? # e.g. 87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German # Default values for dataset with sample_rate=44100 sample_rate: 44100 diff --git a/examples/tts/conf/es/fastpitch_align_44100.yaml b/examples/tts/conf/es/fastpitch_align_44100.yaml index 69c73be1df80..3a2cd983f368 100644 --- a/examples/tts/conf/es/fastpitch_align_44100.yaml +++ b/examples/tts/conf/es/fastpitch_align_44100.yaml @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/ +pitch_std: ??? # e.g. 
60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/ sample_rate: 44100 n_mel_channels: 80 diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml index cffca9d06b6b..53b8f813e1dc 100644 --- a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml +++ b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/ +pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/ sample_rate: 44100 n_mel_channels: 80 diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml index 77d67f7880ce..8f8b30bc0aa5 100644 --- a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml +++ b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch", "speaker_id"] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/ +pitch_std: ??? # e.g. 
60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/ sample_rate: 44100 n_mel_channels: 80 diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml index e67b0b447252..aa60093f2d3b 100644 --- a/examples/tts/conf/fastpitch_align_44100.yaml +++ b/examples/tts/conf/fastpitch_align_44100.yaml @@ -2,7 +2,6 @@ # rate. If you want to train model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. - name: FastPitch train_dataset: ??? @@ -14,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech sample_rate: 44100 n_mel_channels: 80 diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml index 7132dc1b0e32..5765b3e91797 100644 --- a/examples/tts/conf/fastpitch_align_ipa.yaml +++ b/examples/tts/conf/fastpitch_align_ipa.yaml @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech # Default values for dataset with sample_rate=22050 sample_rate: 22050 diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml index c0cf8311f67e..368dd4e502fd 100644 --- a/examples/tts/conf/fastpitch_align_v1.05.yaml +++ b/examples/tts/conf/fastpitch_align_v1.05.yaml @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech # Default values for dataset with sample_rate=22050 sample_rate: 22050 diff --git a/examples/tts/conf/mixer-tts-x.yaml b/examples/tts/conf/mixer-tts-x.yaml index 51688376d0a3..9a0f552dc4bb 100644 --- a/examples/tts/conf/mixer-tts-x.yaml +++ b/examples/tts/conf/mixer-tts-x.yaml @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "lm_tokens" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech # Default values for dataset with sample_rate=22050 sample_rate: 22050 diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml index dc8455cad44d..474e32a3e730 100644 --- a/examples/tts/conf/mixer-tts.yaml +++ b/examples/tts/conf/mixer-tts.yaml @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 -# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech # Default values for dataset with sample_rate=22050 sample_rate: 22050 diff --git a/examples/tts/conf/rad-tts_dec.yaml b/examples/tts/conf/rad-tts_dec.yaml index 964d97162844..7b5cd5bc3442 100644 --- a/examples/tts/conf/rad-tts_dec.yaml +++ b/examples/tts/conf/rad-tts_dec.yaml @@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv" -# LJSpeech stats (per frame), train -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech # default values from librosa.pyin pitch_fmin: 65.40639132514966 diff --git a/examples/tts/conf/rad-tts_feature_pred.yaml b/examples/tts/conf/rad-tts_feature_pred.yaml index bf03e4c5927c..cd7cce38f1a1 100644 --- a/examples/tts/conf/rad-tts_feature_pred.yaml +++ b/examples/tts/conf/rad-tts_feature_pred.yaml @@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv" -# LJSpeech stats (per frame), train -pitch_mean: 212.35873413085938 -pitch_std: 68.52806091308594 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech # default values from librosa.pyin pitch_fmin: 65.40639132514966 diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index 9a443520885b..4137ef19b5ad 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -13,8 +13,6 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" - - model: pitch_fmin: 65.40639132514966 pitch_fmax: 2093.004522404789 @@ -29,7 +27,6 @@ model: window: hann pad_value: -11.52 - text_normalizer: _target_: nemo_text_processing.text_normalization.normalize.Normalizer lang: en @@ -176,7 +173,6 @@ model: name: CosineAnnealing min_lr: 1e-5 - trainer: devices: 1 # number of gpus max_epochs: ??? 
@@ -191,7 +187,6 @@ trainer: check_val_every_n_epoch: 2 benchmark: false - exp_manager: exp_dir: null name: ${name} diff --git a/examples/tts/conf/zh/fastpitch_align_22050.yaml b/examples/tts/conf/zh/fastpitch_align_22050.yaml index 5a35dfa16865..20b18d042014 100644 --- a/examples/tts/conf/zh/fastpitch_align_22050.yaml +++ b/examples/tts/conf/zh/fastpitch_align_22050.yaml @@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ] pitch_fmin: 65.40639132514966 pitch_fmax: 1986.977294921875 -# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) -# For example, on SFbilingual dataset (Chinese and English Neutral-TTS dataset) -pitch_mean: 221.4948272705078 -pitch_std: 64.65289306640625 +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset. +pitch_std: ??? # e.g. 64.65289306640625 for SFbilingual dataset. # Default values for dataset with sample_rate=22050 sample_rate: 22050