[TTS] replace pitch normalization params with ??? (NVIDIA#5392)

* replace pitch normalization params with ???
* add pitch mean and std overrides in CI

Signed-off-by: Xuesong Yang <[email protected]>
JimmyZhang12 · Dec 14, 2022 · 993f730 · 993f730
1 parent ce39a0d
commit 993f730
Show file tree

Hide file tree

Showing 15 changed files with 56 additions and 49 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -4164,6 +4164,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
             trainer.devices="[0]" \
             +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
             trainer.strategy=null \
+            model.pitch_mean=212.35873413085938 \
+            model.pitch_std=68.52806091308594 \
             model.train_ds.dataloader_params.batch_size=4 \
             model.train_ds.dataloader_params.num_workers=0 \
             model.validation_ds.dataloader_params.batch_size=4 \
@@ -4187,6 +4189,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
             trainer.devices="[0]" \
             +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
             trainer.strategy=null \
+            model.pitch_mean=212.35873413085938 \
+            model.pitch_std=68.52806091308594 \
             model.train_ds.dataloader_params.batch_size=4 \
             model.train_ds.dataloader_params.num_workers=0 \
             model.validation_ds.dataloader_params.batch_size=4 \

diff --git a/examples/tts/conf/de/fastpitch_align_22050.yaml b/examples/tts/conf/de/fastpitch_align_22050.yaml
@@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on Thorsten Muller (German Neutral-TTS dataset): http://www.openslr.org/95/
-pitch_mean: 132.524658203125
-pitch_std: 37.389366149902344
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 132.524658203125 for http://www.openslr.org/95/
+pitch_std: ???   # e.g.  37.389366149902 for http://www.openslr.org/95/
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050

diff --git a/examples/tts/conf/de/fastpitch_align_44100.yaml b/examples/tts/conf/de/fastpitch_align_44100.yaml
@@ -2,7 +2,6 @@
 # rate. If you want to train model on other dataset, you can change config values according to your dataset.
 # Most dataset-specific arguments are in the head of the config file, see below.
 
-
 name: FastPitch
 
 train_dataset: ???
@@ -14,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on HUI-Audio-Corpus-German (speaker: Friedrich): https://github.com/iisys-hof/HUI-Audio-Corpus-German
-pitch_mean: 119.04859924316406
-pitch_std: 95.5344009399414
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German
+pitch_std: ???   # e.g.  87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German
 
 # Default values for dataset with sample_rate=44100
 sample_rate: 44100

diff --git a/examples/tts/conf/es/fastpitch_align_44100.yaml b/examples/tts/conf/es/fastpitch_align_44100.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch", "speaker_id"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80

diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -2,7 +2,6 @@
 # rate. If you want to train model on other dataset, you can change config values according to your dataset.
 # Most dataset-specific arguments are in the head of the config file, see below.
 
-
 name: FastPitch
 
 train_dataset: ???
@@ -14,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 sample_rate: 44100
 n_mel_channels: 80

diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050

diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050

diff --git a/examples/tts/conf/mixer-tts-x.yaml b/examples/tts/conf/mixer-tts-x.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "lm_tokens" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050

diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050

diff --git a/examples/tts/conf/rad-tts_dec.yaml b/examples/tts/conf/rad-tts_dec.yaml
@@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
-# LJSpeech stats (per frame), train
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # default values from librosa.pyin
 pitch_fmin: 65.40639132514966

diff --git a/examples/tts/conf/rad-tts_feature_pred.yaml b/examples/tts/conf/rad-tts_feature_pred.yaml
@@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
-# LJSpeech stats (per frame), train
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # default values from librosa.pyin
 pitch_fmin: 65.40639132514966

diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml
@@ -13,8 +13,6 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
-
-
 model:
   pitch_fmin: 65.40639132514966
   pitch_fmax: 2093.004522404789
@@ -29,7 +27,6 @@ model:
   window: hann
   pad_value: -11.52
 
-
   text_normalizer:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
@@ -176,7 +173,6 @@ model:
       name: CosineAnnealing
       min_lr: 1e-5
 
-
 trainer:
   devices: 1 # number of gpus
   max_epochs: ???
@@ -191,7 +187,6 @@ trainer:
   check_val_every_n_epoch: 2
   benchmark: false
 
-
 exp_manager:
   exp_dir: null
   name: ${name}

diff --git a/examples/tts/conf/zh/fastpitch_align_22050.yaml b/examples/tts/conf/zh/fastpitch_align_22050.yaml
@@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 1986.977294921875
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on SFbilingual dataset (Chinese and English Neutral-TTS dataset)
-pitch_mean: 221.4948272705078
-pitch_std: 64.65289306640625
+# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`
+pitch_mean: ???  # e.g. 221.4948272705078 for SFbilingual dataset.
+pitch_std:  ???  # e.g.  64.6528930664063 for SFbilingual dataset.
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050