From 993f730804d5bb98a70c90f9d8921ae0c63bd0d7 Mon Sep 17 00:00:00 2001
From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Date: Mon, 14 Nov 2022 11:50:21 -0800
Subject: [PATCH] [TTS] replace pitch normalization params with ??? (#5392)

* replace pitch normalization params with ???
* add pitch mean and std overrides in CI

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
---
 Jenkinsfile                                              | 4 ++++
 examples/tts/conf/de/fastpitch_align_22050.yaml          | 8 ++++----
 examples/tts/conf/de/fastpitch_align_44100.yaml          | 9 ++++-----
 examples/tts/conf/es/fastpitch_align_44100.yaml          | 7 ++++---
 examples/tts/conf/es/fastpitch_align_44100_ipa.yaml      | 7 ++++---
 .../tts/conf/es/fastpitch_align_44100_ipa_multi.yaml     | 7 ++++---
 examples/tts/conf/fastpitch_align_44100.yaml             | 8 ++++----
 examples/tts/conf/fastpitch_align_ipa.yaml               | 7 ++++---
 examples/tts/conf/fastpitch_align_v1.05.yaml             | 7 ++++---
 examples/tts/conf/mixer-tts-x.yaml                       | 7 ++++---
 examples/tts/conf/mixer-tts.yaml                         | 7 ++++---
 examples/tts/conf/rad-tts_dec.yaml                       | 7 ++++---
 examples/tts/conf/rad-tts_feature_pred.yaml              | 7 ++++---
 examples/tts/conf/tacotron2.yaml                         | 5 -----
 examples/tts/conf/zh/fastpitch_align_22050.yaml          | 8 ++++----
 15 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 87dc4782f6e9..9d9bea8871bb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -4164,6 +4164,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
             trainer.devices="[0]" \
             +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
             trainer.strategy=null \
+            model.pitch_mean=212.35873413085938 \
+            model.pitch_std=68.52806091308594 \
             model.train_ds.dataloader_params.batch_size=4 \
             model.train_ds.dataloader_params.num_workers=0 \
             model.validation_ds.dataloader_params.batch_size=4 \
@@ -4187,6 +4189,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
             trainer.devices="[0]" \
             +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
             trainer.strategy=null \
+            model.pitch_mean=212.35873413085938 \
+            model.pitch_std=68.52806091308594 \
             model.train_ds.dataloader_params.batch_size=4 \
             model.train_ds.dataloader_params.num_workers=0 \
             model.validation_ds.dataloader_params.batch_size=4 \
diff --git a/examples/tts/conf/de/fastpitch_align_22050.yaml b/examples/tts/conf/de/fastpitch_align_22050.yaml
index 20f9011287c7..62c8543b98a4 100644
--- a/examples/tts/conf/de/fastpitch_align_22050.yaml
+++ b/examples/tts/conf/de/fastpitch_align_22050.yaml
@@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on Thorsten Muller (German Neutral-TTS dataset): http://www.openslr.org/95/
-pitch_mean: 132.524658203125
-pitch_std: 37.389366149902344
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 132.524658203125 for http://www.openslr.org/95/
+pitch_std: ???   # e.g.  37.389366149902 for http://www.openslr.org/95/
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050
diff --git a/examples/tts/conf/de/fastpitch_align_44100.yaml b/examples/tts/conf/de/fastpitch_align_44100.yaml
index 231311782264..832a37505ca4 100644
--- a/examples/tts/conf/de/fastpitch_align_44100.yaml
+++ b/examples/tts/conf/de/fastpitch_align_44100.yaml
@@ -2,7 +2,6 @@
 # rate. If you want to train model on other dataset, you can change config values according to your dataset.
 # Most dataset-specific arguments are in the head of the config file, see below.
 
-
 name: FastPitch
 
 train_dataset: ???
@@ -14,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on HUI-Audio-Corpus-German (speaker: Friedrich): https://github.com/iisys-hof/HUI-Audio-Corpus-German
-pitch_mean: 119.04859924316406
-pitch_std: 95.5344009399414
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German
+pitch_std: ???   # e.g.  87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German
 
 # Default values for dataset with sample_rate=44100
 sample_rate: 44100
diff --git a/examples/tts/conf/es/fastpitch_align_44100.yaml b/examples/tts/conf/es/fastpitch_align_44100.yaml
index 69c73be1df80..3a2cd983f368 100644
--- a/examples/tts/conf/es/fastpitch_align_44100.yaml
+++ b/examples/tts/conf/es/fastpitch_align_44100.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80
diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
index cffca9d06b6b..53b8f813e1dc 100644
--- a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
+++ b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80
diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
index 77d67f7880ce..8f8b30bc0aa5 100644
--- a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
+++ b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
@@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch", "speaker_id"]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
+pitch_std:  ???  # e.g.  60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/
 
 sample_rate: 44100
 n_mel_channels: 80
diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
index e67b0b447252..aa60093f2d3b 100644
--- a/examples/tts/conf/fastpitch_align_44100.yaml
+++ b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -2,7 +2,6 @@
 # rate. If you want to train model on other dataset, you can change config values according to your dataset.
 # Most dataset-specific arguments are in the head of the config file, see below.
 
-
 name: FastPitch
 
 train_dataset: ???
@@ -14,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 sample_rate: 44100
 n_mel_channels: 80
diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml
index 7132dc1b0e32..5765b3e91797 100644
--- a/examples/tts/conf/fastpitch_align_ipa.yaml
+++ b/examples/tts/conf/fastpitch_align_ipa.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050
diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
index c0cf8311f67e..368dd4e502fd 100644
--- a/examples/tts/conf/fastpitch_align_v1.05.yaml
+++ b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050
diff --git a/examples/tts/conf/mixer-tts-x.yaml b/examples/tts/conf/mixer-tts-x.yaml
index 51688376d0a3..9a0f552dc4bb 100644
--- a/examples/tts/conf/mixer-tts-x.yaml
+++ b/examples/tts/conf/mixer-tts-x.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "lm_tokens" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050
diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml
index dc8455cad44d..474e32a3e730 100644
--- a/examples/tts/conf/mixer-tts.yaml
+++ b/examples/tts/conf/mixer-tts.yaml
@@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 2093.004522404789
 
-# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050
diff --git a/examples/tts/conf/rad-tts_dec.yaml b/examples/tts/conf/rad-tts_dec.yaml
index 964d97162844..7b5cd5bc3442 100644
--- a/examples/tts/conf/rad-tts_dec.yaml
+++ b/examples/tts/conf/rad-tts_dec.yaml
@@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
-# LJSpeech stats (per frame), train
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # default values from librosa.pyin
 pitch_fmin: 65.40639132514966
diff --git a/examples/tts/conf/rad-tts_feature_pred.yaml b/examples/tts/conf/rad-tts_feature_pred.yaml
index bf03e4c5927c..cd7cce38f1a1 100644
--- a/examples/tts/conf/rad-tts_feature_pred.yaml
+++ b/examples/tts/conf/rad-tts_feature_pred.yaml
@@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
-# LJSpeech stats (per frame), train
-pitch_mean: 212.35873413085938
-pitch_std: 68.52806091308594
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
+pitch_std:  ???  # e.g.  68.52806091308594 for LJSpeech
 
 # default values from librosa.pyin
 pitch_fmin: 65.40639132514966
diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml
index 9a443520885b..4137ef19b5ad 100644
--- a/examples/tts/conf/tacotron2.yaml
+++ b/examples/tts/conf/tacotron2.yaml
@@ -13,8 +13,6 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
-
-
 model:
   pitch_fmin: 65.40639132514966
   pitch_fmax: 2093.004522404789
@@ -29,7 +27,6 @@ model:
   window: hann
   pad_value: -11.52
 
-
   text_normalizer:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
@@ -176,7 +173,6 @@ model:
       name: CosineAnnealing
       min_lr: 1e-5
 
-
 trainer:
   devices: 1 # number of gpus
   max_epochs: ???
@@ -191,7 +187,6 @@ trainer:
   check_val_every_n_epoch: 2
   benchmark: false
 
-
 exp_manager:
   exp_dir: null
   name: ${name}
diff --git a/examples/tts/conf/zh/fastpitch_align_22050.yaml b/examples/tts/conf/zh/fastpitch_align_22050.yaml
index 5a35dfa16865..20b18d042014 100644
--- a/examples/tts/conf/zh/fastpitch_align_22050.yaml
+++ b/examples/tts/conf/zh/fastpitch_align_22050.yaml
@@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
 pitch_fmin: 65.40639132514966
 pitch_fmax: 1986.977294921875
 
-# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
-# For example, on SFbilingual dataset (Chinese and English Neutral-TTS dataset)
-pitch_mean: 221.4948272705078
-pitch_std: 64.65289306640625
+# These frame-wise values depend on pitch_fmin and pitch_fmax; compute them for
+# your dataset by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ???  # e.g. 221.4948272705078 for SFbilingual dataset.
+pitch_std:  ???  # e.g.  64.6528930664063 for SFbilingual dataset.
 
 # Default values for dataset with sample_rate=22050
 sample_rate: 22050