Skip to content

Commit

Permalink
[TTS] replace pitch normalization params with ??? (NVIDIA#5392)
Browse files Browse the repository at this point in the history
* replace pitch normalization params with ???
* add pitch mean and std overrides in CI

Signed-off-by: Xuesong Yang <[email protected]>
  • Loading branch information
XuesongYang authored and Jimmy Zhang committed Dec 14, 2022
1 parent ce39a0d commit 993f730
Show file tree
Hide file tree
Showing 15 changed files with 56 additions and 49 deletions.
4 changes: 4 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -4164,6 +4164,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices="[0]" \
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
trainer.strategy=null \
model.pitch_mean=212.35873413085938 \
model.pitch_std=68.52806091308594 \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
Expand All @@ -4187,6 +4189,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices="[0]" \
+trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
trainer.strategy=null \
model.pitch_mean=212.35873413085938 \
model.pitch_std=68.52806091308594 \
model.train_ds.dataloader_params.batch_size=4 \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.batch_size=4 \
Expand Down
8 changes: 4 additions & 4 deletions examples/tts/conf/de/fastpitch_align_22050.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
# For example, on Thorsten Muller (German Neutral-TTS dataset): http://www.openslr.org/95/
pitch_mean: 132.524658203125
pitch_std: 37.389366149902344
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 132.524658203125 for http://www.openslr.org/95/
pitch_std: ??? # e.g. 37.389366149902344 for http://www.openslr.org/95/

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down
9 changes: 4 additions & 5 deletions examples/tts/conf/de/fastpitch_align_44100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.


name: FastPitch

train_dataset: ???
Expand All @@ -14,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
# For example, on HUI-Audio-Corpus-German (speaker: Friedrich): https://github.com/iisys-hof/HUI-Audio-Corpus-German
pitch_mean: 119.04859924316406
pitch_std: 95.5344009399414
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German
pitch_std: ??? # e.g. 87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German

# Default values for dataset with sample_rate=44100
sample_rate: 44100
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/es/fastpitch_align_44100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/

sample_rate: 44100
n_mel_channels: 80
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch"]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/

sample_rate: 44100
n_mel_channels: 80
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: ["align_prior_matrix", "pitch", "speaker_id"]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/
pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/

sample_rate: 44100
n_mel_channels: 80
Expand Down
8 changes: 4 additions & 4 deletions examples/tts/conf/fastpitch_align_44100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.


name: FastPitch

train_dataset: ???
Expand All @@ -14,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

sample_rate: 44100
n_mel_channels: 80
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/fastpitch_align_ipa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/fastpitch_align_v1.05.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/mixer-tts-x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch", "lm_tokens" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/mixer-tts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# LJSpeech stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/rad-tts_dec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi

whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"

# LJSpeech stats (per frame), train
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
Expand Down
7 changes: 4 additions & 3 deletions examples/tts/conf/rad-tts_feature_pred.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voi

whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"

# LJSpeech stats (per frame), train
pitch_mean: 212.35873413085938
pitch_std: 68.52806091308594
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
Expand Down
5 changes: 0 additions & 5 deletions examples/tts/conf/tacotron2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"



model:
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789
Expand All @@ -29,7 +27,6 @@ model:
window: hann
pad_value: -11.52


text_normalizer:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
Expand Down Expand Up @@ -176,7 +173,6 @@ model:
name: CosineAnnealing
min_lr: 1e-5


trainer:
devices: 1 # number of gpus
max_epochs: ???
Expand All @@ -191,7 +187,6 @@ trainer:
check_val_every_n_epoch: 2
benchmark: false


exp_manager:
exp_dir: null
name: ${name}
Expand Down
8 changes: 4 additions & 4 deletions examples/tts/conf/zh/fastpitch_align_22050.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ sup_data_types: [ "align_prior_matrix", "pitch" ]
pitch_fmin: 65.40639132514966
pitch_fmax: 1986.977294921875

# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax)
# For example, on SFbilingual dataset (Chinese and English Neutral-TTS dataset)
pitch_mean: 221.4948272705078
pitch_std: 64.65289306640625
# these frame-wise values depend on pitch_fmin and pitch_fmax; you can compute them
# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset.
pitch_std: ??? # e.g. 64.65289306640625 for SFbilingual dataset.

# Default values for dataset with sample_rate=22050
sample_rate: 22050
Expand Down

0 comments on commit 993f730

Please sign in to comment.