Update Interface(s) phonetic entry (NVIDIA#5212)

* change interface(s) phone
  Signed-off-by: Jason <[email protected]>
* push version
  Signed-off-by: Jason <[email protected]>
* update dict path
  Signed-off-by: Jason <[email protected]>

Signed-off-by: Jason <[email protected]>
Signed-off-by: Hainan Xu <[email protected]>
hainan-xv · Nov 29, 2022 · 68f2743 · 68f2743
1 parent cd85fef
commit 68f2743
Show file tree

Hide file tree

Showing 20 changed files with 99 additions and 88 deletions.
diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml
@@ -19,7 +19,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -1,5 +1,5 @@
-# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling 
-# rate. If you want to train model on other dataset, you can change config values according to your dataset. 
+# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling
+# rate. If you want to train model on other dataset, you can change config values according to your dataset.
 # Most dataset-specific arguments are in the head of the config file, see below.
 
 
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
@@ -60,12 +60,12 @@ model:
     lang: en
     input_case: cased
     whitelist: ${whitelist_path}
-  
+
   text_normalizer_call_kwargs:
     verbose: false
     punct_pre_process: true
     punct_post_process: true
-  
+
   text_tokenizer:
     _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
     punct: true

diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt"
+phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/rad-tts_dec.yaml b/examples/tts/conf/rad-tts_dec.yaml
@@ -6,7 +6,7 @@ validation_datasets: ???
 ckpt_path: None
 export_dir: ???
 sup_data_path: ???
-sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] 
+sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
@@ -28,7 +28,7 @@ highfreq: 8000
 window: "hann"
 
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 mapping_file_path: ""
 
@@ -42,7 +42,7 @@ model:
 
   pitch_mean: ${pitch_mean}
   pitch_std: ${pitch_std}
-  
+
   text_normalizer:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
@@ -75,7 +75,7 @@ model:
       sample_rate: ${sample_rate}
       sup_data_path: ${sup_data_path}
       sup_data_types: ${sup_data_types}
-      n_fft: ${n_fft} 
+      n_fft: ${n_fft}
       win_length: ${n_window_size}
       hop_length: ${n_window_stride}
       window: ${window}
@@ -87,10 +87,10 @@ model:
       ignore_file: null
       trim: False
       pitch_fmin: ${pitch_fmin}
-      pitch_fmax: ${pitch_fmax} 
-      
-      
-      
+      pitch_fmax: ${pitch_fmax}
+
+
+
       text_tokenizer:
         _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
         punct: True
@@ -134,7 +134,7 @@ model:
       trim: False
       pitch_fmin: ${pitch_fmin}
       pitch_fmax: ${pitch_fmax}
-         
+
       text_tokenizer:
         _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
         punct: True
@@ -167,8 +167,8 @@ model:
     sched:
       name: exp_decay
       warmup_steps: 40000
-      last_epoch: -1 
-      d_model: 1  # Disable scaling based on model dim 
+      last_epoch: -1
+      d_model: 1  # Disable scaling based on model dim
   trainerConfig:
     sigma: 1
     iters_per_checkpoint: 3000
@@ -190,7 +190,7 @@ model:
         energy_loss_weight: 1.0
         vpred_loss_weight: 1.0
     unfreeze_modules: "all"
-        
+
   load_from_checkpoint: False
   init_from_ptl_ckpt: ${ckpt_path}
   modelConfig:
@@ -229,17 +229,17 @@ model:
         dur_model_config: null
         f0_model_config: null
         energy_model_config: null
-        v_model_config : 
+        v_model_config :
              name : dap
-             hparams : 
-                n_speaker_dim : 16 
+             hparams :
+                n_speaker_dim : 16
                 take_log_of_input: false
-                bottleneck_hparams: 
+                bottleneck_hparams:
                     in_dim: 512
                     reduction_factor: 16
                     norm: weightnorm
                     non_linearity: relu
-                arch_hparams: 
+                arch_hparams:
                     out_dim: 1
                     n_layers: 2
                     n_channels: 256
@@ -256,7 +256,7 @@ trainer:
   accumulate_grad_batches: 1
   enable_checkpointing: False
   logger: False
-  gradient_clip_val: 1 
+  gradient_clip_val: 1
   log_every_n_steps: 100
   check_val_every_n_epoch: 5
 

diff --git a/examples/tts/conf/rad-tts_feature_pred.yaml b/examples/tts/conf/rad-tts_feature_pred.yaml
@@ -6,7 +6,7 @@ validation_datasets: ???
 ckpt_path: ???
 export_dir: ???
 sup_data_path: ???
-sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] 
+sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
 
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"
 
@@ -27,8 +27,8 @@ lowfreq: 0
 highfreq: 8000
 window: "hann"
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
-heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" 
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
+heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 mapping_file_path: ""
 
 model:
@@ -41,7 +41,7 @@ model:
 
   pitch_mean: ${pitch_mean}
   pitch_std: ${pitch_std}
-  
+
   text_normalizer:
       _target_: nemo_text_processing.text_normalization.normalize.Normalizer
       lang: en
@@ -52,7 +52,7 @@ model:
       verbose: false
       punct_pre_process: true
       punct_post_process: true
-  
+
   text_tokenizer:
     _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
     punct: true
@@ -86,10 +86,10 @@ model:
       ignore_file: null
       trim: False
       pitch_fmin: ${pitch_fmin}
-      pitch_fmax: ${pitch_fmax} 
-      
-      
-      
+      pitch_fmax: ${pitch_fmax}
+
+
+
       text_tokenizer:
         _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
         punct: True
@@ -133,7 +133,7 @@ model:
       trim: False
       pitch_fmin: ${pitch_fmin}
       pitch_fmax: ${pitch_fmax}
-         
+
       text_tokenizer:
         _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
         punct: True
@@ -166,8 +166,8 @@ model:
     sched:
       name: exp_decay
       warmup_steps: 40000
-      last_epoch: -1 
-      d_model: 1  # Disable scaling based on model dim 
+      last_epoch: -1
+      d_model: 1  # Disable scaling based on model dim
   trainerConfig:
     sigma: 1
     iters_per_checkpoint: 3000
@@ -189,7 +189,7 @@ model:
         energy_loss_weight: 1.0
         vpred_loss_weight: 1.0
     unfreeze_modules: "durf0energyvpred"
-        
+
   load_from_checkpoint: True
   init_from_ptl_ckpt: ${ckpt_path}
   modelConfig:
@@ -241,66 +241,66 @@ model:
         p_dropout: 0.1
         noise_to_unvoiced_in_f0: 0
         noise_to_pvoiced: 0
-        dur_model_config: 
-            name: dap 
-            hparams: 
-                n_speaker_dim: 16 
-                bottleneck_hparams: 
+        dur_model_config:
+            name: dap
+            hparams:
+                n_speaker_dim: 16
+                bottleneck_hparams:
                     in_dim: 512
                     reduction_factor: 16
                     norm: weightnorm
                     non_linearity: relu
                 take_log_of_input: true
-                arch_hparams: 
+                arch_hparams:
                     out_dim: 1
                     n_layers: 2
                     n_channels: 256
                     kernel_size: 3
                     p_dropout: 0.1
-        f0_model_config: 
+        f0_model_config:
             name: dap
             hparams:
                 n_speaker_dim: 16
-                bottleneck_hparams: 
+                bottleneck_hparams:
                     in_dim: 512
                     reduction_factor: 16
                     norm: weightnorm
                     non_linearity: relu
                 take_log_of_input: false
-                arch_hparams: 
+                arch_hparams:
                     out_dim: 1
                     n_layers: 2
                     n_channels: 256
                     kernel_size: 11
                     p_dropout: 0.5
 
-        energy_model_config: 
+        energy_model_config:
             name: dap
             hparams:
                 n_speaker_dim: 16
-                bottleneck_hparams: 
+                bottleneck_hparams:
                     in_dim: 512
                     reduction_factor: 16
                     norm: weightnorm
                     non_linearity: relu
                 take_log_of_input: false
-                arch_hparams: 
+                arch_hparams:
                     out_dim: 1
                     n_layers: 2
                     n_channels: 256
                     kernel_size: 3
                     p_dropout: 0.5
-        v_model_config : 
+        v_model_config :
             name: dap
             hparams:
                 n_speaker_dim: 16
                 take_log_of_input: false
-                bottleneck_hparams: 
+                bottleneck_hparams:
                     in_dim: 512
                     reduction_factor: 16
                     norm: weightnorm
                     non_linearity: relu
-                arch_hparams: 
+                arch_hparams:
                     out_dim: 1
                     n_layers: 2
                     n_channels: 256

diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml
@@ -9,7 +9,7 @@ validation_datasets: ???
 sup_data_path: null
 sup_data_types: null
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
@@ -79,7 +79,7 @@ model:
       batch_size: 48
       num_workers: 4
       pin_memory: true
-  
+
   validation_ds:
     dataset:
       _target_: "nemo.collections.tts.torch.data.TTSDataset"

diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
@@ -46,7 +46,7 @@
 @dataclass
 class G2PConfig:
     _target_: str = "nemo_text_processing.g2p.modules.EnglishG2p"
-    phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+    phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
     heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722"
     phoneme_probability: float = 0.5
 

diff --git a/nemo/collections/tts/torch/tts_dataset.yaml b/nemo/collections/tts/torch/tts_dataset.yaml
@@ -42,5 +42,5 @@ tts_dataset:
     pad_with_space: True
     g2p:
       _target_: nemo_text_processing.g2p.modules.EnglishG2p
-      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
+      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
       heteronyms: "scripts/tts_dataset_files/heteronyms-052722"