diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml index ff36804a3a9a..3ee77b70ead1 100644 --- a/examples/tts/conf/aligner.yaml +++ b/examples/tts/conf/aligner.yaml @@ -19,7 +19,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml index 04f8dfd367b6..e67b0b447252 100644 --- a/examples/tts/conf/fastpitch_align_44100.yaml +++ b/examples/tts/conf/fastpitch_align_44100.yaml @@ -1,5 +1,5 @@ -# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. +# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling +# rate. If you want to train model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: null window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" @@ -60,12 +60,12 @@ model: lang: en input_case: cased whitelist: ${whitelist_path} - + text_normalizer_call_kwargs: verbose: false punct_pre_process: true punct_post_process: true - + text_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer punct: true diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml index 40cd544b113c..7132dc1b0e32 100644 --- a/examples/tts/conf/fastpitch_align_ipa.yaml +++ b/examples/tts/conf/fastpitch_align_ipa.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt" +phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml index 6deedfffb4bf..c0cf8311f67e 100644 --- a/examples/tts/conf/fastpitch_align_v1.05.yaml +++ b/examples/tts/conf/fastpitch_align_v1.05.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml index c66aac76d446..dc8455cad44d 100644 --- a/examples/tts/conf/mixer-tts.yaml +++ b/examples/tts/conf/mixer-tts.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/rad-tts_dec.yaml b/examples/tts/conf/rad-tts_dec.yaml index d6dce256d50a..964d97162844 100644 --- a/examples/tts/conf/rad-tts_dec.yaml +++ b/examples/tts/conf/rad-tts_dec.yaml @@ -6,7 +6,7 @@ validation_datasets: ??? ckpt_path: None export_dir: ??? sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] +sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv" @@ -28,7 +28,7 @@ highfreq: 8000 window: "hann" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" mapping_file_path: "" @@ -42,7 +42,7 @@ model: pitch_mean: ${pitch_mean} pitch_std: ${pitch_std} - + text_normalizer: _target_: nemo_text_processing.text_normalization.normalize.Normalizer lang: en @@ -75,7 +75,7 @@ model: sample_rate: ${sample_rate} sup_data_path: ${sup_data_path} sup_data_types: ${sup_data_types} - n_fft: ${n_fft} + n_fft: ${n_fft} win_length: ${n_window_size} hop_length: ${n_window_stride} window: ${window} @@ -87,10 +87,10 @@ model: ignore_file: null trim: False pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - + pitch_fmax: ${pitch_fmax} + + + text_tokenizer: _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" punct: True @@ -134,7 +134,7 @@ model: trim: False pitch_fmin: ${pitch_fmin} pitch_fmax: ${pitch_fmax} - + text_tokenizer: _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" punct: True @@ -167,8 +167,8 @@ model: sched: name: exp_decay warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim trainerConfig: sigma: 1 iters_per_checkpoint: 3000 @@ -190,7 +190,7 @@ model: energy_loss_weight: 1.0 vpred_loss_weight: 1.0 unfreeze_modules: "all" - + load_from_checkpoint: False init_from_ptl_ckpt: ${ckpt_path} modelConfig: @@ -229,17 +229,17 @@ model: dur_model_config: null f0_model_config: null energy_model_config: null - v_model_config : + v_model_config : name : dap - hparams : - n_speaker_dim : 16 + hparams : + n_speaker_dim : 16 take_log_of_input: false - bottleneck_hparams: + bottleneck_hparams: in_dim: 512 reduction_factor: 16 norm: weightnorm non_linearity: relu - arch_hparams: + arch_hparams: out_dim: 1 n_layers: 2 n_channels: 256 @@ -256,7 +256,7 @@ trainer: accumulate_grad_batches: 1 enable_checkpointing: False logger: False - gradient_clip_val: 1 + gradient_clip_val: 1 log_every_n_steps: 100 check_val_every_n_epoch: 5 diff --git a/examples/tts/conf/rad-tts_feature_pred.yaml b/examples/tts/conf/rad-tts_feature_pred.yaml index 3adada9d3f0c..bf03e4c5927c 100644 --- a/examples/tts/conf/rad-tts_feature_pred.yaml +++ b/examples/tts/conf/rad-tts_feature_pred.yaml @@ -6,7 +6,7 @@ validation_datasets: ??? ckpt_path: ??? export_dir: ??? sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] +sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv" @@ -27,8 +27,8 @@ lowfreq: 0 highfreq: 8000 window: "hann" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" mapping_file_path: "" model: @@ -41,7 +41,7 @@ model: pitch_mean: ${pitch_mean} pitch_std: ${pitch_std} - + text_normalizer: _target_: nemo_text_processing.text_normalization.normalize.Normalizer lang: en @@ -52,7 +52,7 @@ model: verbose: false punct_pre_process: true punct_post_process: true - + text_tokenizer: _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer punct: true @@ -86,10 +86,10 @@ model: ignore_file: null trim: False pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - + pitch_fmax: ${pitch_fmax} + + + text_tokenizer: _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" punct: True @@ -133,7 +133,7 @@ model: trim: False pitch_fmin: ${pitch_fmin} pitch_fmax: ${pitch_fmax} - + text_tokenizer: _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" punct: True @@ -166,8 +166,8 @@ model: sched: name: exp_decay warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim trainerConfig: sigma: 1 iters_per_checkpoint: 3000 @@ -189,7 +189,7 @@ model: energy_loss_weight: 1.0 vpred_loss_weight: 1.0 unfreeze_modules: "durf0energyvpred" - + load_from_checkpoint: True init_from_ptl_ckpt: ${ckpt_path} modelConfig: @@ -241,66 +241,66 @@ model: p_dropout: 0.1 noise_to_unvoiced_in_f0: 0 noise_to_pvoiced: 0 - dur_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: + dur_model_config: + name: dap + hparams: + n_speaker_dim: 16 + bottleneck_hparams: in_dim: 512 reduction_factor: 16 norm: weightnorm non_linearity: relu take_log_of_input: true - arch_hparams: + arch_hparams: out_dim: 1 n_layers: 2 n_channels: 256 kernel_size: 3 p_dropout: 0.1 - f0_model_config: + f0_model_config: name: dap hparams: n_speaker_dim: 16 - bottleneck_hparams: + bottleneck_hparams: in_dim: 512 reduction_factor: 16 norm: weightnorm non_linearity: relu take_log_of_input: false - arch_hparams: + arch_hparams: out_dim: 1 n_layers: 2 n_channels: 256 kernel_size: 11 p_dropout: 0.5 - energy_model_config: + energy_model_config: name: dap hparams: n_speaker_dim: 16 - bottleneck_hparams: + bottleneck_hparams: in_dim: 512 reduction_factor: 16 norm: weightnorm non_linearity: relu take_log_of_input: false - arch_hparams: + arch_hparams: out_dim: 1 n_layers: 2 n_channels: 256 kernel_size: 3 p_dropout: 0.5 - v_model_config : + v_model_config : name: dap hparams: n_speaker_dim: 16 take_log_of_input: false - bottleneck_hparams: + bottleneck_hparams: in_dim: 512 reduction_factor: 16 norm: weightnorm non_linearity: relu - arch_hparams: + arch_hparams: out_dim: 1 n_layers: 2 n_channels: 256 diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index e227a82d49af..9a443520885b 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -9,7 +9,7 @@ validation_datasets: ??? sup_data_path: null sup_data_types: null -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" @@ -79,7 +79,7 @@ model: batch_size: 48 num_workers: 4 pin_memory: true - + validation_ds: dataset: _target_: "nemo.collections.tts.torch.data.TTSDataset" diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index e13092454691..880004a47d1d 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -46,7 +46,7 @@ @dataclass class G2PConfig: _target_: str = "nemo_text_processing.g2p.modules.EnglishG2p" - phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" + phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722" phoneme_probability: float = 0.5 diff --git a/nemo/collections/tts/torch/tts_dataset.yaml b/nemo/collections/tts/torch/tts_dataset.yaml index 510f8f8ce20b..fb7aef09b955 100644 --- a/nemo/collections/tts/torch/tts_dataset.yaml +++ b/nemo/collections/tts/torch/tts_dataset.yaml @@ -42,5 +42,5 @@ tts_dataset: pad_with_space: True g2p: _target_: nemo_text_processing.g2p.modules.EnglishG2p - phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" + phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms: "scripts/tts_dataset_files/heteronyms-052722" diff --git a/nemo_text_processing/g2p/modules.py b/nemo_text_processing/g2p/modules.py index 65c2b1a37d39..4021195c99fd 100644 --- a/nemo_text_processing/g2p/modules.py +++ b/nemo_text_processing/g2p/modules.py @@ -150,7 +150,7 @@ def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'): f"English g2p_dict will be used from nltk.corpus.cmudict.dict(), because phoneme_dict_path=None. " "Note that nltk.corpus.cmudict.dict() has old version (0.6) of CMUDict. " "You can use the latest official version of CMUDict (0.7b) with additional changes from NVIDIA directly from NeMo " - "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.07." + "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.10." ) return nltk.corpus.cmudict.dict() @@ -276,10 +276,10 @@ def __init__( """Generic IPA G2P module. This module converts words from grapheme to International Phonetic Alphabet representations. Optionally, it can ignore heteronyms, ambiguous words, or words marked as unchangeable by word_tokenize_func (see code for details). Ignored words are left unchanged or passed through apply_to_oov_word for handling. - + Args: phoneme_dict (str, Path, Dict): Path to file in CMUdict format or dictionary of CMUdict-like entries. - Must be given for IPA G2P. (Consider using scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt.) + Must be given for IPA G2P. (Consider using scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt.) word_tokenize_func: Function for tokenizing text to words. It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not. @@ -502,7 +502,7 @@ class ChineseG2p(BaseG2p): def __init__( self, phoneme_dict=None, word_tokenize_func=None, apply_to_oov_word=None, mapping_file: Optional[str] = None, ): - """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then pinyin sequences would + """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then pinyin sequences would be further converted into phoneme sequences using pinyin_dict_nv_22.10.txt dict file. For Chinese and English bilingual sentences, the English words would be converted into letters. Args: @@ -560,8 +560,8 @@ def __call__(self, text): where English words would be split into letters. e.g. 我今天去了Apple Store, 买了一个iPhone。 would return a list - ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', - ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2', + ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', + ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。'] """ pinyin_seq = self._lazy_pinyin( diff --git a/scripts/dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py b/scripts/dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py index bae3b4c73780..f9a80f353f54 100644 --- a/scripts/dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py +++ b/scripts/dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py @@ -32,14 +32,14 @@ def parse_args(): '--cmu_arpabet', help="Path to CMU ARPABET dictionary file", type=str, - default="tts_dataset_files/cmudict-0.7b_nv22.08", + default="tts_dataset_files/cmudict-0.7b_nv22.10", ) parser.add_argument("--ipa_out", help="Path to save IPA version of the dictionary", type=str, required=True) parser.add_argument( "--mapping", help="ARPABET to IPA phoneme mapping file", type=str, - default="tts_dataset_files/cmudict-arpabet_to_ipa_nv22.08.tsv", + default="tts_dataset_files/cmudict-arpabet_to_ipa_nv22.10.tsv", ) return parser.parse_args() diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml index bed6b2ee49af..cbb124fdef1f 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml @@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json" sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" dataset: diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml index d4151e888ae0..66222656d9b9 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml @@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json" sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" dataset: diff --git a/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 b/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 similarity index 99% rename from scripts/tts_dataset_files/cmudict-0.7b_nv22.08 rename to scripts/tts_dataset_files/cmudict-0.7b_nv22.10 index 1c18eb3c6aa3..32bebb4b8e3f 100644 --- a/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 +++ b/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 @@ -60174,9 +60174,7 @@ INTERESTS(1) IH1 N T R IH0 S T S INTERESTS(2) IH1 N T ER0 AH0 S T S INTERESTS(3) IH1 N T ER0 IH0 S T S INTERFACE IH1 N T ER0 F EY2 S -INTERFACE(1) IH1 N ER0 F EY2 S INTERFACES IH1 N T ER0 F EY2 S IH0 Z -INTERFACES(1) IH1 N ER0 F EY2 S IH0 Z INTERFAITH IH2 N T ER0 F EY1 TH INTERFAX IH1 N T ER0 F AE2 K S INTERFERE IH2 N T ER0 F IH1 R diff --git a/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt b/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt similarity index 99% rename from scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt rename to scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt index 1bedf98caf1f..99a57282685a 100644 --- a/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt +++ b/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt @@ -60203,9 +60203,7 @@ INTERESTS(1) ˈɪntɹɪsts INTERESTS(2) ˈɪntɚəsts INTERESTS(3) ˈɪntɚɪsts INTERFACE ˈɪntɚˌfeɪs -INTERFACE(1) ˈɪnɚˌfeɪs INTERFACES ˈɪntɚˌfeɪsɪz -INTERFACES(1) ˈɪnɚˌfeɪsɪz INTERFAITH ˌɪntɚˈfeɪθ INTERFAX ˈɪntɚˌfæks INTERFERE ˌɪntɚˈfɪɹ diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb index 47ead32fbd65..80dd2fde5458 100755 --- a/tutorials/tts/FastPitch_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Finetuning.ipynb @@ -246,7 +246,7 @@ "source": [ "# additional files\n", "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n", - "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 \\\n", + "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n", "&& cd .." @@ -288,7 +288,7 @@ " train_dataset=./6097_manifest_train_dur_5_mins_local.json \\\n", " validation_datasets=./6097_manifest_dev_ns_all_local.json \\\n", " sup_data_path=./fastpitch_sup_data \\\n", - " phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.08 \\\n", + " phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", " heteronyms_path=tts_dataset_files/heteronyms-052722 \\\n", " whitelist_path=tts_dataset_files/lj_speech.tsv \\\n", " exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins \\\n", @@ -320,7 +320,7 @@ " sup_data_path=./fastpitch_sup_data`\n", " * We tell the script what manifest files to train and eval on, as well as where supplementary data is located (or will be calculated and saved during training if not provided).\n", " \n", - "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.08 \n", + "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.10 \n", "heteronyms_path=tts_dataset_files/heteronyms-052722\n", "whitelist_path=tts_dataset_files/lj_speech.tsv \n", "`\n", @@ -701,7 +701,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.15 ('ptl_venv')", "language": "python", "name": "python3" }, @@ -715,7 +715,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "f8a1d50fd7b1e17bd198f085b8ced031398c6134b0da7c4415c17601bbcccc4e" + } } }, "nbformat": 4, diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb index 1dfd14ed2972..1b2ebc66ea3b 100644 --- a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb @@ -228,7 +228,7 @@ "\n", "# additional files\n", "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n", - "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 \\\n", + "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n", "&& cd .." @@ -430,7 +430,7 @@ "\n", "# Grapheme-to-phoneme module\n", "g2p = EnglishG2p(\n", - " phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.08\",\n", + " phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.10\",\n", " heteronyms=\"tts_dataset_files/heteronyms-052722\"\n", ")\n", "\n", @@ -556,7 +556,7 @@ "validation_datasets=tests/data/asr/an4_val.json \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch']\" \\\n", "sup_data_path={mixer_tts_sup_data_path} \\\n", - "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.08 \\\n", + "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", "heteronyms_path=tts_dataset_files/heteronyms-052722 \\\n", "whitelist_path=tts_dataset_files/lj_speech.tsv \\\n", "pitch_mean={pitch_mean} \\\n", @@ -589,7 +589,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.15 ('ptl_venv')", "language": "python", "name": "python3" }, @@ -603,7 +603,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "f8a1d50fd7b1e17bd198f085b8ced031398c6134b0da7c4415c17601bbcccc4e" + } } }, "nbformat": 4, diff --git a/tutorials/tts/NeMo_TTS_Primer.ipynb b/tutorials/tts/NeMo_TTS_Primer.ipynb index 13829d426f1e..21c366155b17 100644 --- a/tutorials/tts/NeMo_TTS_Primer.ipynb +++ b/tutorials/tts/NeMo_TTS_Primer.ipynb @@ -427,7 +427,7 @@ "import os\n", "from nemo.collections.tts.torch.g2ps import EnglishG2p, IPAG2P\n", "\n", - "ipa_dict_path = os.path.join(NEMO_DIR, \"scripts\", \"tts_dataset_files\", \"ipa_cmudict-0.7b_nv22.08.txt\")\n", + "ipa_dict_path = os.path.join(NEMO_DIR, \"scripts\", \"tts_dataset_files\", \"ipa_cmudict-0.7b_nv22.10.txt\")\n", "\n", "# EnglishG2p uses the CMU dictionary by default, if none is provided\n", "arpabet_g2p = EnglishG2p()\n", @@ -2048,7 +2048,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.15 ('ptl_venv')", "language": "python", "name": "python3" }, @@ -2062,7 +2062,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "f8a1d50fd7b1e17bd198f085b8ced031398c6134b0da7c4415c17601bbcccc4e" + } } }, "nbformat": 4, diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 99ba165f7287..3642a3e9e4dc 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -165,7 +165,7 @@ "# We will also need a few extra files for handling text.\n", "!(mkdir -p scripts/tts_dataset_files \\\n", " && cd scripts/tts_dataset_files \\\n", - " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 \\\n", + " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\\n", " && cd ..)\n", " \n", @@ -233,7 +233,7 @@ "sup_data_path: null\n", "sup_data_types: null\n", "\n", - "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.08\"\n", + "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.10\"\n", "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-052722\"\n", "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv\"\n", "```\n",