Skip to content

Commit

Permalink
Update Interface(s) phonetic entry (NVIDIA#5212)
Browse files (browse the repository at this point in the history)
* change interface(s) phone

Signed-off-by: Jason <[email protected]>

* push version

Signed-off-by: Jason <[email protected]>

* update dict path

Signed-off-by: Jason <[email protected]>

Signed-off-by: Jason <[email protected]>
Signed-off-by: Hainan Xu <[email protected]>
  • Loading branch information
blisc authored and Hainan Xu committed Nov 29, 2022
1 parent cd85fef commit 68f2743
Show file tree
Hide file tree
Showing 20 changed files with 99 additions and 88 deletions.
2 changes: 1 addition & 1 deletion examples/tts/conf/aligner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ lowfreq: 0
highfreq: 8000
window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down
10 changes: 5 additions & 5 deletions examples/tts/conf/fastpitch_align_44100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.


Expand Down Expand Up @@ -27,7 +27,7 @@ lowfreq: 0
highfreq: null
window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down Expand Up @@ -60,12 +60,12 @@ model:
lang: en
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
punct_pre_process: true
punct_post_process: true

text_tokenizer:
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
punct: true
Expand Down
2 changes: 1 addition & 1 deletion examples/tts/conf/fastpitch_align_ipa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ lowfreq: 0
highfreq: 8000
window: hann

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt"
phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down
2 changes: 1 addition & 1 deletion examples/tts/conf/fastpitch_align_v1.05.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ lowfreq: 0
highfreq: 8000
window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down
2 changes: 1 addition & 1 deletion examples/tts/conf/mixer-tts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ lowfreq: 0
highfreq: 8000
window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down
36 changes: 18 additions & 18 deletions examples/tts/conf/rad-tts_dec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ validation_datasets: ???
ckpt_path: None
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]

whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"

Expand All @@ -28,7 +28,7 @@ highfreq: 8000
window: "hann"


phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""

Expand All @@ -42,7 +42,7 @@ model:

pitch_mean: ${pitch_mean}
pitch_std: ${pitch_std}

text_normalizer:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
Expand Down Expand Up @@ -75,7 +75,7 @@ model:
sample_rate: ${sample_rate}
sup_data_path: ${sup_data_path}
sup_data_types: ${sup_data_types}
n_fft: ${n_fft}
n_fft: ${n_fft}
win_length: ${n_window_size}
hop_length: ${n_window_stride}
window: ${window}
Expand All @@ -87,10 +87,10 @@ model:
ignore_file: null
trim: False
pitch_fmin: ${pitch_fmin}
pitch_fmax: ${pitch_fmax}
pitch_fmax: ${pitch_fmax}



text_tokenizer:
_target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
punct: True
Expand Down Expand Up @@ -134,7 +134,7 @@ model:
trim: False
pitch_fmin: ${pitch_fmin}
pitch_fmax: ${pitch_fmax}

text_tokenizer:
_target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
punct: True
Expand Down Expand Up @@ -167,8 +167,8 @@ model:
sched:
name: exp_decay
warmup_steps: 40000
last_epoch: -1
d_model: 1 # Disable scaling based on model dim
last_epoch: -1
d_model: 1 # Disable scaling based on model dim
trainerConfig:
sigma: 1
iters_per_checkpoint: 3000
Expand All @@ -190,7 +190,7 @@ model:
energy_loss_weight: 1.0
vpred_loss_weight: 1.0
unfreeze_modules: "all"

load_from_checkpoint: False
init_from_ptl_ckpt: ${ckpt_path}
modelConfig:
Expand Down Expand Up @@ -229,17 +229,17 @@ model:
dur_model_config: null
f0_model_config: null
energy_model_config: null
v_model_config :
v_model_config :
name : dap
hparams :
n_speaker_dim : 16
hparams :
n_speaker_dim : 16
take_log_of_input: false
bottleneck_hparams:
bottleneck_hparams:
in_dim: 512
reduction_factor: 16
norm: weightnorm
non_linearity: relu
arch_hparams:
arch_hparams:
out_dim: 1
n_layers: 2
n_channels: 256
Expand All @@ -256,7 +256,7 @@ trainer:
accumulate_grad_batches: 1
enable_checkpointing: False
logger: False
gradient_clip_val: 1
gradient_clip_val: 1
log_every_n_steps: 100
check_val_every_n_epoch: 5

Expand Down
56 changes: 28 additions & 28 deletions examples/tts/conf/rad-tts_feature_pred.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ validation_datasets: ???
ckpt_path: ???
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]

whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv"

Expand All @@ -27,8 +27,8 @@ lowfreq: 0
highfreq: 8000
window: "hann"

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""

model:
Expand All @@ -41,7 +41,7 @@ model:

pitch_mean: ${pitch_mean}
pitch_std: ${pitch_std}

text_normalizer:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
Expand All @@ -52,7 +52,7 @@ model:
verbose: false
punct_pre_process: true
punct_post_process: true

text_tokenizer:
_target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
punct: true
Expand Down Expand Up @@ -86,10 +86,10 @@ model:
ignore_file: null
trim: False
pitch_fmin: ${pitch_fmin}
pitch_fmax: ${pitch_fmax}
pitch_fmax: ${pitch_fmax}



text_tokenizer:
_target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
punct: True
Expand Down Expand Up @@ -133,7 +133,7 @@ model:
trim: False
pitch_fmin: ${pitch_fmin}
pitch_fmax: ${pitch_fmax}

text_tokenizer:
_target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
punct: True
Expand Down Expand Up @@ -166,8 +166,8 @@ model:
sched:
name: exp_decay
warmup_steps: 40000
last_epoch: -1
d_model: 1 # Disable scaling based on model dim
last_epoch: -1
d_model: 1 # Disable scaling based on model dim
trainerConfig:
sigma: 1
iters_per_checkpoint: 3000
Expand All @@ -189,7 +189,7 @@ model:
energy_loss_weight: 1.0
vpred_loss_weight: 1.0
unfreeze_modules: "durf0energyvpred"

load_from_checkpoint: True
init_from_ptl_ckpt: ${ckpt_path}
modelConfig:
Expand Down Expand Up @@ -241,66 +241,66 @@ model:
p_dropout: 0.1
noise_to_unvoiced_in_f0: 0
noise_to_pvoiced: 0
dur_model_config:
name: dap
hparams:
n_speaker_dim: 16
bottleneck_hparams:
dur_model_config:
name: dap
hparams:
n_speaker_dim: 16
bottleneck_hparams:
in_dim: 512
reduction_factor: 16
norm: weightnorm
non_linearity: relu
take_log_of_input: true
arch_hparams:
arch_hparams:
out_dim: 1
n_layers: 2
n_channels: 256
kernel_size: 3
p_dropout: 0.1
f0_model_config:
f0_model_config:
name: dap
hparams:
n_speaker_dim: 16
bottleneck_hparams:
bottleneck_hparams:
in_dim: 512
reduction_factor: 16
norm: weightnorm
non_linearity: relu
take_log_of_input: false
arch_hparams:
arch_hparams:
out_dim: 1
n_layers: 2
n_channels: 256
kernel_size: 11
p_dropout: 0.5

energy_model_config:
energy_model_config:
name: dap
hparams:
n_speaker_dim: 16
bottleneck_hparams:
bottleneck_hparams:
in_dim: 512
reduction_factor: 16
norm: weightnorm
non_linearity: relu
take_log_of_input: false
arch_hparams:
arch_hparams:
out_dim: 1
n_layers: 2
n_channels: 256
kernel_size: 3
p_dropout: 0.5
v_model_config :
v_model_config :
name: dap
hparams:
n_speaker_dim: 16
take_log_of_input: false
bottleneck_hparams:
bottleneck_hparams:
in_dim: 512
reduction_factor: 16
norm: weightnorm
non_linearity: relu
arch_hparams:
arch_hparams:
out_dim: 1
n_layers: 2
n_channels: 256
Expand Down
4 changes: 2 additions & 2 deletions examples/tts/conf/tacotron2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ validation_datasets: ???
sup_data_path: null
sup_data_types: null

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

Expand Down Expand Up @@ -79,7 +79,7 @@ model:
batch_size: 48
num_workers: 4
pin_memory: true

validation_ds:
dataset:
_target_: "nemo.collections.tts.torch.data.TTSDataset"
Expand Down
2 changes: 1 addition & 1 deletion nemo/collections/tts/models/fastpitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
@dataclass
class G2PConfig:
_target_: str = "nemo_text_processing.g2p.modules.EnglishG2p"
phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722"
phoneme_probability: float = 0.5

Expand Down
2 changes: 1 addition & 1 deletion nemo/collections/tts/torch/tts_dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ tts_dataset:
pad_with_space: True
g2p:
_target_: nemo_text_processing.g2p.modules.EnglishG2p
phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08"
phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
Loading

0 comments on commit 68f2743

Please sign in to comment.