diff --git a/config/LJSpeech/train.yaml b/config/LJSpeech/train.yaml
index 7992ea4..cdad49b 100644
--- a/config/LJSpeech/train.yaml
+++ b/config/LJSpeech/train.yaml
@@ -4,9 +4,9 @@ dist_config:
   dist_url: "tcp://localhost:80000"
   world_size: 1
 path:
-  ckpt_path: "./output/ckpt/LJSpeech"
-  log_path: "./output/log/LJSpeech"
-  result_path: "./output/result/LJSpeech"
+  ckpt_path: "./output/ckpt/LJSpeech_fixing"
+  log_path: "./output/log/LJSpeech_fixing"
+  result_path: "./output/result/LJSpeech_fixing"
 optimizer:
   batch_size: 64
   betas: [0.9, 0.98]
diff --git a/model/linguistic_encoder.py b/model/linguistic_encoder.py
index 9cd57c5..65def1a 100644
--- a/model/linguistic_encoder.py
+++ b/model/linguistic_encoder.py
@@ -185,9 +185,9 @@ def forward(
         # Phoneme-level Duration Prediction
         log_duration_p_prediction = self.duration_predictor(enc_p_out, src_p_mask)

-        # Word-level Pooling
+        # Word-level Pooling (in log scale)
         log_duration_w_prediction = word_level_pooling(
-            log_duration_p_prediction.unsqueeze(-1), src_p_len, word_boundary, src_w_len, reduce="sum").squeeze(-1)
+            log_duration_p_prediction.exp().unsqueeze(-1), src_p_len, word_boundary, src_w_len, reduce="sum").log().squeeze(-1)

         x = enc_w_out
         if duration_target is not None:
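
Note on the linguistic_encoder.py change: the duration predictor outputs log-durations, so a word's duration should be obtained by converting back to the linear domain, sum-pooling over the word's phonemes, and re-taking the log, rather than summing the log values directly. A minimal standalone sketch (not repo code; toy values assumed) showing why the two differ:

    import torch

    # Log-durations of the two phonemes of one word (linear durations 2 and 3 frames).
    log_d_p = torch.log(torch.tensor([2.0, 3.0]))

    wrong = log_d_p.sum()               # sum of logs  -> log(2 * 3) = log 6
    right = log_d_p.exp().sum().log()   # exp -> sum -> log -> log(2 + 3) = log 5

The patch applies the second form inside word_level_pooling(...), so the word-level prediction stays comparable to a log of the summed ground-truth phoneme durations.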