diff --git a/nemo/collections/tts/losses/fastpitchloss.py b/nemo/collections/tts/losses/fastpitchloss.py
index 1a7e505d8902..f9e9b3926e64 100644
--- a/nemo/collections/tts/losses/fastpitchloss.py
+++ b/nemo/collections/tts/losses/fastpitchloss.py
@@ -138,6 +138,7 @@ def output_types(self):
     def forward(self, spect_predicted, spect_tgt):
         spect_tgt.requires_grad = False
         spect_tgt = spect_tgt.transpose(1, 2)  # (B, T, H)
+        spect_predicted = spect_predicted.transpose(1, 2)  # (B, T, H)
 
         ldiff = spect_tgt.size(1) - spect_predicted.size(1)
         spect_predicted = F.pad(spect_predicted, (0, 0, 0, ldiff, 0, 0), value=0.0)
diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
index b09c4566581e..bf5521df8cb0 100644
--- a/nemo/collections/tts/models/fastpitch.py
+++ b/nemo/collections/tts/models/fastpitch.py
@@ -194,9 +194,10 @@ def forward(
 
     @typecheck(output_types={"spect": NeuralType(('B', 'C', 'T'), MelSpectrogramType())})
     def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, pace: float = 1.0) -> torch.tensor:
+        # FIXME: return masks as well?
         self.eval()
         spect, *_ = self(text=tokens, durs=None, pitch=None, speaker=speaker, pace=pace)
-        return spect.transpose(1, 2)
+        return spect
 
     def training_step(self, batch, batch_idx):
         attn_prior, durs, speakers = None, None, None
@@ -206,7 +207,7 @@ def training_step(self, batch, batch_idx):
             audio, audio_lens, text, text_lens, durs, pitch, speakers = batch
 
         mels, spec_len = self.preprocessor(input_signal=audio, length=audio_lens)
-        mels_pred, _, log_durs_pred, pitch_pred, attn_soft, attn_logprob, attn_hard, attn_hard_dur, pitch = self(
+        mels_pred, _, _, log_durs_pred, pitch_pred, attn_soft, attn_logprob, attn_hard, attn_hard_dur, pitch = self(
             text=text,
             durs=durs,
             pitch=pitch,
@@ -275,7 +276,7 @@ def validation_step(self, batch, batch_idx):
         mels, mel_lens = self.preprocessor(input_signal=audio, length=audio_lens)
 
         # Calculate val loss on ground truth durations to better align L2 loss in time
-        mels_pred, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch = self(
+        mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch = self(
             text=text,
             durs=durs,
             pitch=pitch,
@@ -390,6 +391,7 @@ def output_module(self):
     def forward_for_export(self, text):
         (
             spect,
+            num_frames,
             durs_predicted,
             log_durs_predicted,
             pitch_predicted,
@@ -399,7 +401,7 @@ def forward_for_export(self, text):
             attn_hard_dur,
             pitch,
         ) = self.fastpitch(text=text)
-        return spect, durs_predicted, log_durs_predicted, pitch_predicted
+        return spect, num_frames, durs_predicted, log_durs_predicted, pitch_predicted
 
     @property
     def disabled_deployment_input_names(self):
diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py
index 56e3d50e96d5..ba1bf8a14b1d 100644
--- a/nemo/collections/tts/modules/fastpitch.py
+++ b/nemo/collections/tts/modules/fastpitch.py
@@ -213,6 +213,7 @@ def input_types(self):
     def output_types(self):
         return {
             "spect": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
+            "num_frames": NeuralType(('B'), TokenDurationType()),
             "durs_predicted": NeuralType(('B', 'T'), TokenDurationType()),
             "log_durs_predicted": NeuralType(('B', 'T'), TokenLogDurationType()),
             "pitch_predicted": NeuralType(('B', 'T'), RegressionValuesType()),
@@ -282,9 +283,10 @@ def forward(
 
         # Output FFT
         dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens)
-        spect = self.proj(dec_out)
+        spect = self.proj(dec_out).transpose(1, 2)
         return (
             spect,
+            dec_lens,
             durs_predicted,
             log_durs_predicted,
             pitch_predicted,
diff --git a/nemo/collections/tts/modules/transformer.py b/nemo/collections/tts/modules/transformer.py
index 9c66fb09dce9..c8f12813f805 100644
--- a/nemo/collections/tts/modules/transformer.py
+++ b/nemo/collections/tts/modules/transformer.py
@@ -24,7 +24,7 @@
 
 def mask_from_lens(lens, max_len: Optional[int] = None):
     if max_len is None:
-        max_len = lens.max().item()
+        max_len = lens.max()
     ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype)
     mask = torch.lt(ids, lens.unsqueeze(1))
     return mask
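
For context, a minimal standalone sketch of the updated mask_from_lens from nemo/collections/tts/modules/transformer.py, with an illustrative usage (the example lengths below are made up, not from the diff). Keeping max_len as a 0-dim tensor instead of calling .item() is presumably what keeps the helper graph-friendly for export; torch.arange accepts the tensor end value directly.

    import torch
    from typing import Optional

    def mask_from_lens(lens, max_len: Optional[int] = None):
        # Build a boolean padding mask of shape (B, max_len) from per-item lengths.
        if max_len is None:
            # Same as the diff: keep max_len as a 0-dim tensor, no .item().
            max_len = lens.max()
        ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype)
        mask = torch.lt(ids, lens.unsqueeze(1))
        return mask

    # Illustrative only: lengths 2 and 4 over a batch padded to 4 frames.
    lens = torch.tensor([2, 4])
    print(mask_from_lens(lens))
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True]])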