diff --git a/nemo/collections/tts/losses/fastpitchloss.py b/nemo/collections/tts/losses/fastpitchloss.py
index 1a7e505d8902..f9e9b3926e64 100644
--- a/nemo/collections/tts/losses/fastpitchloss.py
+++ b/nemo/collections/tts/losses/fastpitchloss.py
@@ -138,6 +138,7 @@ def output_types(self):
     def forward(self, spect_predicted, spect_tgt):
         spect_tgt.requires_grad = False
         spect_tgt = spect_tgt.transpose(1, 2)  # (B, T, H)
+        spect_predicted = spect_predicted.transpose(1, 2)  # (B, T, H)
 
         ldiff = spect_tgt.size(1) - spect_predicted.size(1)
         spect_predicted = F.pad(spect_predicted, (0, 0, 0, ldiff, 0, 0), value=0.0)
diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
index b09c4566581e..bf5521df8cb0 100644
--- a/nemo/collections/tts/models/fastpitch.py
+++ b/nemo/collections/tts/models/fastpitch.py
@@ -194,9 +194,10 @@ def forward(
 
     @typecheck(output_types={"spect": NeuralType(('B', 'C', 'T'), MelSpectrogramType())})
     def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, pace: float = 1.0) -> torch.tensor:
+        # FIXME: return masks as well?
         self.eval()
         spect, *_ = self(text=tokens, durs=None, pitch=None, speaker=speaker, pace=pace)
-        return spect.transpose(1, 2)
+        return spect
 
     def training_step(self, batch, batch_idx):
         attn_prior, durs, speakers = None, None, None
@@ -206,7 +207,7 @@ def training_step(self, batch, batch_idx):
             audio, audio_lens, text, text_lens, durs, pitch, speakers = batch
 
         mels, spec_len = self.preprocessor(input_signal=audio, length=audio_lens)
-        mels_pred, _, log_durs_pred, pitch_pred, attn_soft, attn_logprob, attn_hard, attn_hard_dur, pitch = self(
+        mels_pred, _, _, log_durs_pred, pitch_pred, attn_soft, attn_logprob, attn_hard, attn_hard_dur, pitch = self(
             text=text,
             durs=durs,
             pitch=pitch,
@@ -275,7 +276,7 @@ def validation_step(self, batch, batch_idx):
         mels, mel_lens = self.preprocessor(input_signal=audio, length=audio_lens)
 
         # Calculate val loss on ground truth durations to better align L2 loss in time
-        mels_pred, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch = self(
+        mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch = self(
             text=text,
             durs=durs,
             pitch=pitch,
@@ -390,6 +391,7 @@ def output_module(self):
     def forward_for_export(self, text):
         (
             spect,
+            num_frames,
             durs_predicted,
             log_durs_predicted,
             pitch_predicted,
@@ -399,7 +401,7 @@ def forward_for_export(self, text):
             attn_hard_dur,
             pitch,
         ) = self.fastpitch(text=text)
-        return spect, durs_predicted, log_durs_predicted, pitch_predicted
+        return spect, num_frames, durs_predicted, log_durs_predicted, pitch_predicted
 
     @property
     def disabled_deployment_input_names(self):
diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py
index 56e3d50e96d5..ba1bf8a14b1d 100644
--- a/nemo/collections/tts/modules/fastpitch.py
+++ b/nemo/collections/tts/modules/fastpitch.py
@@ -213,6 +213,7 @@ def input_types(self):
     def output_types(self):
         return {
             "spect": NeuralType(('B', 'D', 'T'), MelSpectrogramType()),
+            "num_frames": NeuralType(('B'), TokenDurationType()),
             "durs_predicted": NeuralType(('B', 'T'), TokenDurationType()),
             "log_durs_predicted": NeuralType(('B', 'T'), TokenLogDurationType()),
             "pitch_predicted": NeuralType(('B', 'T'), RegressionValuesType()),
@@ -282,9 +283,10 @@ def forward(
 
         # Output FFT
         dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens)
-        spect = self.proj(dec_out)
+        spect = self.proj(dec_out).transpose(1, 2)
         return (
             spect,
+            dec_lens,
             durs_predicted,
             log_durs_predicted,
             pitch_predicted,
diff --git a/nemo/collections/tts/modules/transformer.py b/nemo/collections/tts/modules/transformer.py
index 9c66fb09dce9..c8f12813f805 100644
--- a/nemo/collections/tts/modules/transformer.py
+++ b/nemo/collections/tts/modules/transformer.py
@@ -24,7 +24,7 @@
 
 def mask_from_lens(lens, max_len: Optional[int] = None):
     if max_len is None:
-        max_len = lens.max().item()
+        max_len = lens.max()
     ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype)
     mask = torch.lt(ids, lens.unsqueeze(1))
     return mask
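
For context, a minimal standalone sketch of the updated mask_from_lens from nemo/collections/tts/modules/transformer.py, with an illustrative usage (the example lengths below are made up, not from the diff). Keeping max_len as a 0-dim tensor instead of calling .item() is presumably what keeps the helper graph-friendly for export; torch.arange accepts the tensor end value directly.

    import torch
    from typing import Optional

    def mask_from_lens(lens, max_len: Optional[int] = None):
        # Build a boolean padding mask of shape (B, max_len) from per-item lengths.
        if max_len is None:
            # Same as the diff: keep max_len as a 0-dim tensor, no .item().
            max_len = lens.max()
        ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype)
        mask = torch.lt(ids, lens.unsqueeze(1))
        return mask

    # Illustrative only: lengths 2 and 4 over a batch padded to 4 frames.
    lens = torch.tensor([2, 4])
    print(mask_from_lens(lens))
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True]])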