From 30db4d4f3fff2cd8304e4b64e5141f87e5292123 Mon Sep 17 00:00:00 2001 From: treacker <36159472+treacker@users.noreply.github.com> Date: Wed, 11 May 2022 19:25:37 +0400 Subject: [PATCH] Tacotron2 retrain (#4103) * fix yaml Signed-off-by: treacker * Fix for new TTSDataset class Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * inference fix Signed-off-by: treacker * removed old code Signed-off-by: treacker * updated parser logic Signed-off-by: treacker * reverted version update Signed-off-by: treacker * refactored parser logic Signed-off-by: treacker * Updated Jenkinsfile Signed-off-by: treacker * Refactored tutorial for Tacotron2 Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Update Jenkinsfile Signed-off-by: treacker * Update tacotron.yaml Signed-off-by: treacker * Refactoring Signed-off-by: treacker * cleaned up TN/ ITN doc (#4119) * cleaned up TN/ ITN doc Signed-off-by: Yang Zhang * fix typo Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang Signed-off-by: treacker * Check implicit grad acc in GLUE dataset building (#4123) * Check implicit grad acc in GLUE dataset building Signed-off-by: MaximumEntropy * Fix jenkins test for GLUE/XNLI Signed-off-by: MaximumEntropy Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Fixed jenkins Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker Co-authored-by: Yang Zhang Co-authored-by: Sandeep Subramanian --- Jenkinsfile | 12 +- examples/tts/conf/tacotron2.yaml | 163 ++++-- nemo/collections/tts/helpers/helpers.py | 68 +++ nemo/collections/tts/models/tacotron2.py | 172 ++++-- tutorials/tts/Tacotron2_Training.ipynb | 680 +++++++++++------------ 5 files changed, 640 insertions(+), 455 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a8b6b8067ba3..63978e89c011 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2983,7 +2983,6 @@ pipeline { } } parallel { - // TODO(Oktai15): update it in 1.8.0 version stage('Tacotron 2') { steps { sh 'python examples/tts/tacotron2.py \ @@ -2993,13 +2992,18 @@ pipeline { trainer.accelerator="gpu" \ +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ trainer.strategy=null \ - model.train_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.batch_size=4 \ model.decoder.decoder_rnn_dim=256 \ model.decoder.attention_rnn_dim=1024 \ model.decoder.prenet_dim=128 \ model.postnet.postnet_n_convolutions=3 \ - ~trainer.check_val_every_n_epoch' + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=1 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=1 \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs \ + ~trainer.check_val_every_n_epoch \ + ' } } stage('WaveGlow') { diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index 58fbf4d750c2..a12b8d5489d6 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -1,81 +1,136 @@ -# TODO(Oktai15): update this config in 1.8.0 version +# This config contains the default values for training Tacotron2 model on LJSpeech dataset. 
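# A typical LJSpeech run with this config (manifest paths below are placeholders) looks like:
#   python examples/tts/tacotron2.py \
#     train_dataset=<ljspeech_train_manifest.json> \
#     validation_datasets=<ljspeech_val_manifest.json> \
#     trainer.devices=1 trainer.max_epochs=<num_epochs>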
+# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. name: Tacotron2 -sample_rate: 22050 -# , , will be added by the tacotron2.py script -labels: [' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z'] -n_fft: 1024 -n_mels: 80 -fmax: 8000 -n_stride: 256 -pad_value: -11.52 + train_dataset: ??? validation_datasets: ??? +sup_data_path: null +sup_data_types: null + +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" +whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv" + + model: - labels: ${labels} + pitch_fmin: 65.40639132514966 + pitch_fmax: 2093.004522404789 + + sample_rate: 22050 + n_mel_channels: 80 + n_window_size: 1024 + n_window_stride: 256 + n_fft: 1024 + lowfreq: 0 + highfreq: 8000 + window: hann + pad_value: -11.52 + + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: en + input_case: cased + whitelist: ${whitelist_path} + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer + punct: true + stresses: true + chars: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.torch.g2ps.EnglishG2p + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + train_ds: dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" + _target_: "nemo.collections.tts.torch.data.TTSDataset" manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} max_duration: null min_duration: 0.1 - trim: false - int_values: false - normalize: true - sample_rate: ${sample_rate} - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in Tacotron2 + ignore_file: null + trim: False + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} dataloader_params: drop_last: false shuffle: true batch_size: 48 num_workers: 4 - - + pin_memory: false + validation_ds: dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" - manifest_filepath: ${validation_datasets} + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} max_duration: null min_duration: 0.1 - int_values: false - normalize: true - sample_rate: ${sample_rate} - trim: false - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in 
Tacotron2 + ignore_file: null + trim: False + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} dataloader_params: drop_last: false shuffle: false - batch_size: 48 + batch_size: 24 num_workers: 8 + pin_memory: false preprocessor: _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - dither: 0.0 - nfilt: ${n_mels} - frame_splicing: 1 - highfreq: ${fmax} + nfilt: ${model.n_mel_channels} + highfreq: ${model.highfreq} log: true log_zero_guard_type: clamp log_zero_guard_value: 1e-05 - lowfreq: 0 - mag_power: 1.0 - n_fft: ${n_fft} - n_window_size: 1024 - n_window_stride: ${n_stride} - normalize: null + lowfreq: ${model.lowfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + n_window_stride: ${model.n_window_stride} pad_to: 16 - pad_value: ${pad_value} + pad_value: ${model.pad_value} + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null preemph: null - sample_rate: ${sample_rate} - window: hann + dither: 0.0 + frame_splicing: 1 + stft_conv: false + nb_augmentation_prob : 0 + mag_power: 1.0 + exact_pad: true + use_grads: false encoder: _target_: nemo.collections.tts.modules.tacotron2.Encoder @@ -90,7 +145,7 @@ model: gate_threshold: 0.5 max_decoder_steps: 1000 n_frames_per_step: 1 # currently only 1 is supported - n_mel_channels: ${n_mels} + n_mel_channels: ${model.n_mel_channels} p_attention_dropout: 0.1 p_decoder_dropout: 0.1 prenet_dim: 256 @@ -105,7 +160,7 @@ model: postnet: _target_: nemo.collections.tts.modules.tacotron2.Postnet - n_mel_channels: ${n_mels} + n_mel_channels: ${model.n_mel_channels} p_dropout: 0.5 postnet_embedding_dim: 512 postnet_kernel_size: 5 @@ -132,11 +187,15 @@ trainer: enable_checkpointing: False # Provided by exp_manager logger: False # Provided by exp_manager gradient_clip_val: 1.0 - log_every_n_steps: 200 - check_val_every_n_epoch: 25 + log_every_n_steps: 60 + check_val_every_n_epoch: 2 + exp_manager: exp_dir: null name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min diff --git a/nemo/collections/tts/helpers/helpers.py b/nemo/collections/tts/helpers/helpers.py index fea53bf1d8fa..995c6bd59fac 100644 --- a/nemo/collections/tts/helpers/helpers.py +++ b/nemo/collections/tts/helpers/helpers.py @@ -56,6 +56,12 @@ from nemo.utils import logging +HAVE_WANDB = True +try: + import wandb +except ModuleNotFoundError: + HAVE_WANDB = False + try: from pytorch_lightning.utilities import rank_zero_only except ModuleNotFoundError: @@ -284,6 +290,7 @@ def tacotron2_log_to_tb_func( step, dataformats="HWC", ) + if add_audio: filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmax=fmax) log_mel = mel_postnet[0].data.cpu().numpy().T @@ -299,6 +306,67 @@ def tacotron2_log_to_tb_func( swriter.add_audio(f"audio/{tag}_target", audio / max(np.abs(audio)), step, sample_rate=sr) +def tacotron2_log_to_wandb_func( + swriter, + tensors, + step, + tag="train", + log_images=False, + log_images_freq=1, + add_audio=True, + griffin_lim_mag_scale=1024, + griffin_lim_power=1.2, + sr=22050, + n_fft=1024, + n_mels=80, + fmax=8000, +): + _, spec_target, mel_postnet, gate, gate_target, alignments = tensors + if not HAVE_WANDB: + return + if log_images and step % log_images_freq == 0: + alignments = [] + specs = [] + gates = [] + alignments += [ + wandb.Image(plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), caption=f"{tag}_alignment",) 
+ ] + alignments += [ + wandb.Image(plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), caption=f"{tag}_mel_target",), + wandb.Image(plot_spectrogram_to_numpy(mel_postnet[0].data.cpu().numpy()), caption=f"{tag}_mel_predicted",), + ] + gates += [ + wandb.Image( + plot_gate_outputs_to_numpy( + gate_target[0].data.cpu().numpy(), torch.sigmoid(gate[0]).data.cpu().numpy(), + ), + caption=f"{tag}_gate", + ) + ] + + swriter.log({"specs": specs, "alignments": alignments, "gates": gates}) + + if add_audio: + audios = [] + filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmax=fmax) + log_mel = mel_postnet[0].data.cpu().numpy().T + mel = np.exp(log_mel) + magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale + audio_pred = griffin_lim(magnitude.T ** griffin_lim_power) + + log_mel = spec_target[0].data.cpu().numpy().T + mel = np.exp(log_mel) + magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale + audio_true = griffin_lim(magnitude.T ** griffin_lim_power) + + audios += [ + wandb.Audio(audio_true / max(np.abs(audio_true)), caption=f"{tag}_wav_target", sample_rate=sr,), + wandb.Audio(audio_pred / max(np.abs(audio_pred)), caption=f"{tag}_wav_predicted", sample_rate=sr,), + ] + + swriter.log({"audios": audios}) + + def plot_alignment_to_numpy(alignment, info=None): fig, ax = plt.subplots(figsize=(6, 4)) im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none') diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index baae5f2f43fb..60146741d900 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
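# Note on the W&B logging path added in helpers.py above: tacotron2_log_to_wandb_func
# silently returns when the `wandb` package is missing (HAVE_WANDB guard) and is only
# reached when the trainer has a WandbLogger attached. A minimal sketch of attaching one,
# assuming the standard NeMo exp_manager options, is via the experiment manager config:
#
#   exp_manager:
#     create_wandb_logger: true
#     wandb_logger_kwargs:
#       name: tacotron2_run   # hypothetical run name
#       project: tts          # hypothetical project name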
+import contextlib from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -19,11 +20,15 @@ from hydra.utils import instantiate from omegaconf import MISSING, DictConfig, OmegaConf, open_dict from omegaconf.errors import ConfigAttributeError -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger from torch import nn from nemo.collections.common.parts.preprocessing import parsers -from nemo.collections.tts.helpers.helpers import get_mask_from_lengths, tacotron2_log_to_tb_func +from nemo.collections.tts.helpers.helpers import ( + get_mask_from_lengths, + tacotron2_log_to_tb_func, + tacotron2_log_to_wandb_func, +) from nemo.collections.tts.losses.tacotron2loss import Tacotron2Loss from nemo.collections.tts.models.base import SpectrogramGenerator from nemo.core.classes.common import PretrainedModelInfo, typecheck @@ -36,7 +41,7 @@ SequenceToSequenceAlignmentType, ) from nemo.core.neural_types.neural_type import NeuralType -from nemo.utils import logging +from nemo.utils import logging, model_utils @dataclass @@ -60,8 +65,28 @@ class Tacotron2Model(SpectrogramGenerator): """Tacotron 2 Model that is used to generate mel spectrograms from text""" def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): - if isinstance(cfg, dict): - cfg = OmegaConf.create(cfg) + # Convert to Hydra 1.0 compatible DictConfig + cfg = model_utils.convert_model_config_to_dict_config(cfg) + cfg = model_utils.maybe_update_config_version(cfg) + + # setup normalizer + self.normalizer = None + self.text_normalizer_call = None + self.text_normalizer_call_kwargs = {} + self._setup_normalizer(cfg) + + # setup tokenizer + self.tokenizer = None + if hasattr(cfg, 'text_tokenizer'): + self._setup_tokenizer(cfg) + + self.num_tokens = len(self.tokenizer.tokens) + self.tokenizer_pad = self.tokenizer.pad + self.tokenizer_unk = self.tokenizer.oov + # assert self.tokenizer is not None + else: + self.num_tokens = len(cfg.labels) + 3 + super().__init__(cfg=cfg, trainer=trainer) schema = OmegaConf.structured(Tacotron2Config) @@ -73,17 +98,17 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Ensure passed cfg is compliant with schema try: OmegaConf.merge(cfg, schema) - self.pad_value = self._cfg.preprocessor.pad_value + self.pad_value = cfg.preprocessor.pad_value except ConfigAttributeError: - self.pad_value = self._cfg.preprocessor.params.pad_value + self.pad_value = cfg.preprocessor.params.pad_value logging.warning( "Your config is using an old NeMo yaml configuration. Please ensure that the yaml matches the " "current version in the main branch for future compatibility." 
) self._parser = None - self.audio_to_melspec_precessor = instantiate(self._cfg.preprocessor) - self.text_embedding = nn.Embedding(len(cfg.labels) + 3, 512) + self.audio_to_melspec_precessor = instantiate(cfg.preprocessor) + self.text_embedding = nn.Embedding(self.num_tokens, 512) self.encoder = instantiate(self._cfg.encoder) self.decoder = instantiate(self._cfg.decoder) self.postnet = instantiate(self._cfg.postnet) @@ -94,46 +119,45 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): def parser(self): if self._parser is not None: return self._parser - if self._validation_dl is not None: - return self._validation_dl.dataset.manifest_processor.parser - if self._test_dl is not None: - return self._test_dl.dataset.manifest_processor.parser - if self._train_dl is not None: - return self._train_dl.dataset.manifest_processor.parser - - # Else construct a parser - # Try to get params from validation, test, and then train - params = {} - try: - params = self._cfg.validation_ds.dataset - except ConfigAttributeError: - pass - if params == {}: - try: - params = self._cfg.test_ds.dataset - except ConfigAttributeError: - pass - if params == {}: - try: - params = self._cfg.train_ds.dataset - except ConfigAttributeError: - pass - - name = params.get('parser', None) or 'en' - unk_id = params.get('unk_index', None) or -1 - blank_id = params.get('blank_index', None) or -1 - do_normalize = params.get('normalize', True) - self._parser = parsers.make_parser( - labels=self._cfg.labels, name=name, unk_id=unk_id, blank_id=blank_id, do_normalize=do_normalize, - ) + + ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1] + if ds_class_name == "TTSDataset": + self._parser = None + elif hasattr(self._cfg, "labels"): + self._parser = parsers.make_parser( + labels=self._cfg.labels, + name='en', + unk_id=-1, + blank_id=-1, + do_normalize=True, + abbreviation_version="fastpitch", + make_table=False, + ) + elif ds_class_name == "AudioToCharWithPriorAndPitchDataset": + self.parser = self.vocab.encode + else: + raise ValueError("Wanted to setup parser, but model does not have necessary paramaters") + return self._parser - def parse(self, str_input: str) -> torch.tensor: - tokens = self.parser(str_input) - # Parser doesn't add bos and eos ids, so maunally add it - tokens = [len(self._cfg.labels)] + tokens + [len(self._cfg.labels) + 1] + def parse(self, text: str, normalize=True) -> torch.Tensor: + if self.training: + logging.warning("parse() is meant to be called in eval mode.") + if normalize and self.text_normalizer_call is not None: + text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs) + + eval_phon_mode = contextlib.nullcontext() + if hasattr(self.tokenizer, "set_phone_prob"): + eval_phon_mode = self.tokenizer.set_phone_prob(prob=1.0) + + with eval_phon_mode: + if self.tokenizer is not None: + tokens = self.tokenizer.encode(text) + else: + tokens = self.parser(text) + # Old parser doesn't add bos and eos ids, so maunally add it + tokens = [len(self._cfg.labels)] + tokens + [len(self._cfg.labels) + 1] tokens_tensor = torch.tensor(tokens).unsqueeze_(0).to(self.device) - return tokens_tensor @property @@ -259,18 +283,56 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): if self.logger is not None and self.logger.experiment is not None: - tb_logger = self.logger.experiment + logger = self.logger.experiment if isinstance(self.logger, LoggerCollection): for logger in self.logger: if isinstance(logger, TensorBoardLogger): - tb_logger = 
logger.experiment + logger = logger.experiment break - tacotron2_log_to_tb_func( - tb_logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, - ) + if isinstance(logger, TensorBoardLogger): + tacotron2_log_to_tb_func( + logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, + ) + elif isinstance(logger, WandbLogger): + tacotron2_log_to_wandb_func( + logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False, + ) avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() # This reduces across batches, not workers! self.log('val_loss', avg_loss) + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + def _setup_tokenizer(self, cfg): + text_tokenizer_kwargs = {} + if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: + g2p_kwargs = {} + + if "phoneme_dict" in cfg.text_tokenizer.g2p: + g2p_kwargs["phoneme_dict"] = self.register_artifact( + 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + ) + + if "heteronyms" in cfg.text_tokenizer.g2p: + g2p_kwargs["heteronyms"] = self.register_artifact( + 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + ) + + text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) + + self.tokenizer = instantiate(cfg.text_tokenizer, **text_tokenizer_kwargs) + def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"): if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig): raise ValueError(f"No dataset for {name}") @@ -289,11 +351,13 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na elif not shuffle_should_be and cfg.dataloader_params.shuffle: logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!") - labels = self._cfg.labels - dataset = instantiate( - cfg.dataset, labels=labels, bos_id=len(labels), eos_id=len(labels) + 1, pad_id=len(labels) + 2 + cfg.dataset, + text_normalizer=self.normalizer, + text_normalizer_call_kwargs=self.text_normalizer_call_kwargs, + text_tokenizer=self.tokenizer, ) + return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) def setup_training_data(self, cfg): diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 8109b734af60..be021cb78212 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -1,347 +1,337 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "htbJiaJjYQAD" - }, - "source": [ - "# Tacotron 2 Training\n", - "\n", - "This notebook is designed to provide a guide on how to train Tacotron2 as part of the TTS pipeline. It contains the following sections\n", - "\n", - " 1. Tacotron2 and NeMo - An introduction to the Tacotron2 model\n", - " 2. LJSpeech - How to train Tacotron2 on LJSpeech\n", - " 3. 
Custom Datasets - How to collect audio data to train Tacotron2 for difference voices and languages" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wqPMTEXXYUP4" - }, - "source": [ - "# License\n", - "\n", - "> Copyright 2020 NVIDIA. All Rights Reserved.\n", - "> \n", - "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "> you may not use this file except in compliance with the License.\n", - "> You may obtain a copy of the License at\n", - "> \n", - "> http://www.apache.org/licenses/LICENSE-2.0\n", - "> \n", - "> Unless required by applicable law or agreed to in writing, software\n", - "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "> See the License for the specific language governing permissions and\n", - "> limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SUkq9HAvYU7T" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies# .\n", - "\"\"\"\n", - "BRANCH = 'r1.9.0'\n", - "# # If you're using Colab and not running locally, uncomment and run this cell.\n", - "# !apt-get install sox libsndfile1 ffmpeg\n", - "# !pip install wget unidecode\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZivXzmq0YYLj" - }, - "source": [ - "# Tacotron2 and NeMo\n", - "\n", - "Tacotron2 is a neural network that converts text characters into a mel spectrogram. For more details on the model, please refer to Nvidia's [Tacotron2 Model Card](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2), or the original [paper](https://arxiv.org/abs/1712.05884).\n", - "\n", - "Tacotron2 like most NeMo models are defined as a LightningModule, allowing for easy training via PyTorch Lightning, and parameterized by a configuration, currently defined via a yaml file and loading using Hydra.\n", - "\n", - "Let's take a look using NeMo's pretrained model and how to use it to generate spectrograms." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HEvdSU5WYZbj" - }, - "outputs": [], - "source": [ - "# Load the Tacotron2Model\n", - "from nemo.collections.tts.models import Tacotron2Model\n", - "from nemo.collections.tts.models.base import SpectrogramGenerator\n", - "\n", - "# Let's see what pretrained models are available\n", - "print(Tacotron2Model.list_available_models())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3W8unatgYbUp" - }, - "outputs": [], - "source": [ - "# We can load the pre-trained model as follows\n", - "model = Tacotron2Model.from_pretrained(\"tts_en_tacotron2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xsyBa9tIdHp4" - }, - "outputs": [], - "source": [ - "# Tacotron2 is a SpectrogramGenerator\n", - "assert isinstance(model, SpectrogramGenerator)\n", - "\n", - "# SpectrogramGenerators in NeMo have two helper functions:\n", - "# 1. parse(str_input: str, **kwargs) which takes an English string and produces a token tensor\n", - "# 2. generate_spectrogram(tokens: 'torch.tensor', **kwargs) which takes the token tensor and generates a spectrogram\n", - "# Let's try it out\n", - "tokens = model.parse(str_input = \"Hey, this produces speech!\")\n", - "spectrogram = model.generate_spectrogram(tokens = tokens)\n", - "\n", - "# Now we can visualize the generated spectrogram\n", - "# If we want to generate speech, we have to use a vocoder in conjunction to a spectrogram generator.\n", - "# Refer to the TTS Inference notebook on how to convert spectrograms to speech.\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "%matplotlib inline\n", - "imshow(spectrogram.cpu().detach().numpy()[0,...], origin=\"lower\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zZ90eCfdrNIf" - }, - "source": [ - "# Training\n", - "\n", - "Now that we looked at the Tacotron2 model, let's see how to train a Tacotron2 Model\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7rHG-LERrPRY" - }, - "outputs": [], - "source": [ - "# NeMo's training scripts are stored inside the examples/ folder. Let's grab the tacotron2.py file\n", - "# as well as the tacotron2.yaml file\n", - "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", - "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Upv_LxBIsC51" - }, - "source": [ - "Let's take a look at the tacotron2.py file\n", - "\n", - "```python\n", - "import pytorch_lightning as pl\n", - "\n", - "from nemo.collections.common.callbacks import LogEpochTimeCallback\n", - "from nemo.collections.tts.models import Tacotron2Model\n", - "from nemo.core.config import hydra_runner\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "\n", - "# hydra_runner is a thin NeMo wrapper around Hydra\n", - "# It looks for a config named tacotron2.yaml inside the conf folder\n", - "# Hydra parses the yaml and returns it as a Omegaconf DictConfig\n", - "@hydra_runner(config_path=\"conf\", config_name=\"tacotron2\")\n", - "def main(cfg):\n", - " # Define the Lightning trainer\n", - " trainer = pl.Trainer(**cfg.trainer)\n", - " # exp_manager is a NeMo construct that helps with logging and checkpointing\n", - " exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - " # Define the Tacotron 2 model, this will construct the model as well as\n", - " # define the training and validation dataloaders\n", - " model = Tacotron2Model(cfg=cfg.model, trainer=trainer)\n", - " # Let's add a few more callbacks\n", - " lr_logger = pl.callbacks.LearningRateMonitor()\n", - " epoch_time_logger = LogEpochTimeCallback()\n", - " trainer.callbacks.extend([lr_logger, epoch_time_logger])\n", - " # Call lightning trainer's fit() to train the model\n", - " trainer.fit(model)\n", - "\n", - "\n", - "if __name__ == '__main__':\n", - " main() # noqa pylint: disable=no-value-for-parameter\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6nM-fZO-s75u" - }, - "source": [ - "Let's take a look at the yaml config\n", - "\n", - "```yaml\n", - "name: &name Tacotron2\n", - "sample_rate: &sr 22050\n", - "# , , will be added by the tacotron2.py script\n", - "labels: &labels [' ', '!', '\"', \"'\", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',\n", - " 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']',\n", - " 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',\n", - " 'u', 'v', 'w', 'x', 'y', 'z']\n", - "n_fft: &n_fft 1024\n", - "n_mels: &n_mels 80\n", - "fmax: &fmax null\n", - "n_stride: &n_window_stride 256\n", - "pad_value: &pad_value -11.52\n", - "train_dataset: ???\n", - "validation_datasets: ???\n", - "```\n", - "\n", - "The first part of the yaml defines some parameters used by Tacotron. You can see\n", - "that the sample rate is set to 22050 for LJSpeech. You can also see that this\n", - "model has characters for labels instead of phones. To use phones as input,\n", - "see the GlowTTS yaml and setup for an example.\n", - "\n", - "Looking at the yaml, there is `train_dataset: ???` and `validation_datasets: ???`. The ??? 
indicates to hydra that these values must be passed via the command line or the script will fail.\n", - "\n", - "Looking further down the yaml, we get to the pytorch lightning trainer parameters.\n", - "\n", - "```yaml\n", - "trainer:\n", - " devices: 1 # number of gpus\n", - " accelerator: 'gpu' \n", - " max_epochs: ???\n", - " num_nodes: 1\n", - " accelerator: 'gpu'\n", - " strategy: 'dp'\n", - " accumulate_grad_batches: 1\n", - " enable_checkpointing: False # Provided by exp_manager\n", - " logger: False # Provided by exp_manager\n", - " gradient_clip_val: 1.0\n", - " log_every_n_steps: 200\n", - " check_val_every_n_epoch: 25\n", - "```\n", - "\n", - "These values can be changed either by editing the yaml or through the command line.\n", - "\n", - "Let's grab some simple audio data and test Tacotron2." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GnEzODcorugt" - }, - "outputs": [], - "source": [ - "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz \\\n", - " && mkdir -p tests/data \\\n", - " && tar xzf test_data.tar.gz -C tests/data\n", - "\n", - "# Just like ASR, the Tacotron2 require .json files to define the training and validation data.\n", - "!cat tests/data/asr/an4_val.json\n", - "\n", - "# Now that we have some sample data, we can try training Tacotron 2\n", - "# NOTE: The sample data is not enough data to properly train a Tacotron 2. This will not result in a trained Tacotron 2 and is used to illustrate how to train Tacotron 2 model\n", - "!python tacotron2.py \\\n", - "sample_rate=16000 \\\n", - "train_dataset=tests/data/asr/an4_train.json \\\n", - "validation_datasets=tests/data/asr/an4_val.json \\\n", - "trainer.max_epochs=3 \\\n", - "trainer.accelerator=null trainer.check_val_every_n_epoch=1 \\\n", - "+trainer.gpus=1" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9erGDGZJ1H_p" - }, - "source": [ - "# Training Data\n", - "\n", - "In order to train Tacotron2, it is highly recommended to obtain high quality speech data with the following properties:\n", - " - Sampling rate of 22050Hz or higher\n", - " - Single speaker\n", - " - Speech should contain a variety of speech phonemes\n", - " - Audio split into segments of 1-10 seconds\n", - " - Audio segments should not have silence at the beginning and end\n", - " - Audio segments should not contain long silences inside\n", - "\n", - "After obtaining the speech data and splitting into training, validation, and test sections, it is required to construct .json files to tell NeMo where to find these audio files.\n", - "\n", - "The .json files should adhere to the format required by the `nemo.collections.asr.data.audio_to_text.AudioToCharDataset` class. 
For example, here is a sample .json file\n", - "\n", - "```json\n", - "{\"audio_filepath\": \"/path/to/audio1.wav\", \"text\": \"the transcription\", \"duration\": 0.82}\n", - "{\"audio_filepath\": \"/path/to/audio2.wav\", \"text\": \"the other transcription\", \"duration\": 2.1}\n", - "...\n", - "```\n", - "Please note that the duration is in seconds.\n", - "\n", - "Lastly, update the labels inside the Tacotron 2 yaml config if your data contains a different set of characters.\n", - "\n", - "Then you are ready to run your training script:\n", - "```bash\n", - "python tacotron2.py train_dataset=YOUR_TRAIN.json validation_datasets=YOUR_VAL.json trainer.devices=-1\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Taco2.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "htbJiaJjYQAD" + }, + "source": [ + "# Tacotron 2 Training\n", + "\n", + "This notebook is designed to provide a guide on how to train Tacotron2 as part of the TTS pipeline. It contains the following sections\n", + "\n", + " 1. Tacotron2 and NeMo - An introduction to the Tacotron2 model\n", + " 2. LJSpeech - How to train Tacotron2 on LJSpeech\n", + " 3. Custom Datasets - How to collect audio data to train Tacotron2 for difference voices and languages" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wqPMTEXXYUP4" + }, + "source": [ + "# License\n", + "\n", + "> Copyright 2020 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SUkq9HAvYU7T" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'main'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZivXzmq0YYLj" + }, + "source": [ + "# Tacotron2 and NeMo\n", + "\n", + "Tacotron2 is a neural network that converts text characters into a mel spectrogram. For more details on the model, please refer to Nvidia's [Tacotron2 Model Card](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2), or the original [paper](https://arxiv.org/abs/1712.05884).\n", + "\n", + "Tacotron2 like most NeMo models are defined as a LightningModule, allowing for easy training via PyTorch Lightning, and parameterized by a configuration, currently defined via a yaml file and loading using Hydra.\n", + "\n", + "Let's take a look using NeMo's pretrained model and how to use it to generate spectrograms." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HEvdSU5WYZbj" + }, + "outputs": [], + "source": [ + "# Load the Tacotron2Model\n", + "from nemo.collections.tts.models import Tacotron2Model\n", + "from nemo.collections.tts.models.base import SpectrogramGenerator\n", + "\n", + "# Let's see what pretrained models are available\n", + "print(Tacotron2Model.list_available_models())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3W8unatgYbUp" + }, + "outputs": [], + "source": [ + "# We can load the pre-trained model as follows\n", + "model = Tacotron2Model.from_pretrained(\"tts_en_tacotron2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xsyBa9tIdHp4" + }, + "outputs": [], + "source": [ + "# Tacotron2 is a SpectrogramGenerator\n", + "assert isinstance(model, SpectrogramGenerator)\n", + "\n", + "# SpectrogramGenerators in NeMo have two helper functions:\n", + "# 1. parse(self, text: str, normalize=True) which takes an English string and produces a token tensor\n", + "# 2. generate_spectrogram(self, *, tokens) which takes the token tensor and generates a spectrogram\n", + "# Let's try it out\n", + "tokens = model.parse(text = \"Hey, this produces speech!\")\n", + "spectrogram = model.generate_spectrogram(tokens = tokens)\n", + "\n", + "# Now we can visualize the generated spectrogram\n", + "# If we want to generate speech, we have to use a vocoder in conjunction to a spectrogram generator.\n", + "# Refer to the TTS Inference notebook on how to convert spectrograms to speech.\n", + "from matplotlib.pyplot import imshow\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "imshow(spectrogram.cpu().detach().numpy()[0,...], origin=\"lower\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zZ90eCfdrNIf" + }, + "source": [ + "# Training\n", + "\n", + "Now that we looked at the Tacotron2 model, let's see how to train a Tacotron2 Model\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7rHG-LERrPRY" + }, + "outputs": [], + "source": [ + "# NeMo's training scripts are stored inside the examples/ folder. 
Let's grab the tacotron2.py file\n", + "# as well as the tacotron2.yaml file\n", + "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", + "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Upv_LxBIsC51" + }, + "source": [ + "Let's take a look at the tacotron2.py file\n", + "\n", + "```python\n", + "import pytorch_lightning as pl\n", + "\n", + "from nemo.collections.common.callbacks import LogEpochTimeCallback\n", + "from nemo.collections.tts.models import Tacotron2Model\n", + "from nemo.core.config import hydra_runner\n", + "from nemo.utils.exp_manager import exp_manager\n", + "\n", + "\n", + "# hydra_runner is a thin NeMo wrapper around Hydra\n", + "# It looks for a config named tacotron2.yaml inside the conf folder\n", + "# Hydra parses the yaml and returns it as a Omegaconf DictConfig\n", + "@hydra_runner(config_path=\"conf\", config_name=\"tacotron2\")\n", + "def main(cfg):\n", + " # Define the Lightning trainer\n", + " trainer = pl.Trainer(**cfg.trainer)\n", + " # exp_manager is a NeMo construct that helps with logging and checkpointing\n", + " exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", + " # Define the Tacotron 2 model, this will construct the model as well as\n", + " # define the training and validation dataloaders\n", + " model = Tacotron2Model(cfg=cfg.model, trainer=trainer)\n", + " # Let's add a few more callbacks\n", + " lr_logger = pl.callbacks.LearningRateMonitor()\n", + " epoch_time_logger = LogEpochTimeCallback()\n", + " trainer.callbacks.extend([lr_logger, epoch_time_logger])\n", + " # Call lightning trainer's fit() to train the model\n", + " trainer.fit(model)\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " main() # noqa pylint: disable=no-value-for-parameter\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6nM-fZO-s75u" + }, + "source": [ + "Let's take a look at the yaml config\n", + "\n", + "```yaml\n", + "name: &name Tacotron2\n", + "\n", + "train_dataset: ???\n", + "validation_datasets: ???\n", + "sup_data_path: null\n", + "sup_data_types: null\n", + "\n", + "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.01\"\n", + "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-030921\"\n", + "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv\"\n", + "```\n", + "\n", + "The first part of the yaml defines dataset parameters used by Tacotron. Then in the head of 'model' section there are processing - related parameters. You can see\n", + "that the sample rate is set to 22050 for LJSpeech. \n", + "\n", + "Looking at the yaml, there is `train_dataset: ???` and `validation_datasets: ???`. The ??? 
indicates to hydra that these values must be passed via the command line or the script will fail.\n", + "\n", + "Looking further down the yaml, we get to the pytorch lightning trainer parameters.\n", + "\n", + "```yaml\n", + "trainer:\n", + " devices: 1 # number of gpus\n", + " accelerator: 'gpu' \n", + " max_epochs: ???\n", + " num_nodes: 1\n", + " accelerator: 'gpu'\n", + " strategy: 'ddp'\n", + " accumulate_grad_batches: 1\n", + " enable_checkpointing: False # Provided by exp_manager\n", + " logger: False # Provided by exp_manager\n", + " gradient_clip_val: 1.0\n", + " log_every_n_steps: 200\n", + " check_val_every_n_epoch: 25\n", + "```\n", + "\n", + "These values can be changed either by editing the yaml or through the command line.\n", + "\n", + "Let's grab some simple audio data and test Tacotron2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GnEzODcorugt" + }, + "outputs": [], + "source": [ + "!wget https://github.com/NVIDIA/NeMo/releases/download/v0.11.0/test_data.tar.gz && mkdir -p tests/data && tar xzf test_data.tar.gz -C tests/data\n", + "\n", + "# Just like ASR, the Tacotron2 require .json files to define the training and validation data.\n", + "!cat tests/data/asr/an4_val.json\n", + "\n", + "# Now that we have some sample data, we can try training Tacotron 2\n", + "# NOTE: The sample data is not enough data to properly train a Tacotron 2. This will not result in a trained Tacotron 2 and is used to illustrate how to train Tacotron 2 model\n", + "!python tacotron2.py sample_rate=16000 train_dataset=tests/data/asr/an4_train.json validation_datasets=tests/data/asr/an4_val.json trainer.max_epochs=3 trainer.accelerator=null trainer.check_val_every_n_epoch=1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9erGDGZJ1H_p" + }, + "source": [ + "# Training Data\n", + "\n", + "In order to train Tacotron2, it is highly recommended to obtain high quality speech data with the following properties:\n", + " - Sampling rate of 22050Hz or higher\n", + " - Single speaker\n", + " - Speech should contain a variety of speech phonemes\n", + " - Audio split into segments of 1-10 seconds\n", + " - Audio segments should not have silence at the beginning and end\n", + " - Audio segments should not contain long silences inside\n", + "\n", + "After obtaining the speech data and splitting into training, validation, and test sections, it is required to construct .json files to tell NeMo where to find these audio files.\n", + "\n", + "The .json files should adhere to the format required by the `nemo.collections.tts.torch.data.TTSDataset` class. 
For example, here is a sample .json file\n", + "\n", + "```json\n", + "{\"audio_filepath\": \"/path/to/audio1.wav\", \"text\": \"the transcription\", \"duration\": 0.82}\n", + "{\"audio_filepath\": \"/path/to/audio2.wav\", \"text\": \"the other transcription\", \"duration\": 2.1}\n", + "...\n", + "```\n", + "Please note that the duration is in seconds.\n", + "\n", + "\n", + "Then you are ready to run your training script:\n", + "```bash\n", + "python tacotron2.py train_dataset=YOUR_TRAIN.json validation_datasets=YOUR_VAL.json trainer.devices=-1\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Taco2.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } \ No newline at end of file
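For reference, the per-line JSON manifest described in the tutorial's Training Data section can be produced with a short script. The sketch below is not part of the patch; the directory layout and the one-transcript-file-per-wav convention are assumptions, so adapt the paths to your own data.

import glob
import json
import os

import soundfile as sf  # used only to read the audio length

manifest_lines = []
for wav_path in sorted(glob.glob("my_dataset/wavs/*.wav")):  # hypothetical layout
    # Assume the transcript for audio1.wav lives next to it as audio1.txt.
    txt_path = os.path.splitext(wav_path)[0] + ".txt"
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read().strip()
    audio, sr = sf.read(wav_path)
    duration = len(audio) / sr  # NeMo expects the duration in seconds
    manifest_lines.append(
        json.dumps({"audio_filepath": os.path.abspath(wav_path), "text": text, "duration": round(duration, 2)})
    )

with open("train_manifest.json", "w", encoding="utf-8") as f:
    f.write("\n".join(manifest_lines) + "\n")

Each output line is a standalone JSON object, matching the manifest format consumed by nemo.collections.tts.torch.data.TTSDataset.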