diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index 47ec92337eb8..10f94d576f06 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -671,9 +671,16 @@ The most important component at the top level is the ``strategy``. It can take o decoding: strategy: "greedy_batch" + # preserve decoding alignments + preserve_alignments: false + + # Overrides the fused batch size after training. + # Setting it to -1 will process whole batch at once when combined with `greedy_batch` decoding strategy + fused_batch_size: Optional[int] = -1 + # greedy strategy config greedy: - max_symbols: 30 + max_symbols: 10 # beam strategy config beam: diff --git a/docs/source/conf.py b/docs/source/conf.py index 9e6b754ad59f..51c0878e7276 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -113,8 +113,8 @@ bibtex_bibfiles = [ 'asr/asr_all.bib', 'nlp/nlp_all.bib', + 'nlp/text_normalization/tn_itn_all.bib', 'tools/tools_all.bib', - 'nemo_text_processing/textprocessing_all.bib', 'tts_all.bib', ] diff --git a/docs/source/index.rst b/docs/source/index.rst index f4bc317fed48..2fcc9daede27 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,13 +31,14 @@ NVIDIA NeMo User Guide asr/speaker_diarization/intro .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Natural Language Processing :name: Natural Language Processing - - nlp/megatron + nlp/models + nlp/megatron nlp/api + nlp/text_normalization/intro .. toctree:: :maxdepth: 2 @@ -55,12 +56,6 @@ NVIDIA NeMo User Guide common/intro -.. toctree:: - :maxdepth: 2 - :caption: Text Processing - :name: Text Processing - - nemo_text_processing/intro .. toctree:: :maxdepth: 2 diff --git a/docs/source/nemo_text_processing/intro.rst b/docs/source/nemo_text_processing/intro.rst deleted file mode 100644 index 1e631ced44a9..000000000000 --- a/docs/source/nemo_text_processing/intro.rst +++ /dev/null @@ -1,17 +0,0 @@ -Text Processing -=============== - -`nemo_text_processing` is a python package that is installed with the `nemo_toolkit`. - -See :doc:`NeMo Introduction <../starthere/intro>` for installation details. - -Additional requirements can be found in `setup.sh `_. - -.. toctree:: - :maxdepth: 1 - - text_normalization - inverse_text_normalization - api - - diff --git a/docs/source/nemo_text_processing/text_normalization.rst b/docs/source/nemo_text_processing/text_normalization.rst deleted file mode 100644 index 12acf3cb3c7c..000000000000 --- a/docs/source/nemo_text_processing/text_normalization.rst +++ /dev/null @@ -1,78 +0,0 @@ -Text Normalization -================== - -NeMo Text Normalization converts text from written form into its verbalized form. It is used as a preprocessing step before Text to Speech (TTS). It could also be used for preprocessing Automatic Speech Recognition (ASR) training transcripts. - - -For example, -`"at 10:00"` -> `"at ten o'clock"` -and `"it weighs 10kg."` -> `"it weights ten kilograms ."`. - - -NeMo Text Normalization :cite:`textprocessing-norm-zhang2021nemo` is based on WFST-grammars :cite:`textprocessing-norm-Mohri2009`. We also provide a deployment route to C++ using `Sparrowhawk `_ :cite:`textprocessing-norm-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-norm-ebden2015kestrel`. -See :doc:`Text Procesing Deployment <../tools/text_processing_deployment>` for details. - - -.. note:: - - For more details, see the tutorial `NeMo/tutorials/text_processing/Text_Normalization.ipynb `__ in `Google's Colab `_. 
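Returning to the first hunk in this diff (docs/source/asr/configs.rst): it documents the new `fused_batch_size` option and lowers the default `max_symbols` for greedy RNNT decoding. A minimal sketch of applying such a decoding config at inference time follows; the model name is illustrative, and it assumes `change_decoding_strategy` merges a partial config with the dataclass defaults, as current NeMo RNNT models do.

.. code-block:: python

    from omegaconf import OmegaConf
    import nemo.collections.asr as nemo_asr

    # any RNNT/transducer checkpoint works the same way; this name is only an example
    model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_transducer_large")

    # override only the fields documented above; greedy_batch with fused_batch_size=-1
    # decodes the whole batch in one pass
    decoding_cfg = OmegaConf.create(
        {"strategy": "greedy_batch", "fused_batch_size": -1, "greedy": {"max_symbols": 10}}
    )
    model.change_decoding_strategy(decoding_cfg)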
- - - - - - -Classes ----------------------------------- - - -The base class for every grammar is :class:`GraphFst`. -This tool is designed as a two-stage application: 1. `classification` of the input into semiotic tokens and 2. `verbalization` into written form. -For every stage and every semiotic token class there is a corresponding grammar, e.g. :class:`taggers.CardinalFst` -and :class:`verbalizers.CardinalFst`. -Together, they compose the final grammars :class:`ClassifyFst` and -:class:`VerbalizeFinalFst` that are compiled into WFST and used for inference. - - - - -.. autoclass:: nemo_text_processing.text_normalization.en.ClassifyFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.text_normalization.en.VerbalizeFinalFst - :show-inheritance: - :members: - - -Prediction ----------------------------------- - -Example prediction run: - -.. code:: - - python run_prediction.py <--input INPUT_TEXT_FILE> <--output OUTPUT_PATH> <--language LANGUAGE> [--input_case INPUT_CASE] - -``INPUT_CASE`` specifies whether to treat the input as lower-cased or case sensitive. By default treat the input as cased since this is more informative, especially for abbreviations. Punctuation are outputted with separating spaces after semiotic tokens, e.g. `"I see, it is 10:00..."` -> `"I see, it is ten o'clock . . ."`. -Inner-sentence white-space characters in the input are not maintained. - - -Evaluation ----------------------------------- - -Example evaluation run on `Google's text normalization dataset `__ :cite:`textprocessing-norm-sproat2016rnn`: - -.. code:: - - python run_evaluation.py --input=./en_with_types/output-00001-of-00100 --language=en [--cat CLASS_CATEGORY] [--input_case INPUT_CASE] - - - -References ----------- - -.. bibliography:: textprocessing_all.bib - :style: plain - :labelprefix: TEXTPROCESSING-NORM - :keyprefix: textprocessing-norm- \ No newline at end of file diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index 6d921a12de86..0e5d8cf4881e 100755 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -21,4 +21,3 @@ NeMo's NLP collection supports provides the following task-specific models: entity_linking nlp_model machine_translation - text_normalization diff --git a/docs/source/nlp/text_normalization/intro.rst b/docs/source/nlp/text_normalization/intro.rst new file mode 100644 index 000000000000..5cb35408d849 --- /dev/null +++ b/docs/source/nlp/text_normalization/intro.rst @@ -0,0 +1,21 @@ +(Inverse) Text Normalization +============================ + +NeMo supports Text Normalization (TN) and Inverse Text Normalization (ITN) tasks via rule-based `nemo_text_processing` python package and Neural-based TN/ITN model. + +Rule-based (WFST) TN/ITN: + +.. toctree:: + :maxdepth: 1 + + wfst/intro + + +Neural TN/ITN: + +.. toctree:: + :maxdepth: 1 + + nn_text_normalization + + diff --git a/docs/source/nlp/text_normalization.rst b/docs/source/nlp/text_normalization/nn_text_normalization.rst similarity index 99% rename from docs/source/nlp/text_normalization.rst rename to docs/source/nlp/text_normalization/nn_text_normalization.rst index 6557cc3ae8e1..ecce7ffbb8f1 100644 --- a/docs/source/nlp/text_normalization.rst +++ b/docs/source/nlp/text_normalization/nn_text_normalization.rst @@ -1,7 +1,7 @@ -.. _text_normalization: +.. 
_nn_text_normalization: -Text Normalization Models -========================== +Neural Text Normalization Models +================================ Text normalization is the task of converting a written text into its spoken form. For example, ``$123`` should be verbalized as ``one hundred twenty three dollars``, while ``123 King Ave`` should be verbalized as ``one twenty three King Avenue``. At the same time, the inverse problem @@ -279,7 +279,7 @@ The argument ``data.train_ds.decoder_data_augmentation`` in the config file cont References ---------- -.. bibliography:: nlp_all.bib +.. bibliography:: tn_itn_all.bib :style: plain :labelprefix: NLP-TEXTNORM :keyprefix: nlp-textnorm- diff --git a/docs/source/nlp/text_normalization/tn_itn_all.bib b/docs/source/nlp/text_normalization/tn_itn_all.bib new file mode 100644 index 000000000000..1244e4ed176f --- /dev/null +++ b/docs/source/nlp/text_normalization/tn_itn_all.bib @@ -0,0 +1,56 @@ +@article{ebden2015kestrel, + title={The Kestrel TTS text normalization system}, + author={Ebden, Peter and Sproat, Richard}, + journal={Natural Language Engineering}, + volume={21}, + number={3}, + pages={333}, + year={2015}, + publisher={Cambridge University Press} +} + +@article{sproat2016rnn, + title={RNN approaches to text normalization: A challenge}, + author={Sproat, Richard and Jaitly, Navdeep}, + journal={arXiv preprint arXiv:1611.00068}, + year={2016} +} + +@book{taylor2009text, + title={Text-to-speech synthesis}, + author={Taylor, Paul}, + year={2009}, + publisher={Cambridge university press} +} + +@misc{zhang2021nemo, + title={NeMo Inverse Text Normalization: From Development To Production}, + author={Yang Zhang and Evelina Bakhturina and Kyle Gorman and Boris Ginsburg}, + year={2021}, + eprint={2104.05055}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} + +@inproceedings{sparrowhawk, + title = {TTS for Low Resource Languages: A Bangla Synthesizer}, + author = {Alexander Gutkin and Linne Ha and Martin Jansche and Knot Pipatsrisawat and Richard Sproat}, + booktitle = {10th Language Resources and Evaluation Conference}, + year = {2016}, +} + +@article{mohri2005weighted, + title={Weighted automata in text and speech processing}, + author={Mohri, Mehryar and Pereira, Fernando and Riley, Michael}, + journal={arXiv preprint cs/0503077}, + year={2005} +} + +@incollection{mohri2009weighted, + title={Weighted automata algorithms}, + author={Mohri, Mehryar}, + booktitle={Handbook of weighted automata}, + pages={213--254}, + year={2009}, + publisher={Springer} +} \ No newline at end of file diff --git a/docs/source/nlp/text_normalization/wfst/intro.rst b/docs/source/nlp/text_normalization/wfst/intro.rst new file mode 100644 index 000000000000..f79c576dceec --- /dev/null +++ b/docs/source/nlp/text_normalization/wfst/intro.rst @@ -0,0 +1,22 @@ +WFST-based (Inverse) Text Normalization +======================================= + +NeMo supports Text Normalization (TN) and Inverse Text Normalization (ITN) tasks via rule-based `nemo_text_processing` python package and Neural-based TN/ITN model. + +`nemo_text_processing` that is installed with the `nemo_toolkit`, see :doc:`NeMo Introduction <../starthere/intro>` for installation details. +Additional requirements can be found in `setup.sh `_. + +Tutorials on how to get started with WFST-based NeMo text normalization could be found `tutorials/text_processing `_. + +Rule-based (WFST) TN/ITN: + +.. 
toctree:: + :maxdepth: 2 + + wfst_text_normalization + wfst_inverse_text_normalization + wfst_text_processing_deployment + wfst_api + + + diff --git a/docs/source/nemo_text_processing/api.rst b/docs/source/nlp/text_normalization/wfst/wfst_api.rst similarity index 98% rename from docs/source/nemo_text_processing/api.rst rename to docs/source/nlp/text_normalization/wfst/wfst_api.rst index d68f7b05fa57..bd6cfd9cedcc 100755 --- a/docs/source/nemo_text_processing/api.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_api.rst @@ -1,3 +1,5 @@ +.. _wfst_api: + NeMo Text Processing API ======================== diff --git a/docs/source/nemo_text_processing/inverse_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst similarity index 76% rename from docs/source/nemo_text_processing/inverse_text_normalization.rst rename to docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst index d8eacb3b1698..27124ef42433 100644 --- a/docs/source/nemo_text_processing/inverse_text_normalization.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst @@ -1,12 +1,29 @@ +.. _wfst_itn: + Inverse Text Normalization ========================== Inverse text normalization (ITN) is a part of the Automatic Speech Recognition (ASR) post-processing pipeline. ITN is the task of converting the raw spoken output of the ASR model into its written form to improve text readability. -For example, -`"in nineteen seventy"` -> `"in 1975"` -and `"it costs one hundred and twenty three dollars"` -> `"it costs $123"`. +Quick Start Guide +----------------- + +.. code-block:: python + + # import WFST-based ITN module + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + + # initialize inverse normalizer + inverse_normalizer = InverseNormalizer(lang="en") + + # try normalizer on a few examples + print(inverse_normalizer.normalize("it costs one hundred and twenty three dollars")) + # >>>"it costs $123" + + print(inverse_normalizer.normalize("in nineteen seventy")) + # >>> "in 1970" + NeMo ITN :cite:`textprocessing-itn-zhang2021nemo` is based on WFST-grammars :cite:`textprocessing-itn-Mohri2009`. We also provide a deployment route to C++ using `Sparrowhawk `_ :cite:`textprocessing-itn-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-itn-ebden2015kestrel`. See :doc:`Text Procesing Deployment <../tools/text_processing_deployment>` for details. @@ -17,11 +34,8 @@ See :doc:`Text Procesing Deployment <../tools/text_processing_deployment>` for d - - - Classes ----------------------------------- +-------- The base class for every grammar is :class:`GraphFst`. @@ -75,13 +89,25 @@ Example evaluation run on (cleaned) `Google's text normalization dataset [--cat CLASS_CATEGORY] [--filter] +Supported Languages +------------------- + +ITN supports: English, Spanish, German, French, Vietnamese, and Russian languages. + +Installation +------------ + +`nemo_text_processing` is installed with the `nemo_toolkit`. + +See :doc:`NeMo Introduction <../starthere/intro>` for installation details. +Additional requirements can be found in `setup.sh `_. References ---------- -.. bibliography:: textprocessing_all.bib +.. 
bibliography:: ../tn_itn_all.bib :style: plain :labelprefix: TEXTPROCESSING-ITN :keyprefix: textprocessing-itn- \ No newline at end of file diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst new file mode 100644 index 000000000000..c5911ed278a2 --- /dev/null +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst @@ -0,0 +1,184 @@ +.. _wfst_tn: + +Text Normalization +================== + +NeMo Text Normalization converts text from written form into its verbalized form. It is used as a preprocessing step before Text to Speech (TTS). It could also be used for preprocessing Automatic Speech Recognition (ASR) training transcripts. + + +Quick Start Guide +----------------- + +.. code-block:: python + + # import WFST-based TN module + from nemo_text_processing.text_normalization.normalize import Normalizer + + # initialize normalizer + normalizer = Normalizer(input_case="cased", lang="en") + + # try normalizer on a few examples + print(normalizer.normalize("123")) + # >>> one hundred twenty three + print(normalizer.normalize_list(["at 10:00", "it weights 10kg."], punct_post_process=True)) + # >>> ["at ten o'clock", 'it weights ten kilograms.'] + + + +NeMo Text Normalization :cite:`textprocessing-norm-zhang2021nemo` is based on WFST-grammars :cite:`textprocessing-norm-mohri2005weighted` and :cite:`textprocessing-norm-mohri2009weighted`. \ +We also provide a deployment route to C++ using `Sparrowhawk `_ :cite:`textprocessing-norm-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-norm-ebden2015kestrel`. +See :doc:`Text Procesing Deployment ` for details. + + +.. note:: + + For more details, see the tutorial `NeMo/tutorials/text_processing/Text_Normalization.ipynb `__ in `Google's Colab `_. + + +Classes +------- + + +The base class for every grammar is :class:`GraphFst`. +This tool is designed as a two-stage application: 1. `classification` of the input into semiotic tokens and 2. `verbalization` into written form. +For every stage and every semiotic token class there is a corresponding grammar, e.g. :class:`taggers.CardinalFst` +and :class:`verbalizers.CardinalFst`. +Together, they compose the final grammars :class:`ClassifyFst` and +:class:`VerbalizeFinalFst` that are compiled into WFST and used for inference. + + + +.. autoclass:: nemo_text_processing.text_normalization.en.ClassifyFst + :show-inheritance: + :members: + +.. autoclass:: nemo_text_processing.text_normalization.en.VerbalizeFinalFst + :show-inheritance: + :members: + + +Prediction +---------- + +Example prediction run: + +.. code:: + + python run_prediction.py <--input INPUT_TEXT_FILE> <--output OUTPUT_PATH> <--language LANGUAGE> [--input_case INPUT_CASE] + +``INPUT_CASE`` specifies whether to treat the input as lower-cased or case sensitive. By default treat the input as cased since this is more informative, especially for abbreviations. Punctuation are outputted with separating spaces after semiotic tokens, e.g. `"I see, it is 10:00..."` -> `"I see, it is ten o'clock . . ."`. +Inner-sentence white-space characters in the input are not maintained. + + +Evaluation +---------- + +Example evaluation run on `Google's text normalization dataset `__ :cite:`textprocessing-norm-sproat2016rnn`: + +.. 
code::
+
+    python run_evaluation.py --input=./en_with_types/output-00001-of-00100 --language=en [--cat CLASS_CATEGORY] [--input_case INPUT_CASE]
+
+
+Audio-based Text Normalization
+==============================
+
+Quick Start Guide
+-----------------
+
+To normalize text that has a corresponding audio recording, it is recommended to use the `nemo_text_processing/text_normalization/normalize_with_audio.py `__ script \
+that provides multiple normalization options and chooses the one that minimizes the character error rate (CER) of the automatic speech recognition (ASR) output.
+The main difference between the default normalization and the audio-based one is that most of the semiotic classes use the deterministic=False flag.
+
+.. code-block:: python
+
+    # import WFST-based non-deterministic TN module
+    from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
+
+    # initialize normalizer
+    normalizer = NormalizerWithAudio(
+        lang="en",
+        input_case="cased",
+        overwrite_cache=False,
+        cache_dir="cache_dir",
+    )
+    # try normalizer on a few examples
+    print(normalizer.normalize("123", n_tagged=10, punct_post_process=True))
+    # >>> {'one hundred twenty three', 'one hundred and twenty three', 'one twenty three', 'one two three'}
+
+
+To run this script with a .json manifest file, the manifest file should contain the following fields
+(more details can be found in `nemo_text_processing/text_normalization/normalize_with_audio.py `__):
+
+.. list-table:: Parameters to run audio-based normalization
+   :widths: 10 10
+   :header-rows: 1
+
+   * - **Parameter**
+     - **Description**
+   * - **audio_data**
+     - path to the audio file
+   * - **text**
+     - raw text
+   * - **pred_text**
+     - ASR model prediction
+   * - **n_tagged**
+     - number of tagged options to return; -1 returns all possible tagged options
+
+
+See `examples/asr/transcribe_speech.py `__ for an example of how to add ASR predictions.
+
+When the manifest is ready, run:
+
+.. code-block:: bash
+
+    python normalize_with_audio.py \
+        --audio_data PATH/TO/MANIFEST.JSON \
+        --language en
+
+
+To run with a single audio file, specify the path to the audio and the text with:
+
+.. code-block:: bash
+
+    python normalize_with_audio.py \
+        --audio_data PATH/TO/AUDIO.WAV \
+        --language en \
+        --text "RAW TEXT OR PATH/TO/.TXT/FILE" \
+        --model QuartzNet15x5Base-En \
+        --verbose
+
+To see possible normalization options for a text input without an audio file (useful for debugging), run:
+
+.. code-block:: bash
+
+    python normalize_with_audio.py --text "RAW TEXT" --cache_dir ""
+
+Specify `--cache_dir` to generate .far grammars once and re-use them for faster inference.
+
+See `nemo_text_processing/text_normalization/normalize_with_audio.py `__ for more arguments.
+
+
+Supported Languages
+-------------------
+
+Deterministic TN supports: English, German, and Spanish.
+Non-deterministic (audio-based) TN supports: English, German, Spanish, and Russian.
+
+Installation
+------------
+
+`nemo_text_processing` is installed with the `nemo_toolkit`.
+
+See :doc:`NeMo Introduction <../starthere/intro>` for installation details.
+
+Additional requirements can be found in `setup.sh `_.
+
+References
+----------
+
+..
bibliography:: ../tn_itn_all.bib + :style: plain + :labelprefix: TEXTPROCESSING-NORM + :keyprefix: textprocessing-norm- \ No newline at end of file diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst new file mode 100644 index 000000000000..9927ca2cd32a --- /dev/null +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst @@ -0,0 +1,86 @@ +.. _wfst_deployment: + +NeMo Text Processing Deployment +=============================== + +NeMo provides a tool for deployment of :doc:`NeMo Inverse Text Normalization (ITN) ` and :doc:`NeMo Text Normalization (TN) ` for production :cite:`textprocessing-deployment-zhang2021nemo`. +It uses `Sparrowhawk `_ :cite:`textprocessing-deployment-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-deployment-ebden2015kestrel`. +The scripts for deployment could be found at `NeMo/tools/text_processing_deployment `_. + +Requirements +------------ + +:doc:`nemo_text_processing ` package + + +Usage +----- + +Starts docker container with production backend with plugged in grammars. This is entry point script. + +Arguments: +^^^^^^^^^ +* ``GRAMMARS`` - ``tn_grammars`` or ``itn_grammars`` to export either TN or ITN grammars from :doc:`WFST ITN ` or :doc:`WFST TN `. +* ``LANGUAGE`` - `en` for English +* ``INPUT_CASE`` - ``cased`` or ``lower_cased`` (lower_cased is supported only in TN grammars). +* ``MODE`` - choose ``test`` to run test on the grammars inside the container. + +For example: + + +.. code-block:: bash + + # to export ITN grammars + cd NeMo/tools/text_processing_deployment + bash export_grammar.sh --GRAMMARS=itn_grammars --LANGUAGE=en + + # to export and test TN grammars + bash export_grammar.sh --GRAMMARS=itn_grammars --INPUT_CASE=cased --MODE=test --LANGUAGE=en + +This script runs the following steps in sequence: + +Exports grammar `ClassifyFst` and `VerbalizeFst` from :doc:`nemo_text_processing ` to `OUTPUT_DIR/classify/tokenize_and_classify.far` and `OUTPUT_DIR/verbalize/verbalize.far` respectively. + +.. code-block:: bash + + cd NeMo/tools/text_processing_deployment + python pynini_export.py <--output_dir OUTPUT_DIR> <--grammars GRAMMARS> <--input_case INPUT_CASE> <--language LANGUAGE> + +Builds C++ production backend docker + +.. code-block:: bash + + cd NeMo/tools/text_processing_deployment + bash docker/build.sh + + +Plugs in grammars into production backend by mounting grammar directory `classify/` and `verbalize/` with sparrowhawk grammar directory inside docker. Returns docker prompt + +.. code-block:: bash + + cd NeMo/tools/text_processing_deployment + # to launch container with the exported grammars + bash docker/launch.sh + + # to launch container with the exported grammars and run tests on TN grammars + bash docker/launch.sh test_tn_grammars + + # to launch container with the exported grammars and run tests on ITN grammars + bash docker/launch.sh test_itn_grammars + + +Runs TN or ITN in docker container: + +.. code-block:: bash + + echo "two dollars fifty" | ../../src/bin/normalizer_main --config=sparrowhawk_configuration.ascii_proto + +This returns $2.50 for ITN. + +References +---------- + +.. 
bibliography:: ../tn_itn_all.bib + :style: plain + :labelprefix: TEXTPROCESSING-DEPLOYMENT + :keyprefix: textprocessing-deployment- \ No newline at end of file diff --git a/docs/source/tools/intro.rst b/docs/source/tools/intro.rst index ff7a61fcb172..5cf8032fb6f9 100644 --- a/docs/source/tools/intro.rst +++ b/docs/source/tools/intro.rst @@ -9,6 +9,5 @@ NeMo provides a set of tools useful for developing Automatic Speech Recognitions ctc_segmentation speech_data_explorer - text_processing_deployment diff --git a/docs/source/tools/tools_all.bib b/docs/source/tools/tools_all.bib index 4b86ee2b7417..5b61eb6d35cc 100644 --- a/docs/source/tools/tools_all.bib +++ b/docs/source/tools/tools_all.bib @@ -5,45 +5,4 @@ @inproceedings{kurzinger2020ctc pages={267--278}, year={2020}, organization={Springer} -} - -@article{ebden2015kestrel, - title={The Kestrel TTS text normalization system}, - author={Ebden, Peter and Sproat, Richard}, - journal={Natural Language Engineering}, - volume={21}, - number={3}, - pages={333}, - year={2015}, - publisher={Cambridge University Press} -} - -@article{sproat2016rnn, - title={RNN approaches to text normalization: A challenge}, - author={Sproat, Richard and Jaitly, Navdeep}, - journal={arXiv preprint arXiv:1611.00068}, - year={2016} -} - -@book{taylor2009text, - title={Text-to-speech synthesis}, - author={Taylor, Paul}, - year={2009}, - publisher={Cambridge university press} -} - -@misc{zhang2021nemo, - title={NeMo Inverse Text Normalization: From Development To Production}, - author={Yang Zhang and Evelina Bakhturina and Kyle Gorman and Boris Ginsburg}, - year={2021}, - eprint={2104.05055}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@inproceedings{sparrowhawk, - title = {TTS for Low Resource Languages: A Bangla Synthesizer}, - author = {Alexander Gutkin and Linne Ha and Martin Jansche and Knot Pipatsrisawat and Richard Sproat}, - booktitle = {10th Language Resources and Evaluation Conference}, - year = {2016}, } \ No newline at end of file diff --git a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml index a31781763954..1ab2d35a6739 100644 --- a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml +++ b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml @@ -150,7 +150,7 @@ model: min_lr: 1e-6 trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs + devices: 1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 max_steps: null # computed at runtime if not set @@ -167,7 +167,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml index 6c74fc6036a0..00cbf7d1260d 100644 --- a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml +++ b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml @@ -415,7 +415,7 @@ trainer: accelerator: gpu strategy: ddp accumulate_grad_batches: 1 - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager log_every_n_steps: 100 # Interval of logging. 
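The trainer-config hunks in this and the following YAML files all replace the removed PyTorch Lightning `checkpoint_callback` flag with `enable_checkpointing`. A minimal sketch of the equivalent programmatic Trainer construction (values taken from the SSL configs above; checkpointing and logging stay under NeMo's exp_manager):

.. code-block:: python

    from pytorch_lightning import Trainer

    # `checkpoint_callback` was removed in recent PyTorch Lightning releases;
    # `enable_checkpointing=False` keeps checkpointing delegated to exp_manager instead
    trainer = Trainer(
        devices=1,
        accelerator="gpu",
        max_epochs=1000,
        enable_checkpointing=False,
        logger=False,
        log_every_n_steps=100,
    )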
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations diff --git a/examples/asr/conf/wav2vec/wav2vecCTC.yaml b/examples/asr/conf/wav2vec/wav2vecCTC.yaml index e2ced250cb83..89d97aa2e5e1 100644 --- a/examples/asr/conf/wav2vec/wav2vecCTC.yaml +++ b/examples/asr/conf/wav2vec/wav2vecCTC.yaml @@ -131,7 +131,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/conf/wav2vec/wav2vecCTC_large.yaml b/examples/asr/conf/wav2vec/wav2vecCTC_large.yaml index a404dacee025..911c466aa137 100644 --- a/examples/asr/conf/wav2vec/wav2vecCTC_large.yaml +++ b/examples/asr/conf/wav2vec/wav2vecCTC_large.yaml @@ -131,7 +131,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/conf/wav2vec/wav2vec_pretrain.yaml b/examples/asr/conf/wav2vec/wav2vec_pretrain.yaml index cae47e874517..836294fbeef2 100644 --- a/examples/asr/conf/wav2vec/wav2vec_pretrain.yaml +++ b/examples/asr/conf/wav2vec/wav2vec_pretrain.yaml @@ -131,7 +131,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/conf/wav2vec/wav2vec_pretrain_large.yaml b/examples/asr/conf/wav2vec/wav2vec_pretrain_large.yaml index 1081fcc96831..c1d74cf4d29d 100644 --- a/examples/asr/conf/wav2vec/wav2vec_pretrain_large.yaml +++ b/examples/asr/conf/wav2vec/wav2vec_pretrain_large.yaml @@ -132,7 +132,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/experimental/wav2vec/configs/wav2vecCTC.yaml b/examples/asr/experimental/wav2vec/configs/wav2vecCTC.yaml index 072e0e66e376..09a4ddc4a51c 100644 --- a/examples/asr/experimental/wav2vec/configs/wav2vecCTC.yaml +++ b/examples/asr/experimental/wav2vec/configs/wav2vecCTC.yaml @@ -131,7 +131,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs 
sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/asr/experimental/wav2vec/configs/wav2vecCTC_large.yaml b/examples/asr/experimental/wav2vec/configs/wav2vecCTC_large.yaml index c6c6ac893f29..f6121bc369f8 100644 --- a/examples/asr/experimental/wav2vec/configs/wav2vecCTC_large.yaml +++ b/examples/asr/experimental/wav2vec/configs/wav2vecCTC_large.yaml @@ -129,7 +129,7 @@ trainer: num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs sync_batchnorm: true - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager exp_manager: diff --git a/examples/nlp/dialogue_state_tracking_generative/conf/dialogue_config.yaml b/examples/nlp/dialogue_state_tracking_generative/conf/dialogue_config.yaml index ca794d0a926c..7e41477e58f5 100644 --- a/examples/nlp/dialogue_state_tracking_generative/conf/dialogue_config.yaml +++ b/examples/nlp/dialogue_state_tracking_generative/conf/dialogue_config.yaml @@ -28,7 +28,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - checkpoint_callback: False # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: False # Provided by exp_manager model: diff --git a/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py b/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py index 692a39daa4ad..7306a7517090 100644 --- a/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py +++ b/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py @@ -89,9 +89,13 @@ def main(cfg: DictConfig) -> None: if lang == constants.ENGLISH: new_lines = normalizer_electronic.normalize_list(lines) - lines = [post_process_punct(input=lines[idx], normalized_text=new_lines[idx]) for idx in range(lines)] + lines = [ + post_process_punct(input=input_, normalized_text=norm_) for input_, norm_ in zip(lines, new_lines) + ] new_lines = normalizer_whitelist.normalize_list(lines) - lines = [post_process_punct(input=lines[idx], normalized_text=new_lines[idx]) for idx in range(lines)] + lines = [ + post_process_punct(input=input_, normalized_text=norm_) for input_, norm_ in zip(lines, new_lines) + ] def _get_predictions(lines: List[str], mode: str, batch_size: int, text_file: str): """ Runs inference on a batch data without labels and saved predictions to a file. 
""" diff --git a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py index 82bd11fcdf99..204708bef9a9 100644 --- a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py +++ b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py @@ -47,7 +47,7 @@ def __init__( from nn_wfst.en.electronic.tokenize_and_classify import ClassifyFst from nn_wfst.en.electronic.verbalize_final import VerbalizeFinalFst - self.tagger = self.tagger = ClassifyFst( + self.tagger = ClassifyFst( input_case=input_case, deterministic=deterministic, cache_dir=cache_dir, overwrite_cache=overwrite_cache ) self.verbalizer = VerbalizeFinalFst(deterministic=deterministic) diff --git a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py index 1dba5235987c..b1a9ac285fbd 100644 --- a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py +++ b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py @@ -16,6 +16,7 @@ import os from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_WHITE_SPACE, GraphFst, delete_extra_space, delete_space, @@ -71,14 +72,30 @@ def __init__( classify = pynutil.add_weight(electonic_graph, 1.1) | pynutil.add_weight(word_graph, 100) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") + punct = pynini.closure( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + | (pynutil.insert(" ") + punct), + 1, + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = ( + token_plus_punct + + pynini.closure( + ( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + | (pynutil.insert(" ") + punct + pynutil.insert(" ")) + ) + + token_plus_punct + ).optimize() + ) + graph = delete_space + graph + delete_space + graph |= punct self.fst = graph.optimize() diff --git a/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py b/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py index 33a97f80dc30..9249ff315440 100644 --- a/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py +++ b/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py @@ -16,6 +16,7 @@ import os from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_WHITE_SPACE, GraphFst, delete_extra_space, delete_space, @@ -80,14 +81,30 @@ def __init__( classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") + punct = pynini.closure( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + | (pynutil.insert(" ") + punct), + 1, + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") 
token_plus_punct = ( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = ( + token_plus_punct + + pynini.closure( + ( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + | (pynutil.insert(" ") + punct + pynutil.insert(" ")) + ) + + token_plus_punct + ).optimize() + ) + graph = delete_space + graph + delete_space + graph |= punct self.fst = graph.optimize() diff --git a/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml b/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml index 5b2dbcaf544e..203d2c4f59dd 100755 --- a/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml +++ b/examples/nlp/language_modeling/conf/megatron_prompt_tuning_gpt.yaml @@ -7,7 +7,7 @@ trainer: accelerator: gpu precision: 32 logger: False # logger provided by exp_manager - checkpoint_callback: False + enable_checkpointing: False replace_sampler_ddp: False max_epochs: null max_steps: 1000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches diff --git a/examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml b/examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml index 016f4ab1b2fe..76ed42dbecff 100644 --- a/examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml +++ b/examples/nlp/language_modeling/conf/megatron_ptune_gpt.yaml @@ -6,7 +6,7 @@ trainer: num_nodes: 1 precision: 16 logger: False # logger provided by exp_manager - checkpoint_callback: False + enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml new file mode 100644 index 000000000000..b80a9c2b1d9d --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml @@ -0,0 +1,35 @@ +name: megatron_t5_glue_eval + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + strategy: ddp + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + log_every_n_steps: 10 + + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_t5_glue_eval + create_checkpoint_callback: False + +model: + restore_from_finetuned_path: ??? # Path to a finetuned T5 .nemo file + tensor_model_parallel_size: 1 + gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + megatron_amp_O2: False # Enable O2 optimization for megatron amp + + data: + validation_ds: + task_name: 'mnli' + file_path: ??? 
# Path to the TSV file for MNLI dev ex: '/raid/Data/GLUE/MNLI/dev_matched.tsv' + batch_size: 32 + shuffle: False + num_workers: 4 + pin_memory: True + max_seq_length: 512 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index baabe3637250..5dd880060ec6 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -6,7 +6,7 @@ trainer: accelerator: gpu precision: 16 logger: False # logger provided by exp_manager - checkpoint_callback: False + enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml index 21ecbeb1243d..abd26028ca4a 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml @@ -6,7 +6,7 @@ trainer: accelerator: gpu precision: 16 logger: False # logger provided by exp_manager - checkpoint_callback: False + enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 126878e1c712..4779f46c0d4c 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -105,7 +105,7 @@ def main(): "--pipeline_model_parallel_size", type=int, default=1, required=False, ) parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag") - parser.add_argument("--batch_size", default=1, required=False, help="Evaluation batch_size") + parser.add_argument("--batch_size", default=1, type=int, required=False, help="Evaluation batch_size") parser.add_argument( "--compute_logprobs", type=bool, default=False, required=False, help="Method for logprobs computation" ) @@ -165,7 +165,7 @@ def pad_collate(batch): # defining type of request if args.path_to_file != "": request = [] - prompts = open(args.path_to_file, 'r') + prompts = open(args.path_to_file, 'r', encoding='utf-8') for prompt in prompts.readlines(): prompt = prompt.split('\n')[0] diff --git a/examples/nlp/language_modeling/megatron_t5_glue_eval.py b/examples/nlp/language_modeling/megatron_t5_glue_eval.py new file mode 100644 index 000000000000..07e3954b2e87 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_t5_glue_eval.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
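The new eval config above is consumed by the `megatron_t5_glue_eval.py` script added below through NeMo's `hydra_runner` decorator. A minimal sketch of that pattern (the CLI overrides in the comment are illustrative):

.. code-block:: python

    from omegaconf import DictConfig, OmegaConf
    from nemo.core.config import hydra_runner

    @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval")
    def main(cfg: DictConfig) -> None:
        # Hydra resolves conf/megatron_t5_config_finetune_glue_eval.yaml and applies CLI overrides such as
        # model.restore_from_finetuned_path=/path/to/finetuned.nemo data.validation_ds.file_path=/path/to/dev_matched.tsv
        print(OmegaConf.to_yaml(cfg))

    if __name__ == "__main__":
        main()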
+ +from omegaconf.omegaconf import OmegaConf +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks.timer import Timer +from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin + +from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import StatelessTimer, exp_manager + + +@hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) + plugins = [ + NLPDDPPlugin( + num_nodes=cfg.trainer.num_nodes, + no_ddp_communication_hook=( + megatron_amp_o2 and cfg.trainer.precision == 'bf16' + ), # Only bf16 uses fp32_grad_accum. + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + ] + if cfg.trainer.precision in [16, 'bf16']: + scaler = None + if cfg.trainer.precision == 16: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + ) + if megatron_amp_o2: + plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(NativeMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + trainer = Trainer(plugins=plugins, **cfg.trainer) + exp_manager(trainer, cfg.exp_manager) + + # Override timer callback to a stateless one + for idx, callback in enumerate(trainer.callbacks): + if isinstance(callback, Timer): + trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) + + model = MegatronT5GLUEModel.restore_from(restore_path=cfg.model.restore_from_finetuned_path, trainer=trainer) + model.freeze() + + trainer.validate(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml index f4fa603cd6b9..9f91620bdf0a 100644 --- a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml +++ b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml @@ -27,8 +27,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - - checkpoint_callback: False # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: False # Provided by exp_manager model: diff --git a/examples/tts/conf/univnet/univnet.yaml b/examples/tts/conf/univnet/univnet.yaml index bef3321e85f0..8222ed2165e6 100644 --- a/examples/tts/conf/univnet/univnet.yaml +++ b/examples/tts/conf/univnet/univnet.yaml @@ -82,7 +82,7 @@ trainer: precision: 32 max_steps: ${model.max_steps} accumulate_grad_batches: 1 - checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager log_every_n_steps: 100 check_val_every_n_epoch: 10 diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 17bad25e5b1f..1e1496b97938 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -119,21 +119,21 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: model = PretrainedModelInfo( pretrained_model_name="stt_en_conformer_ctc_small", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.0.0/files/stt_en_conformer_ctc_small.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.6.0/files/stt_en_conformer_ctc_small.nemo", ) results.append(model) model = PretrainedModelInfo( pretrained_model_name="stt_en_conformer_ctc_medium", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.0.0/files/stt_en_conformer_ctc_medium.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.6.0/files/stt_en_conformer_ctc_medium.nemo", ) results.append(model) model = PretrainedModelInfo( pretrained_model_name="stt_en_conformer_ctc_large", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.0.0/files/stt_en_conformer_ctc_large.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.6.0/files/stt_en_conformer_ctc_large.nemo", ) results.append(model) diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index ea2bf26288cd..3c40dc4a81c0 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -64,13 +64,6 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_small", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_small/versions/1.4.0/files/stt_en_conformer_transducer_small.nemo", - ) - results.append(model) - 
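The checkpoint-registry hunks here bump the NGC versions for the Conformer CTC/Transducer checkpoints to 1.6.0 and drop a duplicated `stt_en_conformer_transducer_small` entry. A short sketch of how these registered names are typically consumed (method names as in NeMo's public API; the exact `transcribe` signature may differ by release):

.. code-block:: python

    import nemo.collections.asr as nemo_asr

    # `from_pretrained` resolves the name via `list_available_models()` and downloads the .nemo file from NGC
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_conformer_transducer_small")
    print(asr_model.transcribe(["sample.wav"]))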
model = PretrainedModelInfo( pretrained_model_name="stt_en_contextnet_256_mls", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256_mls", @@ -95,7 +88,7 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: model = PretrainedModelInfo( pretrained_model_name="stt_en_conformer_transducer_small", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_small/versions/1.4.0/files/stt_en_conformer_transducer_small.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_small/versions/1.6.0/files/stt_en_conformer_transducer_small.nemo", ) results.append(model) @@ -109,7 +102,7 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: model = PretrainedModelInfo( pretrained_model_name="stt_en_conformer_transducer_large", description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_large/versions/1.4.0/files/stt_en_conformer_transducer_large.nemo", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_large/versions/1.6.0/files/stt_en_conformer_transducer_large.nemo", ) results.append(model) diff --git a/nemo/collections/nlp/data/text_normalization/utils.py b/nemo/collections/nlp/data/text_normalization/utils.py index 6f81ed0afcb6..c12392761af8 100644 --- a/nemo/collections/nlp/data/text_normalization/utils.py +++ b/nemo/collections/nlp/data/text_normalization/utils.py @@ -216,8 +216,8 @@ def post_process_punct(input: str, normalized_text: str): punct_default = [x for x in string.punctuation] punct_unicode = [chr(i) for i in range(sys.maxunicode) if category(chr(i)).startswith("P")] punct_marks = set(punct_default + punct_unicode) - try: - for punct in punct_marks: + for punct in punct_marks: + try: equal = True if input.count(punct) != normalized_text.count(punct): equal = False @@ -253,8 +253,8 @@ def _is_valid(idx_out, idx_in, normalized_text, input): normalized_text[idx_out] = normalized_text[idx_out] + " " idx_out += 1 idx_in += 1 - except: - logging.debug(f"Skipping post-processing of {''.join(normalized_text)} for '{punct}'") + except: + logging.debug(f"Skipping post-processing of {''.join(normalized_text)} for '{punct}'") normalized_text = "".join(normalized_text) return re.sub(r' +', ' ', normalized_text) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 6d40b0ef514d..5b43cdb82948 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -155,7 +155,11 @@ def training_step(self, batch, batch_idx): lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr) self.log('global_step', self.trainer.global_step, prog_bar=True) - self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step), prog_bar=True) + self.log( + 'consumed_samples', + self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), + prog_bar=True, + ) self._reduced_loss_buffer = [] self._reduced_lm_loss_buffer = [] self._reduced_sop_loss_buffer = [] @@ -180,7 +184,7 @@ def 
validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): averaged_loss = torch.stack(outputs).mean() self.log('val_loss', averaged_loss, prog_bar=True) - self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step)) + self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step)) def test_step(self, batch, batch_idx): return self.validation_step(batch, batch_idx) @@ -306,6 +310,19 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): ) def setup(self, stage=None): + resume_checkpoint_path = self.trainer.checkpoint_connector.resume_from_checkpoint_fit_path + if resume_checkpoint_path: + try: + init_consumed_samples = int( + float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) + ) + except (ValueError, TypeError): + logging.warning("Cannot parse the checkpoint file to get the consumed samples. assume it is zero.") + init_consumed_samples = 0 + else: + init_consumed_samples = 0 + self.init_consumed_samples = init_consumed_samples + if stage == 'predict': return # TODO: consider adding a ModelPT guard to check if model is being restored. @@ -317,13 +334,7 @@ def setup(self, stage=None): def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): - resume_checkpoint_path = self.trainer.checkpoint_connector.resume_from_checkpoint_fit_path - if resume_checkpoint_path: - consumed_samples = int( - float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) - ) - else: - consumed_samples = 0 + consumed_samples = self.compute_consumed_samples(0) logging.info( f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' ) @@ -345,10 +356,16 @@ def setup_test_data(self, cfg): ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) - def compute_consumed_samples(self, global_step): + def on_pretrain_routine_start(self) -> None: + # keep a copy of init_global_step + self.init_global_step = self.trainer.global_step + return super().on_pretrain_routine_start() + + def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() consumed_samples = ( - global_step + self.init_consumed_samples + + steps_since_resume * app_state.data_parallel_size * self.cfg.micro_batch_size * self.trainer.accumulate_grad_batches diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7ec7d128e8de..e41df68f9366 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -307,7 +307,7 @@ def training_step(self, batch, batch_idx): # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( 'consumed_samples', - self.compute_consumed_samples(self.trainer.global_step), + self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), prog_bar=True, rank_zero_only=True, ) @@ -448,6 +448,11 @@ def id_func(output_tensor): return fwd_output_only_func + def on_pretrain_routine_start(self) -> None: + # keep a copy of init_global_step + self.init_global_step = self.trainer.global_step + return super().on_pretrain_routine_start() + def validation_step(self, batch, batch_idx): """ Our dataloaders produce a micro-batch and then we fetch @@ -505,7 +510,11 @@ def validation_epoch_end(self, outputs): 
torch.distributed.broadcast(averaged_loss, get_last_rank()) self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True) - self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step), rank_zero_only=True) + self.log( + 'consumed_samples', + self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), + rank_zero_only=True, + ) def test_step(self, batch, batch_idx): return self.validation_step(batch, batch_idx) @@ -640,6 +649,19 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ + resume_checkpoint_path = self.trainer.checkpoint_connector.resume_from_checkpoint_fit_path + if resume_checkpoint_path: + try: + init_consumed_samples = int( + float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) + ) + except (ValueError, TypeError): + logging.warning("Cannot parse the checkpoint file to get the consumed samples. assume it is zero.") + init_consumed_samples = 0 + else: + init_consumed_samples = 0 + self.init_consumed_samples = init_consumed_samples + # Initalize soft prompts before loading datasets and training if self.use_soft_prompts: self.init_new_prompts() @@ -669,13 +691,7 @@ def setup_training_data(self, cfg): self.prompt_tuning_param_freeze_and_optimizer_setup() elif hasattr(self, '_train_ds'): - resume_checkpoint_path = self.trainer.checkpoint_connector.resume_from_checkpoint_fit_path - if resume_checkpoint_path: - consumed_samples = int( - float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) - ) - else: - consumed_samples = 0 + consumed_samples = self.compute_consumed_samples(0) logging.info( f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' ) @@ -745,12 +761,11 @@ def configure_optimizers(self): else: return [self._optimizer], [self._scheduler] - def compute_consumed_samples(self, global_step): - # TODO: this should be a counter self.consumed_samples - # and updated after every train_step: self.consumed_samples += global_batch_size + def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() consumed_samples = ( - global_step * app_state.data_parallel_size * self.cfg.micro_batch_size * get_num_microbatches() + self.init_consumed_samples + + steps_since_resume * app_state.data_parallel_size * self.cfg.micro_batch_size * get_num_microbatches() ) return int(consumed_samples) @@ -878,18 +893,19 @@ def complete(self, request: Dict, positions: List, tokens_to_generate: int): """ app_state = AppState() - _reconfigure_microbatch_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=1, - micro_batch_size=1, - data_parallel_size=1, - ) results = [] request_tokens = request["tokens"] for idx, tokens in enumerate(request_tokens): + micro_batch_size = tokens.shape[0] + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=micro_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=1, + ) # For prompt tuned GPT models if self.use_soft_prompts: @@ -928,11 +944,12 @@ def complete(self, request: Dict, positions: List, tokens_to_generate: int): reset_attention_mask=self.cfg.get('reset_attention_mask', False), eod_mask_loss=self.cfg.get('eod_mask_loss', False), ) + attention_mask_repeat = torch.concat([attention_mask for _ in range(micro_batch_size)]) if self.use_soft_prompts: - batch = [tokens, attention_mask, 
position_ids, prompt_ids] + batch = [tokens, attention_mask_repeat, position_ids, prompt_ids] else: - batch = [tokens, attention_mask, position_ids] - tensor_shape = [tokens.shape[1], 1, self.cfg.hidden_size] + batch = [tokens, attention_mask_repeat, position_ids] + tensor_shape = [tokens.shape[1], micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: output_tensor = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_only_func(), @@ -1003,18 +1020,19 @@ def compute_logprobs(self, request: Dict, positions: List): * offsets: list of tokens start positions in text """ app_state = AppState() - _reconfigure_microbatch_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=1, - micro_batch_size=1, - data_parallel_size=1, - ) results = [] request_tokens = request["tokens"] for idx, tokens in enumerate(request_tokens): tokens_cut = tokens[:, :-1] + micro_batch_size = tokens_cut.shape[0] + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=micro_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=1, + ) # For prompt tuned GPT models if self.use_soft_prompts: if self.cfg.get('pipeline_model_parallel_size', 1) > 1: @@ -1048,11 +1066,13 @@ def compute_logprobs(self, request: Dict, positions: List): eod_mask_loss=self.cfg.get('eod_mask_loss', False), ) + # we repeat attention mask to work with apex fwd/bwd function + attention_mask_repeat = torch.concat([attention_mask for _ in range(micro_batch_size)]) if self.use_soft_prompts: - batch = [tokens, attention_mask, position_ids, prompt_ids] + batch = [tokens_cut, attention_mask_repeat, position_ids, prompt_ids] else: - batch = [tokens, attention_mask, position_ids] - tensor_shape = [tokens_cut.shape[1], 1, self.cfg.hidden_size] + batch = [tokens_cut, attention_mask_repeat, position_ids] + tensor_shape = [tokens_cut.shape[1], micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: output_tensor = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_only_func(), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 0bcff177eb22..71e094cac37e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -226,7 +226,11 @@ def training_step(self, batch, batch_idx): lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr) self.log('global_step', self.trainer.global_step, prog_bar=True) - self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step), prog_bar=True) + self.log( + 'consumed_samples', + self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), + prog_bar=True, + ) self._reduced_loss_buffer = [] return loss @@ -244,7 +248,7 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) self.log('val_loss', averaged_loss[0], prog_bar=True) - self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step)) + self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step)) def test_step(self, batch, batch_idx): return 
self.validation_step(batch, batch_idx) @@ -317,6 +321,19 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): ) def setup(self, stage=None): + resume_checkpoint_path = self.trainer.checkpoint_connector.resume_checkpoint_path + if resume_checkpoint_path: + try: + init_consumed_samples = int( + float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) + ) + except (ValueError, TypeError): + logging.warning("Cannot parse the checkpoint file to get the consumed samples. assume it is zero.") + init_consumed_samples = 0 + else: + init_consumed_samples = 0 + self.init_consumed_samples = init_consumed_samples + """A PTL method to setup the training, validation and test datasets.""" if stage == 'predict': return @@ -327,15 +344,14 @@ def setup(self, stage=None): self.setup_validation_data(self._cfg.data) self.setup_test_data(self._cfg.data) + def on_pretrain_routine_start(self) -> None: + # keep a copy of init_global_step + self.init_global_step = self.trainer.global_step + return super().on_pretrain_routine_start() + def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): - resume_checkpoint_path = self.trainer.checkpoint_connector.resume_checkpoint_path - if resume_checkpoint_path: - consumed_samples = int( - float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) - ) - else: - consumed_samples = 0 + consumed_samples = self.compute_consumed_samples(0) self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) def setup_validation_data(self, cfg): @@ -348,10 +364,11 @@ def setup_test_data(self, cfg): consumed_samples = 0 self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) - def compute_consumed_samples(self, global_step): + def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() consumed_samples = ( - global_step + self.init_consumed_samples + + steps_since_resume * app_state.data_parallel_size * self._cfg.micro_batch_size * self.trainer.accumulate_grad_batches diff --git a/nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py index 11bfff58dd67..0880dc24318f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py @@ -87,6 +87,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): num_layers=cfg.prompt_encoder.num_layers, ) + self._reduced_loss_buffer = [] + # load prompt encoder self.hidden_size = hidden_size self.tokenizer.add_special_tokens({'additional_special_tokens': [cfg.pseudo_token]}) @@ -192,16 +194,16 @@ def training_step(self, batch, batch_idx): # Reduced loss for logging. reduced_loss = average_losses_across_data_parallel_group([loss]) # cache reduced loss while accumulating gradients - self.model._reduced_loss_buffer.append(reduced_loss[0]) + self._reduced_loss_buffer.append(reduced_loss[0]) if (batch_idx + 1) % self.trainer.accumulate_grad_batches == 0: # Reduced loss for logging. 
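The p-tuning change in this hunk moves the reduced-loss buffer from the wrapped `self.model` onto the LightningModule itself and initializes it in `__init__`, so the gradient-accumulation bookkeeping no longer depends on an attribute the inner Megatron module may not define. The pattern is simple: cache one data-parallel-averaged loss per micro-batch and flush the buffer as a single logged value once every `accumulate_grad_batches` micro-batches. A minimal sketch under those assumptions, with a plain Python list and `print` standing in for Lightning's `self.log`:

.. code:: python

    from statistics import mean


    class LossAccumulator:
        """Cache per-micro-batch losses; emit one averaged value per optimizer step."""

        def __init__(self, accumulate_grad_batches, log_fn=print):
            self.accumulate_grad_batches = accumulate_grad_batches
            self.log_fn = log_fn
            self._reduced_loss_buffer = []

        def on_micro_batch(self, batch_idx, reduced_loss):
            self._reduced_loss_buffer.append(reduced_loss)
            if (batch_idx + 1) % self.accumulate_grad_batches == 0:
                # one optimizer step has completed: log the average and reset
                self.log_fn("reduced_train_loss", mean(self._reduced_loss_buffer))
                self._reduced_loss_buffer = []


    acc = LossAccumulator(accumulate_grad_batches=4)
    for batch_idx, loss in enumerate([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3]):
        acc.on_micro_batch(batch_idx, loss)  # logs after micro-batches 3 and 7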
- average_reduced_loss = sum(self.model._reduced_loss_buffer) / len(self.model._reduced_loss_buffer) + average_reduced_loss = sum(self._reduced_loss_buffer) / len(self._reduced_loss_buffer) self.log('reduced_train_loss', average_reduced_loss, prog_bar=True) lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr) self.log('global_step', self.trainer.global_step, prog_bar=True) - self.model._reduced_loss_buffer = [] + self._reduced_loss_buffer = [] return loss @@ -242,7 +244,7 @@ def decode(self, enc_query, enc_taskname, label_position, num_tokens_to_generate input_embeds = self.embed_input(predicted_tokens_dec, enc_taskname) - encoder_position_ids = t5_position_ids(predicted_tokens_dec) + encoder_position_ids = build_position_ids(predicted_tokens_dec) position_embeddings = self.model.model.language_model.embedding.position_embeddings( encoder_position_ids ) diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 29d68637d347..84ab147278fd 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import contextlib from typing import Optional import torch @@ -204,8 +204,12 @@ def parse(self, str_input: str, normalize=True) -> torch.tensor: str_input = self.text_normalizer_call(str_input, **self.text_normalizer_call_kwargs) if self.learn_alignment: - # Disable mixed g2p representation - with self.vocab.set_phone_prob(prob=1.0): + eval_phon_mode = contextlib.nullcontext() + if hasattr(self.vocab, "set_phone_prob"): + eval_phon_mode = self.vocab.set_phone_prob(prob=1.0) + + # Disable mixed g2p representation if necessary + with eval_phon_mode: tokens = self.parser(str_input) else: # TODO(Oktai15): remove it in 1.8.0 version @@ -438,7 +442,11 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na if cfg.dataset._target_ == "nemo.collections.asr.data.audio_to_text.FastPitchDataset": dataset = instantiate(cfg.dataset, parser=self.parser) elif cfg.dataset._target_ == "nemo.collections.tts.torch.data.TTSDataset": - with self.vocab.set_phone_prob(prob=None if name == "val" else self.vocab.phoneme_probability): + phon_mode = contextlib.nullcontext() + if hasattr(self.vocab, "set_phone_prob"): + phon_mode = self.vocab.set_phone_prob(prob=None if name == "val" else self.vocab.phoneme_probability) + + with phon_mode: dataset = instantiate( cfg.dataset, text_normalizer=self.normalizer, diff --git a/nemo/collections/tts/models/fastspeech2_hifigan_e2e.py b/nemo/collections/tts/models/fastspeech2_hifigan_e2e.py index 479cb540a5e4..7ca874ce46ca 100644 --- a/nemo/collections/tts/models/fastspeech2_hifigan_e2e.py +++ b/nemo/collections/tts/models/fastspeech2_hifigan_e2e.py @@ -54,7 +54,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): cfg = OmegaConf.create(cfg) super().__init__(cfg=cfg, trainer=trainer) - self.audio_to_melspec_preprocessor = instantiate(cfg.preprocessor) + self.audio_to_melspec_precessor = instantiate(cfg.preprocessor) self.encoder = instantiate(cfg.encoder) self.variance_adapter = instantiate(cfg.variance_adaptor) @@ -184,7 +184,7 @@ def forward(self, *, text, text_length, splice=True, durations=None, pitch=None, def training_step(self, batch, batch_idx, optimizer_idx): f, fl, t, tl, durations, pitch, energies = batch - _, spec_len = 
self.audio_to_melspec_preprocessor(f, fl) + _, spec_len = self.audio_to_melspec_precessor(f, fl) # train discriminator if optimizer_idx == 0: @@ -298,7 +298,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): def validation_step(self, batch, batch_idx): f, fl, t, tl, _, _, _ = batch - spec, spec_len = self.audio_to_melspec_preprocessor(f, fl) + spec, spec_len = self.audio_to_melspec_precessor(f, fl) audio_pred, _, _, _, _, _ = self(spec_len=spec_len, text=t, text_length=tl, splice=False) audio_pred.squeeze_() pred_spec, _ = self.melspec_fn(audio_pred, seq_len=spec_len) diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index ce155ab1a97b..0fcebd6563f3 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import contextlib from typing import List, Optional import numpy as np @@ -648,7 +648,12 @@ def parse(self, text: str, normalize=True) -> torch.Tensor: logging.warning("parse() is meant to be called in eval mode.") if normalize and self.text_normalizer_call is not None: text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs) - with self.tokenizer.set_phone_prob(prob=1.0): + + eval_phon_mode = contextlib.nullcontext() + if hasattr(self.tokenizer, "set_phone_prob"): + eval_phon_mode = self.tokenizer.set_phone_prob(prob=1.0) + + with eval_phon_mode: tokens = self.tokenizer.encode(text) return torch.tensor(tokens).long().unsqueeze(0).to(self.device) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 85904c3cf133..f72e30a9cc06 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -105,7 +105,7 @@ def __init__(self, cardinal: GraphFst): graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") - cardinal_graph = cardinal.graph_no_exception | pynini.string_file(get_abs_path("data/numbers/es/zero.tsv")) + cardinal_graph = cardinal.graph_no_exception | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") final_graph_wo_sign = ( pynini.closure(graph_integer + delete_extra_space, 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/fr/data/suppletive.tsv b/nemo_text_processing/inverse_text_normalization/fr/data/suppletive.tsv new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py index 845f6d3f9e9f..94bb912b0400 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py @@ -53,7 +53,6 @@ # French frequently compounds numbers with hyphen. 
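In the FastPitch and Mixer-TTS changes above, `set_phone_prob(...)` is entered only when the tokenizer actually provides it; otherwise `contextlib.nullcontext()` is substituted so the `with` statement still works for plain character tokenizers. A small sketch of the same guard, assuming toy tokenizer classes in place of NeMo's:

.. code:: python

    import contextlib


    class PhonemeTokenizer:
        """Toy tokenizer with mixed grapheme/phoneme support (stand-in for a G2P-aware tokenizer)."""

        def __init__(self):
            self.phoneme_probability = 0.5

        @contextlib.contextmanager
        def set_phone_prob(self, prob):
            old = self.phoneme_probability
            self.phoneme_probability = prob
            try:
                yield
            finally:
                self.phoneme_probability = old

        def encode(self, text):
            return text.split()


    class CharTokenizer:
        """Toy character tokenizer with no phoneme support."""

        def encode(self, text):
            return list(text)


    def encode_for_eval(tokenizer, text):
        # Fall back to a no-op context when the tokenizer has no set_phone_prob,
        # mirroring the hasattr()/nullcontext() guard in the hunks above.
        phon_mode = contextlib.nullcontext()
        if hasattr(tokenizer, "set_phone_prob"):
            phon_mode = tokenizer.set_phone_prob(prob=1.0)
        with phon_mode:
            return tokenizer.encode(text)


    print(encode_for_eval(PhonemeTokenizer(), "hello world"))
    print(encode_for_eval(CharTokenizer(), "hi"))

The advantage of `nullcontext` over wrapping the body in an `if`/`else` is that the tokenization code inside the `with` block is written only once for both kinds of tokenizer.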
delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1)) insert_hyphen = pynutil.insert("-") - suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) _s = NEMO_SIGMA + pynutil.insert("s") diff --git a/nemo_text_processing/text_normalization/de/utils.py b/nemo_text_processing/text_normalization/de/utils.py index d94499ea61f9..9eb0923cd2a3 100644 --- a/nemo_text_processing/text_normalization/de/utils.py +++ b/nemo_text_processing/text_normalization/de/utils.py @@ -43,6 +43,6 @@ def load_labels(abs_path): Returns dictionary of mappings """ - label_tsv = open(abs_path) + label_tsv = open(abs_path, encoding="utf-8") labels = list(csv.reader(label_tsv, delimiter="\t")) return labels diff --git a/nemo_text_processing/text_normalization/en/utils.py b/nemo_text_processing/text_normalization/en/utils.py index 0076e35c4e59..54a8ad95707c 100644 --- a/nemo_text_processing/text_normalization/en/utils.py +++ b/nemo_text_processing/text_normalization/en/utils.py @@ -37,6 +37,6 @@ def load_labels(abs_path): Returns dictionary of mappings """ - label_tsv = open(abs_path) + label_tsv = open(abs_path, encoding="utf-8") labels = list(csv.reader(label_tsv, delimiter="\t")) return labels diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 874cb2936b26..6352ae7eae50 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,4 +1,4 @@ -pytorch-lightning==1.5.10 +pytorch-lightning>=1.5.10 torchmetrics>=0.4.1rc0 transformers>=4.0.1 webdataset>=0.1.48,<=0.1.62 diff --git a/tutorials/asr/ASR_for_telephony_speech.ipynb b/tutorials/asr/ASR_for_telephony_speech.ipynb index cf12042572de..b8478aba1839 100644 --- a/tutorials/asr/ASR_for_telephony_speech.ipynb +++ b/tutorials/asr/ASR_for_telephony_speech.ipynb @@ -102,7 +102,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index 61e8d84fa7c4..35c23ac76cf5 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -197,7 +197,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb index 1749e2934466..d3e7aa7ce911 100644 --- a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb +++ b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb @@ -371,7 +371,7 @@ "# Download the dataset. 
This will take a few moments...\r\n", "print(\"******\")\r\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\r\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\r\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \r\n", " an4_path = wget.download(an4_url, data_dir)\r\n", " print(f\"Dataset downloaded at: {an4_path}\")\r\n", "else:\r\n", diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index 8077d9391c8e..96cd4207089e 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -154,7 +154,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", @@ -1109,6 +1109,7 @@ "# Instruct Greedy Decoders to preserve alignment information during autoregressive decoding\n", "with open_dict(decoding_config):\n", " decoding_config.preserve_alignments = True\n", + " decoding_config.fused_batch_size = -1 # temporarily stop fused batch during inference.\n", "\n", "model.change_decoding_strategy(decoding_config)" ], diff --git a/tutorials/asr/Online_Noise_Augmentation.ipynb b/tutorials/asr/Online_Noise_Augmentation.ipynb index 0ada862e4364..88588321de50 100644 --- a/tutorials/asr/Online_Noise_Augmentation.ipynb +++ b/tutorials/asr/Online_Noise_Augmentation.ipynb @@ -134,7 +134,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/Self_Supervised_Pre_Training.ipynb b/tutorials/asr/Self_Supervised_Pre_Training.ipynb index b11bd3da3375..b4bc064d6367 100644 --- a/tutorials/asr/Self_Supervised_Pre_Training.ipynb +++ b/tutorials/asr/Self_Supervised_Pre_Training.ipynb @@ -28,7 +28,7 @@ "\n", "## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/sam1373/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", "Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\n", @@ -109,7 +109,7 @@ "# Download the dataset. 
This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", @@ -281,7 +281,7 @@ "id": "RLzjCgmHuJ_j" }, "source": [ - "Since this config is for a very large model, we will modify it to make a much smaller version for the purpose of this tutorial by reducing the number of channels and the number of sub-blocks in each block, as well as reducing augmentation." + "Since this config is for a very large model, we will modify it to make a much smaller version for the purpose of this tutorial by reducing the number of channels and the number of sub-blocks in each block." ] }, { @@ -295,7 +295,7 @@ "from omegaconf import OmegaConf\n", "import torch\n", "\n", - "config_path = './configs/citrinet_ssl_1024.yaml'\n", + "config_path = data_dir + '/configs/citrinet_ssl_1024.yaml'\n", "\n", "cfg = OmegaConf.load(config_path)\n", "\n", @@ -303,16 +303,10 @@ "cfg.model.model_defaults.repeat = 1\n", "cfg.model.model_defaults.enc_final = 256\n", "\n", - "cfg.model.spec_augment.freq_masks = 2\n", - "cfg.model.spec_augment.time_masks = 5\n", - "\n", - "cfg.model.optim.weight_decay = 0\n", - "cfg.model.optim.sched.warmup_steps = 2000\n", - "\n", - "cfg.model.train_ds.manifest_filepath = \"/content/an4/train_manifest.json\"\n", + "cfg.model.train_ds.manifest_filepath = train_manifest\n", "cfg.model.train_ds.batch_size = 16\n", "\n", - "cfg.model.validation_ds.manifest_filepath = \"/content/an4/test_manifest.json\"\n", + "cfg.model.validation_ds.manifest_filepath = test_manifest\n", "cfg.model.validation_ds.batch_size = 16\n", "\n", "cfg.trainer.max_epochs = None\n", @@ -324,17 +318,42 @@ "if torch.cuda.is_available():\n", " cfg.trainer.accelerator = 'gpu'\n", " cfg.trainer.strategy = 'dp'\n", + " cfg.trainer.gpus = 1\n", "else:\n", " cfg.trainer.accelerator = 'cpu'\n", " cfg.trainer.strategy = None\n", + " cfg.trainer.gpus = 0\n", "\n", - "cfg.exp_manager.exp_dir = \"/content/exp\"\n", + "cfg.exp_manager.exp_dir = data_dir + \"/content/exp\"\n", "cfg.exp_manager.name = \"pre_trained\"\n", "cfg.exp_manager.use_datetime_version = False\n", "cfg.exp_manager.create_tensorboard_logger = False\n", "cfg.exp_manager.resume_if_exists = True\n", "cfg.exp_manager.resume_ignore_no_checkpoint = True\n", - "cfg.exp_manager.checkpoint_callback_params.save_best_model = True" + "cfg.exp_manager.checkpoint_callback_params.save_best_model = True\n", + "\n", + "cfg.trainer.check_val_every_n_epoch = 1\n", + "\n", + "cfg.model.optim.sched.name = \"CosineAnnealing\"\n", + "cfg.model.optim.sched.warmup_steps = 1000\n", + "cfg.model.optim.sched.max_steps = 5000\n", + "#in practice you will usually want a much larger amount of pre-training steps\n", + "cfg.model.optim.sched.min_lr = 0\n", + "cfg.model.optim.lr = 0.015\n", + "cfg.model.optim.weight_decay = 0\n", + "\n", + "cfg.trainer.max_steps = cfg.model.optim.sched.max_steps\n", + "\n", + "cfg.model.spec_augment.patch_size = 16\n", + "\n", + "cfg.model.train_ds.min_duration = 2\n", + "cfg.model.validation_ds.min_duration = 2\n", + "#with patch_size set to 16 and 10 patches, \n", + "#we need to be able to mask 160 time steps;\n", + "#at preprocessor stride 0.01 this means we need minimum duration of 1.6 seconds \n", + "#or more to sample only from 
masked steps in the same utterance\n", + "\n", + "cfg.model.loss.num_negatives = 40" ] }, { @@ -343,7 +362,7 @@ "id": "TrfAb1DjWzpL" }, "source": [ - "The parameters that are relevant to the self-supervised decoder and loss can be found in cfg.model.decoder and cfg.model loss. The default parameters for them tend to work well, so we will keep them as is for this tutorial." + "The following parameters will be used for decoder, loss, and masking:" ] }, { @@ -355,13 +374,18 @@ "outputs": [], "source": [ "print(OmegaConf.to_yaml(cfg.model.decoder))\n", - "print(OmegaConf.to_yaml(cfg.model.loss))" + "print(OmegaConf.to_yaml(cfg.model.loss))\n", + "print(OmegaConf.to_yaml(cfg.model.spec_augment))" ] }, { "cell_type": "markdown", "source": [ - "Note that for this loss the outputs must match the inputs, so since we are using Citrinet architecture with 8x stride, we would need to either set \"cfg.model.loss.combine_time_steps\" to 8, or put additional stride layers in the decoder. By default for Citrinet with 8x stride we use \"cfg.model.loss.combine_time_steps=4\" and \"cfg.model.decoder.stride_layers=1\" to match the 8x stride." + "Note that for this loss the outputs must match the inputs, so since we are using Citrinet architecture with 8x stride, we would need to either set \"cfg.model.loss.combine_time_steps\" to 8, or put additional stride layers in the decoder. By default for Citrinet with 8x stride we use \"cfg.model.loss.combine_time_steps=4\" and \"cfg.model.decoder.stride_layers=1\" to match the 8x stride.\n", + "\n", + "Since in MaskedPatchAugmentation we set patch_size to 16 and mask_patches is set to 10, this will result in 160 total masked steps in the spectrogram. Since combine_time_steps is set to 4, this means that 160 / 4 = 40 total potential negative can be used, so we set loss.num_negatives to 40 (unless you set sample_from_same_utterance_only to false or sample_from_non_masked to true, but this tends to make results worse).\n", + "\n", + "In the default configs we assume that min_duration for samples is higher (8 seconds by default), so there we can set patch_size to 48 for a total of 480 masked steps, and use 100 sampled negatives. If the min_duration of samples that you are training on allows, the amount of masked steps as well as negatives can be increased further (masking around 50% of the sample duration tends to work well)." ], "metadata": { "id": "4JnepitBZ3ta" @@ -373,7 +397,7 @@ "id": "yoUIMS7mgrUs" }, "source": [ - "Now we will can create the config object." + "Now we can create the config object." 
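The masking arithmetic described in the cell above can be checked in a few lines. Assuming the values quoted there (patch size 16, 10 masked patches, `combine_time_steps` 4, and a 0.01 s preprocessor stride), the number of usable negatives and the minimum sample duration fall out directly:

.. code:: python

    # Back-of-the-envelope check of the MaskedPatchAugmentation numbers quoted above.
    patch_size = 16          # time steps masked per patch
    mask_patches = 10        # masked patches per utterance
    combine_time_steps = 4   # encoder frames combined per contrastive target
    preprocessor_stride_s = 0.01

    masked_steps = patch_size * mask_patches                 # 160 spectrogram frames
    num_negatives = masked_steps // combine_time_steps       # 160 / 4 = 40 candidates
    min_duration_s = masked_steps * preprocessor_stride_s    # 1.6 s of audio needed

    print(masked_steps, num_negatives, min_duration_s)       # 160 40 1.6

The same arithmetic explains the default configs mentioned above: with 8-second minimum durations, a patch size of 48 and 10 patches give 480 masked steps, leaving room for the 100 sampled negatives.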
] }, { @@ -489,23 +513,21 @@ }, "outputs": [], "source": [ - "config_path = './configs/citrinet_1024.yaml'\n", + "config_path = data_dir + '/configs/citrinet_1024.yaml'\n", "\n", "cfg = OmegaConf.load(config_path)\n", "\n", "cfg.model.model_defaults.filters = 256\n", "cfg.model.model_defaults.repeat = 1\n", "cfg.model.model_defaults.enc_final = 256\n", - "cfg.model.optim.weight_decay = 0\n", - "cfg.model.optim.sched.warmup_steps = 500\n", "\n", "cfg.model.spec_augment.freq_masks = 2\n", "cfg.model.spec_augment.time_masks = 5\n", "\n", - "cfg.model.train_ds.manifest_filepath = \"/content/an4/train_manifest.json\"\n", + "cfg.model.train_ds.manifest_filepath = train_manifest\n", "cfg.model.train_ds.batch_size = 16\n", "\n", - "cfg.model.validation_ds.manifest_filepath = \"/content/an4/test_manifest.json\"\n", + "cfg.model.validation_ds.manifest_filepath = test_manifest\n", "cfg.model.validation_ds.batch_size = 16\n", "\n", "cfg.model.log_prediction = False\n", @@ -517,20 +539,31 @@ "if torch.cuda.is_available():\n", " cfg.trainer.accelerator = 'gpu'\n", " cfg.trainer.strategy = 'dp'\n", + " cfg.trainer.gpus = 1\n", "else:\n", " cfg.trainer.accelerator = 'cpu'\n", " cfg.trainer.strategy = None\n", + " cfg.trainer.gpus = 0\n", "\n", "cfg.model.tokenizer.dir = data_dir + \"/tokenizers/an4/tokenizer_spe_unigram_v128/\" # note this is a directory, not a path to a vocabulary file\n", "cfg.model.tokenizer.type = \"bpe\"\n", "\n", - "cfg.exp_manager.exp_dir = \"/content/exp\"\n", + "cfg.exp_manager.exp_dir = data_dir + \"/content/exp\"\n", "cfg.exp_manager.name = \"fine_tuned\"\n", "cfg.exp_manager.use_datetime_version = False\n", "cfg.exp_manager.create_tensorboard_logger = False\n", "cfg.exp_manager.resume_if_exists = True\n", "cfg.exp_manager.resume_ignore_no_checkpoint = True\n", - "cfg.exp_manager.checkpoint_callback_params.save_best_model = True" + "cfg.exp_manager.checkpoint_callback_params.save_best_model = True\n", + "\n", + "cfg.model.optim.sched.name = \"CosineAnnealing\"\n", + "cfg.model.optim.sched.warmup_steps = 500\n", + "cfg.model.optim.sched.max_steps = 2000\n", + "cfg.model.optim.sched.min_lr = 0\n", + "cfg.model.optim.lr = 0.015 #if encoder is frozen, lr can be much higher\n", + "cfg.model.optim.weight_decay = 0\n", + "\n", + "cfg.trainer.max_steps = cfg.model.optim.sched.max_steps" ] }, { @@ -550,7 +583,7 @@ }, "outputs": [], "source": [ - "cfg.init_from_nemo_model=\"/content/exp/pre_trained/checkpoints/pre_trained.nemo\"\n", + "cfg.init_from_nemo_model=data_dir + \"/content/exp/pre_trained/checkpoints/pre_trained.nemo\"\n", "cfg.init_strict = False" ] }, @@ -622,7 +655,7 @@ { "cell_type": "markdown", "source": [ - "We can optionally freeze the encoder and only fine-tune the decoder during traning. This can be done to lower the computational requirements of fine-tuning, but will likely result in a higher word error rate." + "We can optionally freeze the encoder and only fine-tune the decoder during training. This can be done to lower the memory and time requirements of fine-tuning, but will likely result in a higher word error rate." ], "metadata": { "id": "S5aVb2F8WuAR" @@ -662,7 +695,7 @@ { "cell_type": "markdown", "source": [ - "With the default parameters in this notebook, this pre-training and fine-tuning should achieve around 0.2-0.3 word error rate on the an4 validation set." + "With the default parameters in this notebook, this pre-training and fine-tuning should achieve around 0.2-0.3 word error rate on the an4 validation set. 
With frozen encoder and lr increased to 0.15, you will get around 0.6 WER." ], "metadata": { "id": "UfnbNZ-AmmD1" diff --git a/tutorials/nlp/PTune_multiple_NLP_tasks.ipynb b/tutorials/nlp/PTune_multiple_NLP_tasks.ipynb index 7e34e1481efc..c449e8c53d48 100644 --- a/tutorials/nlp/PTune_multiple_NLP_tasks.ipynb +++ b/tutorials/nlp/PTune_multiple_NLP_tasks.ipynb @@ -181,7 +181,7 @@ "source": [ "## Downloading Financial Phrase Bank Dataset\n", "\n", - "The datase is collected by Malo et al. 2014, and can be downloaded from this [link](https://www.researchgate.net/profile/Pekka_Malo/publication/251231364_FinancialPhraseBank-v10/data/0c96051eee4fb1d56e000000/FinancialPhraseBank-v10.zip). The zip file for the Financial Phrase Bank Dataset has been provided for ease of download and use." + "The dataset is collected by Malo et al. 2014, and can be downloaded from this [link](https://www.researchgate.net/profile/Pekka_Malo/publication/251231364_FinancialPhraseBank-v10/data/0c96051eee4fb1d56e000000/FinancialPhraseBank-v10.zip). The zip file for the Financial Phrase Bank Dataset has been provided for ease of download and use." ] }, { @@ -449,7 +449,7 @@ "source": [ "## Convert the Megatron-LM Weights to Nemo file\n", "\n", - "P-Tuning method works the best with large GPT lanague models. From our experiences, models of size 5B or above give good performance. If you already have a large GPT model ready, skip this section. \n", + "P-Tuning method works the best with large GPT language models. From our experiences, models of size 5B or above give good performance. If you already have a large GPT model ready, skip this section. \n", "\n", "In this example, we will use the pretrained 344M NeMo Megatron GPT model from [Megatron-LM project](https://github.com/NVIDIA/Megatron-LM). To load it in NeMo Megatron, We first need to convert the Megatron-LM checkpoint to the `.nemo` file. Let's download the pretrained model weights and vocabulary file.\n", "\n" @@ -470,7 +470,7 @@ "\n", "if not pathlib.Path(gpt_file).exists():\n", " !wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O $gpt_file\n", - " !unzip -f $gpt_file\n", + " !unzip -o $gpt_file\n", " !wget https://s3.amazonaws.com/models.huggingface.co/bert/$vocab_file -O $vocab_file \n", " !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O $merge_file\n", "\n" @@ -634,7 +634,7 @@ "source": [ "## Add the Data Processors to Generate the Prompts\n", "\n", - "To customize different prompts for different tasks, we can configure the `TemplateProcessor` to define the template for the prompt input. The curley brackets defines the variables in the templte string. `{v0}`, `{v1}`, `{v2}` indicates the virtual token of length `prompt_encoder.template[0]`, `prompt_encoder.template[1]` and `prompot_encoder.template[2]`. The other variables `{var}` refers to the variables in the data record. For example.\n", + "To customize different prompts for different tasks, we can configure the `TemplateProcessor` to define the template for the prompt input. The curly brackets defines the variables in the template string. `{v0}`, `{v1}`, `{v2}` indicates the virtual token of length `prompt_encoder.template[0]`, `prompt_encoder.template[1]` and `prompt_encoder.template[2]`. The other variables `{var}` refers to the variables in the data record. 
For example.\n", "\n", "Given the data record, **{\"sentence1\": \"And he said, Mama, I'm home.\", \"sentence2\": \"He didn't say a word.\"}** and template list [3, 3, 3], \n", "the template string **{v0} Hypothesis: [sentence1], {v1} Premise: [sentence2] {v2} Answer:** will be translated into **VVV Hypothesis: And he said, Mama, I'm home.VVV Premise: He didn't say a word.VVV Answer:**, where VVV is virtual token of space 3.\n", @@ -739,8 +739,7 @@ "config.trainer.accelerator = accelerator\n", "config.trainer.devices = 1\n", "config.trainer.max_epochs = 100\n", - "config.trainer.val_check_interval=95230\n", - "\n", + "config.trainer.val_check_interval=1.0\n", "# for PyTorch Native AMP set precision=16\n", "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", "\n", diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 44f1e0a05ff3..f5237970ef71 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -85,7 +85,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index db031572e748..e1484d6412e3 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -155,7 +155,7 @@ "# NeMo's training scripts are stored inside the examples/ folder. Let's grab the tacotron2.py file\n", "# as well as the tacotron2.yaml file\n", "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/tacotron2.py\n", - "!mkdir conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." + "!mkdir -p conf && cd conf && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/tacotron2.yaml && cd .." ] }, {