From c1b198b5317b2cf0b7e11e2e5e33b425986b4f4b Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Mon, 20 May 2024 10:17:02 -0700 Subject: [PATCH 01/47] Add mel codec checkpoints (#9228) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add mel codec checkpoints Signed-off-by: Ante Jukić --------- Signed-off-by: Ante Jukić --- docs/source/tts/data/ngc_models_codec.csv | 2 ++ docs/source/tts/models.rst | 3 ++- nemo/collections/tts/models/audio_codec.py | 26 ++++++++++++++++++---- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/docs/source/tts/data/ngc_models_codec.csv b/docs/source/tts/data/ngc_models_codec.csv index d46567012600..6827c54ce7f4 100644 --- a/docs/source/tts/data/ngc_models_codec.csv +++ b/docs/source/tts/data/ngc_models_codec.csv @@ -1,2 +1,4 @@ Model Name,Dataset,Sampling Rate,Model Class,Overview,Checkpoint audio_codec_16khz_small,Libri-Light,16000Hz,nemo.collections.tts.models.AudioCodecModel,`audio_codec_16khz_small `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/audio_codec_16khz_small/versions/v1/files/audio_codec_16khz_small.nemo`` +mel_codec_22khz_medium,LibriVox and Common Voice,22050Hz,nemo.collections.tts.models.AudioCodecModel,`mel_codec_22khz_medium `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_medium/versions/v1/files/mel_codec_22khz_medium.nemo`` +mel_codec_44khz_medium,LibriVox and Common Voice,44100Hz,nemo.collections.tts.models.AudioCodecModel,`mel_codec_44khz_medium `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_medium/versions/v1/files/mel_codec_44khz_medium.nemo`` diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index 6f9d7d24c45d..7ea5caa4d871 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -140,9 +140,10 @@ Codecs Audio Codec ~~~~~~~~~~~ -The NeMo Audio Codec model is a non-autoregressive convolutional encoder-quantizer-decoder model for coding or tokenization of raw audio signals. +The NeMo Audio Codec model is a non-autoregressive convolutional encoder-quantizer-decoder model for coding or tokenization of raw audio signals or mel-spectrogram features. The NeMo Audio Codec model supports residual vector quantizer (RVQ) :cite:`tts-models-zeghidour2022soundstream` and finite scalar quantizer (FSQ) :cite:`tts-models-mentzer2023finite` for quantization of the encoder output. This model is trained end-to-end using generative loss, discriminative loss, and reconstruction loss, similar to other neural audio codecs such as SoundStream :cite:`tts-models-zeghidour2022soundstream` and EnCodec :cite:`tts-models-defossez2022encodec`. +For further information refer to the ``Audio Codec Training`` tutorial in the TTS tutorial section. .. 
image:: images/audiocodec_model.png :align: center diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 81fb7cb5cd7b..04a6d2793f88 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -118,7 +118,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # STFT loss setup stft_loss_log_guard = cfg.get("stft_loss_log_guard", 1.0) self.stft_loss_scale = cfg.get("stft_loss_scale", 0.0) - self.stft_loss_fn = MultiResolutionSTFTLoss(resolutions=loss_resolutions, log_guard=stft_loss_log_guard,) + self.stft_loss_fn = MultiResolutionSTFTLoss( + resolutions=loss_resolutions, + log_guard=stft_loss_log_guard, + ) # Time domain loss setup self.time_domain_loss_scale = cfg.get("time_domain_loss_scale", 1.0) @@ -237,7 +240,9 @@ def quantize(self, encoded: torch.Tensor, encoded_len: torch.Tensor) -> torch.Te "tokens": NeuralType(('B', 'C', 'T_encoded'), TokenIndex()), "tokens_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T_encoded'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T_encoded'), EncodedRepresentation()), + }, ) def dequantize(self, tokens: torch.Tensor, tokens_len: torch.Tensor) -> torch.Tensor: """Convert the discrete tokens into a continuous encoded representation. @@ -392,8 +397,7 @@ def _process_batch(self, batch): @property def disc_update_prob(self) -> float: - """Probability of updating the discriminator. - """ + """Probability of updating the discriminator.""" return self.disc_updates_per_period / self.disc_update_period def should_update_disc(self, batch_idx) -> bool: @@ -652,4 +656,18 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) models.append(model) + model = PretrainedModelInfo( + pretrained_model_name="mel_codec_22khz_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_medium/versions/v1/files/mel_codec_22khz_medium.nemo", + description="For details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_22khz_medium", + ) + models.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="mel_codec_44khz_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_medium/versions/v1/files/mel_codec_44khz_medium.nemo", + description="For details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_44khz_medium", + ) + models.append(model) + return models From 67f06aca159e1970f0df25ba0c69180536ea5a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 20 May 2024 20:24:51 +0200 Subject: [PATCH 02/47] ci: Remove duplicated job (#9258) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4efb525100d9..dbc7d907580a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -289,9 +289,6 @@ jobs: run: | rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo; rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/ - - name: Cleanup - if: "always()" - run: | rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" From 
5d4f6b2ea629ed7b89305a4a7d984b792af2139e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 May 2024 12:42:16 -0700 Subject: [PATCH 03/47] fix import (#9240) * fix import Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../nlp/language_modeling/megatron_lm_ckpt_to_nemo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py index 03d6fd94e4e2..72252a03d5be 100644 --- a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py @@ -291,9 +291,9 @@ def load_from_checkpoint( **kwargs, ): """ - Loads Megatron_LM checkpoints, convert it, with some maintenance of restoration. - For documentation, please refer to LightningModule.load_from_checkpoin() documentation. - """ + Loads Megatron_LM checkpoints, convert it, with some maintenance of restoration. + For documentation, please refer to LightningModule.load_from_checkpoin() documentation. + """ checkpoint = None try: cls._set_model_restore_state(is_being_restored=True) @@ -470,7 +470,7 @@ def convert(local_rank, rank, world_size, args): ) if mcore_output and not args.mcore_input: # convert from legacy Megatron-LM to MCore NeMo. Initialize an mcore translation dict - from scripts.nlp_language_modeling.convert_nemo_gpt_to_mcore import build_key_mapping + from scripts.checkpoint_converters.convert_gpt_nemo_to_mcore import build_key_mapping mcore_translate = {} for k, v in build_key_mapping(model_cfg).items(): From a69ace4f5ac5f72367852f78538ea6c9880c39b2 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 20 May 2024 16:38:19 -0700 Subject: [PATCH 04/47] Fix document links (#9260) Signed-off-by: yaoyu-33 --- docs/source/features/parallelisms.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index 9d5f33196c4e..d5e86e46a49d 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -44,7 +44,7 @@ Implement Tensor Parallelism NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_. -For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. +For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -87,7 +87,7 @@ Implement Pipeline Parallelism NeMo's implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. -For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. +For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. 
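To make the relationship between these settings concrete, here is a minimal sketch (an illustration, not part of this patch series; the 8-GPU layout and the specific sizes are assumptions) of how the tensor- and pipeline-parallel sizes combine with the device count in a NeMo-style config.

.. code-block:: python

    # Illustrative only: the config keys mirror the standard NeMo megatron configs
    # (tensor_model_parallel_size, pipeline_model_parallel_size); the sizes are assumed.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "trainer": {"devices": 8, "num_nodes": 1},
            "model": {
                "tensor_model_parallel_size": 4,    # TP degree
                "pipeline_model_parallel_size": 2,  # PP degree
            },
        }
    )

    # Each model-parallel replica spans TP * PP GPUs; the remaining factor of the
    # world size is the data-parallel size.
    world_size = cfg.trainer.devices * cfg.trainer.num_nodes
    model_parallel_size = (
        cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
    )
    assert world_size % model_parallel_size == 0
    print("data-parallel size:", world_size // model_parallel_size)
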
Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -132,7 +132,7 @@ To activate CP in the NeMo framework, set the ``context_parallel_size`` paramete context_parallel_size: 1 # Example to enable Context Parallelism -The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. +The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 2e1814c9f031ad2aeeebad44597365e97253d2c4 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 21 May 2024 09:56:38 -0400 Subject: [PATCH 05/47] Add TRT-LLM params like max_num_tokens and opt_num_tokens (#9210) * Add params like max_num_tokens and opt_num_tokens Signed-off-by: Onur Yilmaz * remove padding param added * update params like max_num_token Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * remove context context_fmha param for now Signed-off-by: Onur Yilmaz * add params like max num token to the script Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia Co-authored-by: Pablo Garay --- nemo/export/tensorrt_llm.py | 21 ++++++--- nemo/export/trt_llm/tensorrt_llm_build.py | 28 ++++++++++-- scripts/deploy/nlp/deploy_triton.py | 53 +++++++++++++++++++---- scripts/export/export_to_trt_llm.py | 19 +++++--- tests/export/test_nemo_export.py | 2 + 5 files changed, 100 insertions(+), 23 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index af4f1b6699ee..cad7b821b3b4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -117,15 +117,16 @@ def export( max_batch_size: int = 8, max_prompt_embedding_table_size=None, use_parallel_embedding: bool = False, - use_inflight_batching: bool = False, - enable_context_fmha: bool = True, - paged_kv_cache: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, dtype: str = "bfloat16", load_model: bool = True, enable_multi_block_mode: bool = False, use_lora_plugin: str = None, lora_target_modules: List[str] = None, max_lora_rank: int = 64, + max_num_tokens: int = None, + opt_num_tokens: int = None, save_nemo_model_config: bool = False, ): """ @@ -142,12 +143,18 @@ def export( max_output_token (int): max output length. max_batch_size (int): max batch size. max_prompt_embedding_table_size (int): max prompt embedding size. - use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend. - enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention. + use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. + remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). load_model (bool): load TensorRT-LLM model after the export. enable_multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. + use_lora_plugin (str): use dynamic lora or not. + lora_target_modules (List[str]): list of the target lora modules. + max_lora_rank (int): maximum lora rank. 
+ max_num_tokens (int): + opt_num_tokens (int): + save_nemo_model_config (bool): """ if model_type not in self.get_supported_models_list: @@ -238,6 +245,10 @@ def export( lora_target_modules=lora_target_modules, max_prompt_embedding_table_size=max_prompt_embedding_table_size, enable_multi_block_mode=enable_multi_block_mode, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index ac8d9094ea32..2336b8eb38ce 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -24,6 +24,7 @@ import tensorrt_llm import torch from tensorrt_llm import str_dtype_to_trt +from tensorrt_llm._common import check_max_num_tokens from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import BuildConfig, Builder from tensorrt_llm.commands.build import build as build_trtllm @@ -371,6 +372,12 @@ def build_and_save_engine( lora_target_modules=None, max_prompt_embedding_table_size=0, enable_multi_block_mode: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + max_num_tokens: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -383,15 +390,30 @@ def build_and_save_engine( plugin_config.set_gpt_attention_plugin(dtype=str_dtype) plugin_config.set_gemm_plugin(dtype=str_dtype) plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) - max_num_tokens = max_batch_size * max_input_len + if paged_kv_cache: + plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) + else: + plugin_config.paged_kv_cache = False + plugin_config.remove_input_padding = remove_input_padding + + max_num_tokens, opt_num_tokens = check_max_num_tokens( + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_beam_width=max_beam_width, + remove_input_padding=remove_input_padding, + enable_context_fmha=plugin_config.context_fmha, + tokens_per_block=tokens_per_block, + ) build_dict = { 'max_input_len': max_input_len, 'max_output_len': max_output_len, 'max_batch_size': max_batch_size, - 'max_beam_width': 1, + 'max_beam_width': max_beam_width, 'max_num_tokens': max_num_tokens, - 'opt_num_tokens': None, + 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, 'gather_context_logits': False, 'gather_generation_logits': False, diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index aa896e924584..7370731ec996 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -27,7 +27,8 @@ def get_args(argv): parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") parser.add_argument( @@ -73,6 +74,8 @@ def get_args(argv): parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") parser.add_argument("-mol", "--max_output_len", default=256, type=int, 
help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) @@ -80,11 +83,11 @@ def get_args(argv): "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( - "-dcf", - "--disable_context_fmha", + "-drip", + "--disable_remove_input_padding", default=False, action='store_true', - help="Disable fused Context MultiHeadedAttention (required for V100 support).", + help="Disables the remove input padding option.", ) parser.add_argument( "-mbm", @@ -101,7 +104,6 @@ def get_args(argv): '--use_lora_plugin', nargs='?', const=None, - default=False, choices=['float16', 'float32', 'bfloat16'], help="Activates the lora plugin which enables embedding sharing.", ) @@ -109,7 +111,16 @@ def get_args(argv): '--lora_target_modules', nargs='+', default=None, - choices=["attn_qkv", "attn_q", "attn_k", "attn_v", "attn_dense", "mlp_h_to_4h", "mlp_gate", "mlp_4h_to_h",], + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", ) parser.add_argument( @@ -198,6 +209,29 @@ def nemo_deploy(argv): trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) if args.nemo_checkpoint is not None: + + trt_llm_exporter.export( + nemo_checkpoint_path=args.nemo_checkpoint, + model_type=args.model_type, + n_gpus=args.num_gpus, + tensor_parallel_size=args.num_gpus, + pipeline_parallel_size=1, + max_input_token=args.max_input_len, + max_output_token=args.max_output_len, + max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + paged_kv_cache=args.use_paged_kv_cache, + remove_input_padding=(not args.disable_remove_input_padding), + dtype=args.dtype, + enable_multi_block_mode=args.multi_block_mode, + use_lora_plugin=args.use_lora_plugin, + lora_target_modules=args.lora_target_modules, + max_lora_rank=args.max_lora_rank, + save_nemo_model_config=True, + ) + try: LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") trt_llm_exporter.export( @@ -209,9 +243,11 @@ def nemo_deploy(argv): max_input_token=args.max_input_len, max_output_token=args.max_output_len, max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, paged_kv_cache=args.use_paged_kv_cache, - enable_context_fmha=not args.disable_context_fmha, + remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, @@ -236,7 +272,8 @@ def nemo_deploy(argv): ) ) trt_llm_exporter.add_prompt_table( - task_name=str(task_id), prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + task_name=str(task_id), + prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) except Exception as error: 
LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index 5e5833444f65..e9741516cf00 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -53,18 +53,20 @@ def get_args(argv): parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-uib", - "--use_inflight_batching", - default=False, - action='store_true', - help="Enable inflight batching for TensorRT-LLM Triton backend.", + "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-drip", + "--disable_remove_input_padding", + default=False, + action='store_true', + help="Disables the remove input padding option.", ) parser.add_argument( "-mbm", @@ -141,9 +143,12 @@ def nemo_export_trt_llm(argv): max_input_token=args.max_input_len, max_output_token=args.max_output_len, max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - use_inflight_batching=args.use_inflight_batching, paged_kv_cache=args.use_paged_kv_cache, + remove_input_padding=(not args.disable_remove_input_padding), + dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index 0e9981403a1a..b3e186433561 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -214,6 +214,8 @@ def run_trt_llm_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, + max_num_tokens=int(max_input_token * max_batch_size * 0.2), + opt_num_tokens=60, save_nemo_model_config=True, ) From c7bf46e88b404078f58c077f25a9b9180565d43d Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 21 May 2024 11:25:15 -0700 Subject: [PATCH 06/47] sum-reduce grad_norm in DP+CP domain (#9262) * sum-reudce grad_norm in DP+CP domain Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: pablo-garay --------- Signed-off-by: Sangkug Lym Signed-off-by: pablo-garay Co-authored-by: Pablo Garay Co-authored-by: pablo-garay --- .../nlp/modules/common/megatron/clip_grads.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/clip_grads.py b/nemo/collections/nlp/modules/common/megatron/clip_grads.py index 7edc6720574e..b87c260ca4da 100644 --- a/nemo/collections/nlp/modules/common/megatron/clip_grads.py +++ 
b/nemo/collections/nlp/modules/common/megatron/clip_grads.py @@ -142,7 +142,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, use_fsdp=False): grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze() # Since we will be summing across data parallel groups, # we need the pow(norm-type). - total_norm = grad_norm ** norm_type + total_norm = grad_norm**norm_type if use_fsdp: if len(sharded_grads_for_norm) > 0: sharded_grad_norm, _ = multi_tensor_applier( @@ -150,20 +150,22 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, use_fsdp=False): ) else: sharded_grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze() - total_sharded_norm = sharded_grad_norm ** norm_type + total_sharded_norm = sharded_grad_norm**norm_type else: for grad in grads_for_norm: grad_norm = torch.norm(grad, norm_type) - total_norm += grad_norm ** norm_type + total_norm += grad_norm**norm_type if use_fsdp: for grad in sharded_grads_for_norm: grad_norm = torch.norm(grad, norm_type) - total_sharded_norm += grad_norm ** norm_type + total_sharded_norm += grad_norm**norm_type if use_fsdp: # Sum norm of grad shards across data-parallel GPUs. torch.distributed.all_reduce( - total_sharded_norm, op=torch.distributed.ReduceOp.SUM, group=parallel_state.get_data_parallel_group(), + total_sharded_norm, + op=torch.distributed.ReduceOp.SUM, + group=parallel_state.get_data_parallel_group(with_context_parallel=True), ) total_norm += total_sharded_norm.squeeze() From d7bb40364c17bf322004539f851cc83df4c4c2b7 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 21 May 2024 20:14:18 -0700 Subject: [PATCH 07/47] Add llama3 and distributed checkpoint support in NeVA (#9101) * temp save Signed-off-by: yaoyu-33 * temp save 2 Signed-off-by: yaoyu-33 * update code Signed-off-by: yaoyu-33 * enable seq packing Signed-off-by: yaoyu-33 * fix neva and clip Signed-off-by: yaoyu-33 * Enable parallel seq packing algo and few other fixes Signed-off-by: yaoyu-33 * Pipeline parallel support Signed-off-by: yaoyu-33 * Update data preprocess Signed-off-by: yaoyu-33 * fix few pp issues Signed-off-by: yaoyu-33 * enable sequence packing w/ PP Signed-off-by: yaoyu-33 * Fix cu_seqlens in inputs Signed-off-by: yaoyu-33 * add assert Signed-off-by: yaoyu-33 * Depend on PP to decide whether do padding Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add docstring Signed-off-by: yaoyu-33 * Fix few evaluation issues Signed-off-by: yaoyu-33 * Fix few PP evaluation issues Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add llama3 template Signed-off-by: yaoyu-33 * address comments Signed-off-by: yaoyu-33 * Fix license Signed-off-by: yaoyu-33 * Fix llama3 Signed-off-by: yaoyu-33 * Few fixes Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Few neva bugs Signed-off-by: yaoyu-33 * llama3 inference fix Signed-off-by: yaoyu-33 * Force vision encoder to run in fp32 Signed-off-by: yaoyu-33 * Revert "Force vision encoder to run in fp32" This reverts commit 9d2160d96cb3e2a27a18538950ef43b4482c04da. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Try adding distributed format of checkpoint Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Allow dist checkpoint to be non-strict Signed-off-by: yaoyu-33 * Fix Signed-off-by: yaoyu-33 * Some fixes for PP + dist ckpt in Neva Signed-off-by: yaoyu-33 * fix peft Signed-off-by: yaoyu-33 * few fixes for lora Signed-off-by: yaoyu-33 * checkpoint updates Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * bug fix Signed-off-by: yaoyu-33 * Add neva dist checkpoint converter Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * resolve comments Signed-off-by: yaoyu-33 * update neva dist ckpt apis Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix return Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yaoyu-33 --- .../neva/conf/neva_inference.yaml | 2 +- .../multimodal_llm/neva/eval/gradio_server.py | 1 + .../multimodal_llm/neva/eval/vqa_science.py | 1 - .../neva/neva_convert_to_dist_ckpt.py | 89 +++++++ .../multimodal/data/neva/conversation.py | 61 ++++- .../multimodal/data/neva/neva_dataset.py | 221 +++++++++++++++--- .../models/multimodal_llm/neva/neva_model.py | 144 ++++++++++-- nemo/collections/multimodal/parts/utils.py | 18 +- .../common/text_generation_strategy.py | 118 +++++++--- .../modules/common/text_generation_utils.py | 38 +-- .../parts/mixins/multimodal_adapter_mixins.py | 85 ++++--- .../nlp/parts/mixins/nlp_adapter_mixins.py | 44 +++- nemo/collections/nlp/parts/nlp_overrides.py | 4 +- nemo/utils/callbacks/dist_ckpt_io.py | 36 ++- 14 files changed, 690 insertions(+), 172 deletions(-) create mode 100644 examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml index 145575d8a73b..b06f4bd8e535 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml @@ -11,7 +11,7 @@ inference: compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False end_strings: ["","",] # generation will stop when one of these tokens is generated media_base_path: /pwd/images # /path/to/images or /path/to/videos - insert_media_token: left # `left` or `right` or `null` + insert_media_token: null # `left` or `right` or `null` media_type: image # `image` or `video` trainer: diff --git a/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py b/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py index 88cfdc4ed194..7c04a7045f00 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py +++ b/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py @@ -20,6 +20,7 @@ from omegaconf import OmegaConf from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam CFG_STRING = """ trainer: diff --git a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py 
index a80c9e70f4ed..17bda5725eb4 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py +++ b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py @@ -169,7 +169,6 @@ def eval_model(args): parser.add_argument("--image-folder", type=str, default="") parser.add_argument("--question-file", type=str, default="tables/question.json") parser.add_argument("--answers-file", type=str, default="answer.jsonl") - parser.add_argument("--conv-mode", type=str, default="llava_v0") parser.add_argument("--tp", type=int, default=1) parser.add_argument("--pp", type=int, default=1) parser.add_argument("--num-chunks", type=int, default=1) diff --git a/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py b/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py new file mode 100644 index 000000000000..8891a8e9d208 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentParser +from omegaconf.omegaconf import OmegaConf + +from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_path", + type=str, + default=None, + required=True, + help="Path to NeMo legacy checkpoints", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--gpus_per_node", type=int, required=False, default=8) + parser.add_argument("--num_nodes", type=int, required=False, default=1) + parser.add_argument( + "--precision", + type=str, + required=False, + default='bf16-mixed', + choices=['32-true', '16-mixed', 'bf16-mixed'], + help="Precision value for the trainer that matches with precision of the ckpt", + ) + args = parser.parse_args() + return args + + +def main() -> None: + args = get_args() + cfg = { + 'trainer': { + 'devices': args.gpus_per_node, + 'num_nodes': args.num_nodes, + 'accelerator': 'gpu', + 'precision': args.precision, + }, + 'model': { + 'native_amp_init_scale': 2**32, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'gradient_as_bucket_view': True, + }, + 'cluster_type': 'BCP', + } + cfg = OmegaConf.create(cfg) + + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None + trainer = MegatronTrainerBuilder(cfg).create_trainer() + + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(args.input_path): + save_restore_connector.model_extracted_dir = args.input_path + + model = MegatronNevaModel.restore_from( + restore_path=args.input_path, + 
trainer=trainer, + save_restore_connector=save_restore_connector, + strict=False, + ) + + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 80a297a5b952..43b1977aa993 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import dataclasses +from collections import defaultdict from enum import Enum, auto from typing import List @@ -24,9 +25,14 @@ DEFAULT_SYSTEM_TOKEN = "" DEFAULT_SEPARATOR_TOKEN = "" DEFAULT_LABELS_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = defaultdict(lambda: "") +DEFAULT_IM_START_TOKEN = defaultdict(lambda: "") +DEFAULT_IM_END_TOKEN = defaultdict(lambda: "") + +# Update llama3 default +DEFAULT_IMAGE_PATCH_TOKEN["llama_3"] = "<|reserved_special_token_3|>" +DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" +DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" class SeparatorStyle(Enum): @@ -36,6 +42,7 @@ class SeparatorStyle(Enum): TWO = auto() PLAIN = auto() LLAMA_2 = auto() + LLAMA_3 = auto() NVGPT = auto() @@ -109,6 +116,34 @@ def get_prompt(self): else: ret += "" ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.LLAMA_3: + """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + """ + wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}" + wrap_user = lambda msg: f"<|start_header_id|>user<|end_header_id|>\n\n{msg}" + wrap_assistant = lambda msg: f"<|start_header_id|>assistant<|end_header_id|>\n\n{msg}" + + ret = "<|begin_of_text|>" + wrap_sys(self.system) + self.sep + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if type(message) is tuple: + message, _, _ = message + elif i % 2 == 0: + ret += wrap_user(message) + self.sep + else: + ret += wrap_assistant(message) + (self.sep if message else "") + elif self.sep_style == SeparatorStyle.PLAIN: seps = [self.sep, self.sep2] ret = self.system @@ -346,8 +381,25 @@ def dict(self): sep2=DEFAULT_EOS_TOKEN, ) +conv_llava_llama_3 = Conversation( + system="You are a helpful language and vision assistant. 
" + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llama_v3", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_3, + sep="<|eot_id|>", +) + conv_llava_plain = Conversation( - system="", roles=("", ""), messages=(), offset=0, sep_style=SeparatorStyle.PLAIN, sep="\n", + system="", + roles=("", ""), + messages=(), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", ) conv_llava_v0 = Conversation( @@ -416,6 +468,5 @@ def dict(self): "nv_dpo": conv_nv_dpo, } - if __name__ == "__main__": print(default_conversation.get_prompt()) diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index caaab2c5d67e..70afc5b4a19a 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -34,17 +34,11 @@ import nemo.collections.multimodal.data.neva.conversation as conversation_lib from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform from nemo.collections.multimodal.data.neva.conversation import ( - DEFAULT_BOS_TOKEN, - DEFAULT_EOS_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, - DEFAULT_PAD_TOKEN, - DEFAULT_SEPARATOR_TOKEN, - DEFAULT_SYSTEM_TOKEN, - DEFAULT_UNK_TOKEN, DEFAULT_VIDEO_TOKEN, ) from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids @@ -188,7 +182,10 @@ def flatten_frames(self, cap): def tokenize( - texts: Union[str, List[str]], tokenizer: Any, context_length: int, add_extra_token: int, + texts: Union[str, List[str]], + tokenizer: Any, + context_length: int, + add_extra_token: int, ) -> torch.LongTensor: """ Returns the tokenized representation of given input string(s). If the list of tokens exceeds the context @@ -216,7 +213,7 @@ def tokenize( if isinstance(texts, str): texts = [texts] texts_is_str = True - tokens = tokenizer.text_to_ids(texts) + tokens = [tokenizer.text_to_ids(t) for t in texts] max_len = max([len(token) for token in tokens]) context_length = min(max_len - add_extra_token, context_length) # truncate and padding @@ -251,6 +248,7 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in - dict: The processed sources dictionary after applying multimodal preprocessing steps. 
""" is_multimodal = multimodal_cfg['is_multimodal'] + model_type = multimodal_cfg['model_type'] media_type = multimodal_cfg['media_type'] image_token_len = cur_token_len if media_type == 'image': @@ -268,11 +266,10 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in num_patches *= multimodal_cfg['num_frames'] if multimodal_cfg['use_im_start_end']: - replace_token = DEFAULT_IMAGE_PATCH_TOKEN * num_patches + replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * num_patches else: - replace_token = DEFAULT_IMAGE_PATCH_TOKEN * (num_patches - 2) - - replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * (num_patches - 2) + replace_token = DEFAULT_IM_START_TOKEN[model_type] + replace_token + DEFAULT_IM_END_TOKEN[model_type] for source in sources: conversation = source['conversations'] @@ -295,7 +292,103 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in return sources -def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_llama_3( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """ + Preprocesses sources for the LLaMA 3 model configuration. + + The function applies prompt templates and tokenizes the conversations according to the LLaMA 2 model specifications. + It involves special handling of tokens, masking of labels, and adjustments based on configuration settings. + + Parameters: + - sources (dict): A dictionary of sources containing conversations to be processed. + - tokenizer: The tokenizer to be used for processing the text. + - cfg: Configuration settings for preprocessing, including context length and additional tokens. + + Returns: + - Dict: A dictionary containing tokenized and labeled data suitable for the LLaMA 2 model. + This includes tokens, labels, and any special processing as defined in the configuration. 
+ """ + conv = conversation_lib.conv_llava_llama_3.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + source = source['conversations'] + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + add_extra_token = cfg.get("add_extra_token") + + # Tokenize conversations + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + labels = tokens.clone().detach() + # Mask labels + sep = "<|start_header_id|>assistant<|end_header_id|>\n\n" # part sep + round_sep = "<|start_header_id|>user<|end_header_id|>\n\n" + for conversation, target in zip(conversations, labels): + # the first match of round sep is going to be the one after system, which is not the intended behavior + rounds = conversation.split(round_sep) + rounds = [round_sep.join(rounds[:2])] + rounds[2:] + cur_len = 0 + for i, rou in enumerate(rounds): + + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if i == 0: + round_len = len(tokenizer.text_to_ids(rou)) + instruction_len = len(tokenizer.text_to_ids(parts[0])) + else: + round_len = len(tokenizer.text_to_ids(round_sep + rou)) + instruction_len = len(tokenizer.text_to_ids(round_sep + parts[0])) + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + # Check if masking working correctly + # print([x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())]) + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + +def preprocess_llama_2( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the LLaMA 2 model configuration. @@ -379,10 +472,17 @@ def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_v1( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the Vicuna V1 model configuration. 
@@ -462,10 +562,17 @@ def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nvgpt( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -503,9 +610,9 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: if i % 2 == 1: turn['from'] = conv.roles[1] if 'label' not in turn: - turn[ - 'label' - ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + turn['label'] = ( + "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + ) value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value'] conv.append_message(turn['from'], value) if not turn["value"]: @@ -567,10 +674,17 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nv_dpo( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -666,10 +780,17 @@ def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_plain(sources, tokenizer, cfg,) -> Dict: +def preprocess_plain( + sources, + tokenizer, + cfg, +) -> Dict: """ Preprocesses plain text sources (no template) for tokenization and label generation. 
@@ -717,7 +838,10 @@ def preprocess_plain(sources, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) class LazySupervisedDataset(Dataset): @@ -865,20 +989,45 @@ def expand2square(pil_img, background_color): ) else: - logging.warning("media not found in sources") media_tensors = torch.tensor([]) sources = copy.deepcopy(sources) if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nvgpt( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nv_dpo( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "v1": - data_dict = preprocess_v1(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_v1( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_llama_2( + sources, + self.tokenizer, + self.multimodal_cfg, + ) + elif self.conv_template == "llama_3": + data_dict = preprocess_llama_3( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "plain": - data_dict = preprocess_plain(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_plain( + sources, + self.tokenizer, + self.multimodal_cfg, + ) else: raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") @@ -981,7 +1130,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: tokens = batch['tokens'] labels = batch['labels'] - media_type = model_cfg.data.get('media_type') + media_type = model_cfg.data.get('media_type', 'image') if media_type == 'image': media = batch.get('image') elif media_type == 'video': @@ -1048,7 +1197,12 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: ) else: # TODO(yuya): Fix this hard-code for our own CLIP - image_processor = image_transform(crop_size, is_train=False, mean=None, std=None,) + image_processor = image_transform( + crop_size, + is_train=False, + mean=None, + std=None, + ) train_dataset = NevaDataset( tokenizer=tokenizer, @@ -1056,6 +1210,7 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: multimodal_cfg=dict( is_multimodal=data_cfg.is_multimodal, sep_image_conv_front=data_cfg.sep_image_conv_front, + model_type=mm_cfg.llm.get("model_type", "nvgpt"), conv_template=data_cfg.get("conv_template", "nvgpt"), crop_size=crop_size, image_token_len=data_cfg.image_token_len, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 7192a1b018b1..e33cf267c230 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -78,6 +78,7 @@ from megatron.core import InferenceParams, dist_checkpointing, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint HAVE_MEGATRON_CORE = True @@ -91,7 +92,11 @@ class 
FrozenCLIPVisionTransformer(CLIPVisionTransformer): def __init__(self, model_cfg, model_parallel_config, pre_process=True, post_process=True): super().__init__( - model_cfg, model_parallel_config, pre_process=pre_process, post_process=post_process, skip_head=True, + model_cfg, + model_parallel_config, + pre_process=pre_process, + post_process=post_process, + skip_head=True, ) self.frozen = False self.dtype = self.config.params_dtype @@ -235,6 +240,15 @@ def replace_media_embeddings(self, input_ids, inputs_embeds, media): return updated_input_embeds + def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), **kwargs): + sharded_state_dict = super().sharded_state_dict(prefix=prefix, sharded_offsets=sharded_offsets, **kwargs) + + state_dict = self.state_dict(prefix='', keep_vars=True) + state_dict.pop('weight') + # duplicate everything else + sharded_state_dict.update(make_sharded_tensors_for_checkpoint(state_dict, prefix=prefix)) + return sharded_state_dict + class NevaBaseModel: """ @@ -245,7 +259,12 @@ class NevaBaseModel: """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): self.mm_cfg = mm_cfg self.media_start_id = media_start_id @@ -264,7 +283,8 @@ def __init__( # Initialize vision encoder and freeze it if mm_cfg.vision_encoder.from_hf: vision_encoder = CLIPVisionModel.from_pretrained( - mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, + mm_cfg.vision_encoder.from_pretrained, + torch_dtype=torch.bfloat16, ).cuda() vision_encoder = vision_encoder.to(torch.bfloat16) if mm_cfg.vision_encoder.freeze: @@ -385,7 +405,12 @@ class MCoreNevaModel(MCoreGPTModel, NevaBaseModel): """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): MCoreGPTModel.__init__(self, **kwargs) NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) @@ -400,11 +425,17 @@ def freeze_llm(self, mm_cfg): else: output_layer_parameters = {} - for param in chain(embedding_parameters, self.decoder.parameters(), output_layer_parameters,): + for param in chain( + embedding_parameters, + self.decoder.parameters(), + output_layer_parameters, + ): param.requires_grad = False def forward( - self, *args, **kwargs, + self, + *args, + **kwargs, ): media = kwargs.pop('media', None) if parallel_state.is_pipeline_first_stage(ignore_virtual=True): @@ -421,7 +452,12 @@ class NevaModel(GPTModel, NevaBaseModel): """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): GPTModel.__init__(self, **kwargs) NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) @@ -431,7 +467,9 @@ def freeze_llm(self, mm_cfg): param.requires_grad = False def forward( - self, *args, **kwargs, + self, + *args, + **kwargs, ): media = kwargs.pop('media', None) if parallel_state.is_pipeline_first_stage(ignore_virtual=True): @@ -455,7 +493,7 @@ def init_neva_adapter(self): adapter_type=self.cfg.mm_cfg.get("mm_mlp_adapter_type", "linear"), in_features=self.cfg.mm_cfg.vision_encoder.hidden_size, out_features=self.cfg.hidden_size, - bias=True, + bias=True, # self.cfg.get("bias", False), ) for name, module in self.named_modules(): self._check_and_add_adapter( @@ -471,8 +509,10 @@ def init_neva_adapter(self): def model_provider_func(self, pre_process, 
post_process): """Model depends on pipeline paralellism.""" - media_start_id = self.tokenizer.token_to_id(DEFAULT_IM_START_TOKEN) - media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN) + + model_type = self.cfg.mm_cfg.llm.get("model_type", "nvgpt") + media_start_id = self.tokenizer.token_to_id(DEFAULT_IM_START_TOKEN[model_type]) + media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN[model_type]) if self.mcore_gpt: if not parallel_state.is_initialized(): @@ -581,6 +621,13 @@ def setup_optimizer_param_groups(self): else: MegatronGPTModel.setup_optimizer_param_groups(self) + # TODO(yuya): Refactor the handling of distributed checkpoint optimizer state loading + # With Pipeline Parallelism (PP) greater than 1, different stages might have varying lengths for `self._optimizer_param_groups`. + # This inconsistency can lead to errors during the loading of distributed checkpoints. + # As a temporary workaround, if `self._optimizer_param_groups` has less than 2 groups, add an empty parameter group marked as non-expert. + if len(self._optimizer_param_groups) < 2 and not self.use_peft: + self._optimizer_param_groups = (self._optimizer_param_groups[0], {'params': [], 'is_expert': False}) + # filter out params doesn't have grad for param_group in self._optimizer_param_groups: params_with_grad = [param for param in param_group['params'] if param.requires_grad] @@ -640,7 +687,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): grad_sync_func = None param_sync_func = None if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters @@ -698,9 +748,9 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): def training_step(self, dataloader_iter): """ - We pass the dataloader iterator function to the micro-batch scheduler. - The input batch to each micro-batch is fetched using the dataloader function - in the micro-batch fwd function. + We pass the dataloader iterator function to the micro-batch scheduler. + The input batch to each micro-batch is fetched using the dataloader function + in the micro-batch fwd function. """ return MegatronGPTModel.training_step(self, dataloader_iter) @@ -903,7 +953,7 @@ def loss_func(self, loss_mask, output_tensor): return loss def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. 
Args: @@ -981,7 +1031,10 @@ def build_train_valid_test_datasets(self): self._train_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) self._validation_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) else: - ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) + ds_dict = make_supervised_data_module( + tokenizer=self.tokenizer, + model_cfg=self.cfg, + ) self._train_ds = ds_dict["train_dataset"] self._validation_ds = ds_dict["eval_dataset"] @@ -1049,10 +1102,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: def setup_test_data(self, cfg): pass - def state_dict(self, destination=None, prefix='', keep_vars=False): - # Get the original state dictionary - original_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - + def get_keys_to_keep(self): keys_to_keep = list(self.adapter_keys) # TODO(yuya): maybe not hard-code vision_encoder keys here vision_encoder_keys = [k for k in self.base_keys if "vision_encoder" in k] @@ -1061,6 +1111,12 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): keys_to_keep += llm_keys if not self.cfg.mm_cfg.vision_encoder.freeze: keys_to_keep += vision_encoder_keys + return keys_to_keep + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # Get the original state dictionary + original_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + keys_to_keep = self.get_keys_to_keep() new_state_dict = {k: original_state_dict[k] for k in keys_to_keep} return new_state_dict @@ -1079,10 +1135,46 @@ def load_state_dict(self, state_dict, strict=False): logging.critical(f'Unexpected keys: \n{unexpected_keys}') def on_load_checkpoint(self, checkpoint) -> None: - pass + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint + """ + + # mcore uses distributed checkpointing + # FSDP supports the lagecy checkpointing or torch-FSDP-native sharded checkpointing + if self.mcore_gpt and not self.use_fsdp: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." 
but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + module.load_state_dict(checkpoint_state_dict, strict=False) + else: + # when restoring a distributed checkpoint from a ptl checkpoint we need to defer loading the state_dict + # see NLPModel.on_load_checkpoint + checkpoint['state_dict'] = {} + + # legacy checkpointing for interleaved + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def sharded_state_dict(self, prefix: str = ''): - return None + if self.use_peft: + return None + + original_sharded_state_dict = super().sharded_state_dict() + keys_to_keep = self.get_keys_to_keep() + new_sharded_state_dict = {k: original_sharded_state_dict[k] for k in keys_to_keep} + return new_sharded_state_dict def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: inference_config = self.get_inference_config() @@ -1111,7 +1203,11 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] return generate(self, **inference_config) def generate( - self, input_prompts, inference_config, length_params: LengthParam, sampling_params: SamplingParam = None, + self, + input_prompts, + inference_config, + length_params: LengthParam, + sampling_params: SamplingParam = None, ) -> OutputType: # check whether the DDP is initialized diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index f9d6ed5250f6..70dd2174a2b7 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -344,22 +344,6 @@ def create_neva_model_and_processor(cfg): # trainer required for restoring model parallel models trainer = Trainer(plugins=plugins, strategy=NLPDDPStrategy(), **cfg.trainer) - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronNevaModel.restore_from( - restore_path=cfg.neva_model_file, - trainer=trainer, - return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size @@ -385,6 +369,8 @@ def create_neva_model_and_processor(cfg): neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None) neva_cfg.apply_rope_fusion = False neva_cfg.fp8 = False + neva_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size + neva_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size # neva_cfg.mm_cfg.vision_encoder.from_pretrained = None model = MegatronNevaModel.restore_from( diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index fd32ac844274..44a80465c34b 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py 
@@ -69,7 +69,11 @@ def forward_step(self, batch, tensor_shape): fwd_bwd_function = get_forward_backward_func() output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -104,7 +108,7 @@ def tokenize_batch(self, sentences, max_len, add_BOS): @abc.abstractclassmethod def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length + """clip the max len based on the LM model max sequence length Args: maxlen (int): the max len computed from the context and number of tokens to generate returns (int): @@ -119,7 +123,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ context_length (int): the context token length compute_attention_mask: bool: set to True to compute attention mask (not needed for FA) Args: - context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated + context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated """ pass @@ -262,7 +266,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -336,7 +340,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -390,7 +394,11 @@ def forward_step(self, batch, tensor_shape_and_context_length): output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -406,6 +414,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c from nemo.collections.multimodal.data.neva.neva_dataset import ( DEFAULT_IMAGE_TOKEN, preprocess_llama_2, + preprocess_llama_3, preprocess_multimodal, preprocess_nv_dpo, preprocess_nvgpt, @@ -415,10 +424,18 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c list_data_dict = [] if multimodal_cfg["conv_template"] in ["nvgpt", "nv_steerlm", "nv_dpo"]: record = { - 'system': '\n' - if multimodal_cfg["conv_template"] == 'nv_dpo' - else 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n', - 'conversations': [{'from': 'User', 'value': prompt}, {'from': 'Assistant', 'value': '',},], + 'system': ( + '\n' + if multimodal_cfg["conv_template"] == 'nv_dpo' + else 'A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n' + ), + 'conversations': [ + {'from': 'User', 'value': prompt}, + { + 'from': 'Assistant', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -441,7 +458,16 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c elif multimodal_cfg["conv_template"] == "llama_2": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -453,9 +479,40 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "llama_3": + record = { + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], + } + + for turn in record['conversations']: + if turn.get('value') is not None: + turn['value'] = re.sub('', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value']) + list_data_dict.append(record) + sources = preprocess_multimodal( + copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents + ) # HARDCODED FOR NOW + data_dict = preprocess_llama_3(sources, tokenizer, multimodal_cfg) elif multimodal_cfg["conv_template"] == "v1": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -487,6 +544,7 @@ def __init__(self, model): is_multimodal=self.data_cfg.is_multimodal, sep_image_conv_front=self.data_cfg.sep_image_conv_front, conv_template=self.data_cfg.get("conv_template", "nvgpt"), + model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), image_token_len=self.data_cfg.image_token_len, image_folder=self.data_cfg.image_folder, image_aspect_ratio=self.data_cfg.image_aspect_ratio, @@ -499,7 +557,7 @@ def __init__(self, model): ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.cfg.encoder_seq_length + 1: maxlen = self.model.cfg.encoder_seq_length + 1 return maxlen @@ -616,7 +674,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.frozen_model.cfg.encoder_seq_length + 1: maxlen = self.model.frozen_model.cfg.encoder_seq_length + 1 return maxlen @@ -681,7 +739,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -830,21 +888,21 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ # updating RetroEncoder (RetroEncoderCrossAttention, 
RetroEncoderBiasDropoutAdd, RetroEncoderLayerNorm) if contain_encoder: # the first cross-attention decoder layer contain encoder - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_num_neighbors = inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_chunk_length = inference_retro_chunk_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_retrieved_length = inference_retro_retrieved_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attn_bda.retro_num_neighbors = inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].pre_mlp_layernorm.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.encoder.layers[0].cross_attention.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_chunk_length = ( + inference_retro_chunk_length + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_retrieved_length = ( + inference_retro_retrieved_length + ) + layer.cross_attention.encoder.layers[0].cross_attn_bda.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].pre_mlp_layernorm.retro_num_neighbors = ( + inference_retro_num_neighbors + ) contain_encoder = False return context_tokens diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 850eb3d5c778..722c493dfa9b 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -151,6 +151,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_params, inference_config, **strategy_args): + model_type = model.cfg.mm_cfg.llm.get("model_type", "nvgpt") conv_template = model.cfg.data.get("conv_template", "nvgpt") final_response = [] for idx, prompt_dict in enumerate(prompt_dict_list): @@ -180,8 +181,14 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para continue # Regular expression pattern to match the sequence - pattern = re.compile(rf'{DEFAULT_IM_START_TOKEN}( ⁇ )+{DEFAULT_IM_END_TOKEN}') - pattern_nvgpt = re.compile(rf'{DEFAULT_IM_START_TOKEN}({DEFAULT_IMAGE_PATCH_TOKEN})+{DEFAULT_IM_END_TOKEN}') + pattern = re.compile( + rf'{DEFAULT_IM_START_TOKEN[model_type]}( ⁇ )+{DEFAULT_IM_END_TOKEN[model_type]}'.replace(r'|', r'\|') + ) + pattern_nvgpt = re.compile( + rf'{DEFAULT_IM_START_TOKEN[model_type]}({DEFAULT_IMAGE_PATCH_TOKEN[model_type]})+{DEFAULT_IM_END_TOKEN[model_type]}'.replace( + r'|', r'\|' + ) + ) combined_pattern = re.compile(f'{pattern.pattern}|{pattern_nvgpt.pattern}') clean_text = re.sub(combined_pattern, f"<{media_type_token}>", response['sentences'][0]) @@ -199,6 +206,9 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para clean_response = clean_response.split("")[-2][10:] # [10:] for removing "Assistant\n" elif conv_template == "llama_2": clean_response = clean_response.rsplit("[/INST] ", 1)[-1] + elif conv_template == "llama_3": + clean_response = clean_response.rsplit("assistant<|end_header_id|>\n\n", 1)[-1] + clean_response = clean_response.rstrip("<|eot_id|>") elif conv_template == "v1": clean_response = clean_response.rsplit("ASSISTANT: ", 1)[-1] @@ -287,17 +297,17 @@ def tab_logits(logits, min_id, max_id, filter_value=-float('Inf')): def 
top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): """ - This function has been mostly taken from huggingface conversational - ai code at - https://medium.com/huggingface/how-to-build-a-state-of-the-art- - conversational-ai-with-transfer-learning-2d818ac26313 - - @param logits: logits tensor - @param top_k: keep only top k tokens with highest probability - @param top_p: keep the top tokens with cumulative probability - @filter_value: value to set filtered tokens to - @started: a tensor of bools indicating whether the text generation starts for the batch - returns the filtered logits + This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313 + + @param logits: logits tensor + @param top_k: keep only top k tokens with highest probability + @param top_p: keep the top tokens with cumulative probability + @filter_value: value to set filtered tokens to + @started: a tensor of bools indicating whether the text generation starts for the batch + returns the filtered logits """ if top_k > 0: # Remove all tokens with a probability less than the @@ -333,7 +343,7 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started def repetition_penalty(logits, repetition_penalty, used_tokens): - """ Implement the repetition penalty, check paper + """Implement the repetition penalty, check paper https://arxiv.org/pdf/1909.05858.pdf """ if used_tokens is not None and repetition_penalty != 1.0: diff --git a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 5da7296519cb..1a5321065fa9 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -12,26 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile from typing import List, Optional, Union import torch -from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin -from nemo.collections.nlp.parts.peft_config import ( - PEFT_CONFIG_MAP, - CanonicalAdaptersPEFTConfig, - LoraPEFTConfig, - PEFTConfig, - PtuningPEFTConfig, -) +from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin, replace_prefix +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP, PEFTConfig, PtuningPEFTConfig from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin -from nemo.core.connectors.save_restore_connector import SaveRestoreConnector from nemo.utils import logging, model_utils -from nemo.utils.model_utils import inject_model_parallel_rank try: from megatron.core import parallel_state @@ -46,7 +35,9 @@ class MultimodalAdapterModelMixin(NLPAdapterModelMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def _get_all_keys(self,): + def _get_all_keys( + self, + ): # TODO (yuya): p-tuning need additional handle, check peft models. 
""" Returns all the keys in the model @@ -57,35 +48,50 @@ def _get_all_keys(self,): return set(k) def add_adapter(self, peft_cfgs: Union[PEFTConfig, List[PEFTConfig]]): + if self.cfg.get('virtual_pipeline_model_parallel_size', None): + raise ValueError('Virtual pipeline model parallel is not supported when using PEFT') + if self.cfg.optim.name == "distributed_fused_adam": + raise ValueError('distributed_fused_adam is not supported for PEFT. Please use fused_adam') + + self.use_peft = True if not isinstance(peft_cfgs, List): peft_cfgs = [peft_cfgs] + # @chcui crucial to set self.virtual_tokens and self.use_peft for all PP ranks + for peft_cfg in peft_cfgs: + if isinstance(peft_cfg, PtuningPEFTConfig): + self.virtual_tokens = peft_cfg.virtual_tokens + ptuning_only = len(peft_cfgs) == 1 and isinstance(peft_cfgs[0], PtuningPEFTConfig) + self.ptuning_only_and_non_first_stage = ptuning_only and not self.first_stage_of_pipeline() + if self.ptuning_only_and_non_first_stage: + # There are no params to add if we are not in the first state of the pipeline + return + self.base_keys = getattr(self, "base_keys", self._get_all_keys()) logging.info(f"Before adding PEFT params:\n{self.summarize()}") - self.use_ptuning_only = len(peft_cfgs) == 1 and isinstance(peft_cfgs[0], PtuningPEFTConfig) - for peft_cfg in peft_cfgs: - if self.use_ptuning_only: - if not self.first_stage_of_pipeline(): - # There are no params to add if we are not in the first state of the pipeline - continue - self.virtual_tokens = peft_cfg.virtual_tokens - self._check_and_add_peft_cfg(peft_cfg) logging.info(f"After adding PEFT params:\n{self.summarize()}") self.adapter_keys = self._get_all_keys() - self.base_keys - if self.megatron_amp_O2: - self.adapter_keys = set(key.replace("model.module.", "model.", 1) for key in self.adapter_keys) + self.tunable_base_param_keys = set() for cfg in peft_cfgs: - if cfg.weight_tying: + if hasattr(cfg, "weight_tying") and cfg.weight_tying: self.tie_weights(cfg) - self.use_peft = True + + if hasattr(cfg, "tunable_base_param_names") and cfg.tunable_base_param_names: + self.set_tunable_base_params(cfg) + + if self.megatron_amp_O2: + self.adapter_keys = set(key.replace("model.module.", "model.", 1) for key in self.adapter_keys) def load_adapters( - self, filepath: str, peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, map_location: str = None, + self, + filepath: str, + peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, + map_location: str = None, ): """ Utility method that restores only the adapter module(s), and not the entire model itself. @@ -110,22 +116,27 @@ def load_adapters( else: map_location = 'cpu' - if filepath.endswith('.nemo'): - conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) - elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] - else: - raise RuntimeError(f"{filepath} is not nemo file or ckpt file") + # TODO (yuya): this logic needs to change for dist ckpt because after + # adding adapaters the checkpoint will change if not peft_cfgs: assert filepath.endswith( '.nemo' ), "Inferring peft scheme is only supported for .nemo checkpoints. Please supply the `peft_cfgs` argument." 
peft_cfgs = [PEFT_CONFIG_MAP[conf.peft.peft_scheme](conf)] self.add_adapter(peft_cfgs) - assert set(state_dict.keys()) == self.adapter_keys - - if self.megatron_amp_O2: - state_dict = {k.replace("model.", "model.module.", 1): v for k, v in state_dict.items()} + if filepath.endswith('.nemo'): + sharded_state_dict = None + if getattr(self, "sharded_state_dict", None) is not None: + sharded_state_dict = self.sharded_state_dict(prefix="model.") + conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location, sharded_state_dict) + elif filepath.endswith('.ckpt'): + state_dict = torch.load(filepath, map_location)['state_dict'] + else: + raise RuntimeError(f"{filepath} is not nemo file or ckpt file") + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} + if not self.ptuning_only_and_non_first_stage: + assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) missing_keys, unexpected_keys = NLPModel.load_state_dict(self, state_dict, strict=False) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 123f0f06a33d..ca5820772c62 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -30,6 +30,7 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import PromptEncoderAdapterConfig +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.collections.nlp.parts.peft_config import ( PEFT_CONFIG_MAP, CanonicalAdaptersPEFTConfig, @@ -38,11 +39,13 @@ PtuningPEFTConfig, ) from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin -from nemo.core.connectors.save_restore_connector import SaveRestoreConnector from nemo.utils import logging, model_utils try: - from megatron.core import parallel_state + from megatron.core import dist_checkpointing, parallel_state + + HAVE_MEGATRON_CORE = True + except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False @@ -56,7 +59,7 @@ def replace_prefix(name, old_prefix, new_prefix): class NLPAdapterModelMixin: - """ NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. + """NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. This mixin class should be used only with a top level ModelPT subclass, that includes either a `model` or an `enc_dec_model` submodule. This mixin class adds several utility methods to add, load and save adapters. @@ -92,7 +95,9 @@ def first_stage_of_pipeline(self): logging.warning("no attribute named model or no model.pre_process found. 
Can not detect stage of pipeline...") return False - def _get_all_keys(self,): + def _get_all_keys( + self, + ): """ Returns all the keys in the model """ @@ -216,15 +221,18 @@ def add_adapter(self, peft_cfgs: Union[PEFTConfig, List[PEFTConfig]]): if hasattr(cfg, "tunable_base_param_names") and cfg.tunable_base_param_names: self.set_tunable_base_params(cfg) - def _get_config_and_state_dict_from_nemo(self, filepath, map_location): + def _get_config_and_state_dict_from_nemo(self, filepath, map_location, sharded_state_dict=None): cwd = os.getcwd() + save_restore_connector = NLPSaveRestoreConnector() with tempfile.TemporaryDirectory() as tmpdir: try: - SaveRestoreConnector._unpack_nemo_file(filepath, tmpdir) + if os.path.isfile(filepath): + save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir) + else: + tmpdir = filepath os.chdir(tmpdir) - config_yaml = "model_config.yaml" model_weights_ckpt = "model_weights.ckpt" @@ -233,7 +241,22 @@ def _get_config_and_state_dict_from_nemo(self, filepath, map_location): os.chdir(cwd) model_weights = os.path.join(tmpdir, model_weights_ckpt) model_weights = inject_model_parallel_rank(model_weights) - state_dict = torch.load(model_weights, map_location=map_location) + state_dict = save_restore_connector._load_state_dict_from_disk( + model_weights, map_location=map_location + ) + + # distributed checkpointing + if state_dict is None and sharded_state_dict is not None: + checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) + tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] + assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' + checkpoint = dist_checkpointing.load( + sharded_state_dict=checkpoint, + checkpoint_dir=tmp_model_weights_dir, + ) + state_dict = checkpoint["state_dict"] + return conf, state_dict finally: os.chdir(cwd) @@ -271,7 +294,10 @@ def setup_optimizer_param_groups(self): super().setup_optimizer_param_groups() def load_adapters( - self, filepath: str, peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, map_location: str = None, + self, + filepath: str, + peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, + map_location: str = None, ): """ Utility method that restores only the adapter module(s), and not the entire model itself. diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index f50a467cf71a..e8f7009b791c 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -1236,7 +1236,9 @@ def dummy(): tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
checkpoint_io = DistributedCheckpointIO.from_config(conf) - checkpoint = checkpoint_io.load_checkpoint(tmp_model_weights_dir, sharded_state_dict=checkpoint) + checkpoint = checkpoint_io.load_checkpoint( + tmp_model_weights_dir, sharded_state_dict=checkpoint, strict=strict + ) instance.on_load_checkpoint(checkpoint) if hasattr(instance, 'setup_transformer_engine_tp_groups'): instance.setup_transformer_engine_tp_groups() diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 905de4eb3567..b95be90274e3 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil from abc import ABC, abstractmethod from contextlib import contextmanager @@ -29,6 +30,8 @@ try: from megatron.core import dist_checkpointing + from megatron.core.dist_checkpointing.dict_utils import extract_matching_values + from megatron.core.dist_checkpointing.mapping import ShardedBase from megatron.core.dist_checkpointing.strategies import tensorstore from nemo.utils.callbacks.torch_dist_async import AsyncCallsQueue, AsyncRequest, TorchDistAsyncSaveShardedStrategy @@ -234,7 +237,11 @@ def save_checkpoint( @_debug_time('DistributedCheckpointIO.load_checkpoint') def load_checkpoint( - self, path: _PATH, map_location: Optional[Any] = None, sharded_state_dict: Dict[str, Any] = None + self, + path: _PATH, + map_location: Optional[Any] = None, + sharded_state_dict: Dict[str, Any] = None, + strict: Optional[bool] = True, ) -> Dict[str, Any]: """Loads a distributed checkpoint. @@ -259,10 +266,37 @@ def load_checkpoint( else: sharded_strategy = None + if not strict: + sharded_state_dict = self.adjust_non_strict_load(path, sharded_state_dict) + return dist_checkpointing.load( sharded_state_dict=sharded_state_dict, checkpoint_dir=path, sharded_strategy=sharded_strategy ) + def adjust_non_strict_load(self, path: _PATH, sharded_state_dict: Dict[str, Any]): + ckpt_sharded_metadata = dist_checkpointing.load_tensors_metadata(path) + loaded_keys = [] + missing_keys = [] + unexpected_keys = [] + + def should_remove_missing_sharded_base(x: Any): + if isinstance(x, ShardedBase): + if x.key in ckpt_sharded_metadata: + loaded_keys.append(x.key) + return False + else: + unexpected_keys.append(x.key) + return True + return False + + _, sharded_state_dict = extract_matching_values(sharded_state_dict, should_remove_missing_sharded_base) + logging.info(f'The following keys are not in the checkpoint and will not be loaded: {unexpected_keys}') + + # TODO: compute missing_keys by: + # 1. all_gather_object of loaded_keys + # 2. missing_keys = ckpt_sharded_metadata.keys() - loaded_keys + return sharded_state_dict + @_debug_time('DistributedCheckpointIO.remove_checkpoint') def remove_checkpoint(self, path: _PATH) -> None: """Remove a distributed checkpoint. 
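A note on the non-strict loading path added above: before handing the sharded state dict to `dist_checkpointing.load`, `adjust_non_strict_load` drops every sharded object whose key is absent from the checkpoint's tensor metadata and logs the dropped keys. The snippet below is a minimal, self-contained sketch of that pruning step; it stands in plain nested dicts and a hypothetical `ckpt_keys` set for the real `ShardedBase` objects and `load_tensors_metadata` result, so it illustrates the idea rather than the actual Megatron-Core call.

from typing import Any, Dict, List, Set

def prune_missing(sharded: Dict[str, Any], ckpt_keys: Set[str], unexpected: List[str]) -> Dict[str, Any]:
    """Keep only entries whose key exists in the checkpoint metadata (non-strict load)."""
    pruned = {}
    for name, value in sharded.items():
        if isinstance(value, dict):
            pruned[name] = prune_missing(value, ckpt_keys, unexpected)
        elif name in ckpt_keys:
            pruned[name] = value
        else:
            unexpected.append(name)
    return pruned

# Example: an adapter weight present in the checkpoint is kept, an unknown key is skipped and reported.
state = {'model': {'adapter.lora_a': 'sharded-tensor-A', 'new_head.bias': 'sharded-tensor-B'}}
ckpt_keys = {'adapter.lora_a'}
unexpected: List[str] = []
print(prune_missing(state, ckpt_keys, unexpected))  # {'model': {'adapter.lora_a': 'sharded-tensor-A'}}
print(unexpected)                                   # ['new_head.bias']

The design choice in the patch is to make restore tolerant of state-dict entries the stored checkpoint does not contain: they are skipped and logged rather than failing the whole load.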
From fe7e2e5767940d3aa114a161072f6905fa3b8057 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Wed, 22 May 2024 11:41:25 -0400 Subject: [PATCH 08/47] RAG Pipeline (#9143) * first commit * working pipeline rag_indexing; rag_eval with rag.yaml * udpate RAG documentation * add image to documents * cleaning docs * before merge from main * refactor code to make it easier to support more customized embedder and LLMs in future * addressing Ali's comments * addressing Ali's comments * addressing Ali's comments * fix Code scanning results / CodeQL --------- Co-authored-by: Huy Vu2 --- examples/nlp/rag/conf/rag_generating.yaml | 37 +++++ examples/nlp/rag/conf/rag_indexing.yaml | 19 +++ examples/nlp/rag/images/rag_pipeline.png | Bin 0 -> 86786 bytes examples/nlp/rag/rag.md | 141 +++++++++++++++++ examples/nlp/rag/rag_generating.py | 49 ++++++ examples/nlp/rag/rag_indexing.py | 44 ++++++ nemo/collections/nlp/models/rag/__init__.py | 16 ++ .../nlp/models/rag/custom_bert_embedder.py | 145 ++++++++++++++++++ .../nlp/models/rag/custom_gpt_llm.py | 130 ++++++++++++++++ 9 files changed, 581 insertions(+) create mode 100644 examples/nlp/rag/conf/rag_generating.yaml create mode 100644 examples/nlp/rag/conf/rag_indexing.yaml create mode 100644 examples/nlp/rag/images/rag_pipeline.png create mode 100644 examples/nlp/rag/rag.md create mode 100644 examples/nlp/rag/rag_generating.py create mode 100644 examples/nlp/rag/rag_indexing.py create mode 100644 nemo/collections/nlp/models/rag/__init__.py create mode 100644 nemo/collections/nlp/models/rag/custom_bert_embedder.py create mode 100644 nemo/collections/nlp/models/rag/custom_gpt_llm.py diff --git a/examples/nlp/rag/conf/rag_generating.yaml b/examples/nlp/rag/conf/rag_generating.yaml new file mode 100644 index 000000000000..dcd86b1b220e --- /dev/null +++ b/examples/nlp/rag/conf/rag_generating.yaml @@ -0,0 +1,37 @@ + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 'bf16-mixed' + use_distributed_sampler: False + +indexing: + embedder: + model_type: bert + model_path: null + embed_batch_size: 128 + data: + data_path: null + chunk_size: 256 + chunk_overlap: 10 + index_path: null + +generating: + llm: + model_type: gpt + model_path: null + query: null + inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: True # add the bos token at the begining of the prompt + tokens_to_generate: 500 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated \ No newline at end of file diff --git a/examples/nlp/rag/conf/rag_indexing.yaml b/examples/nlp/rag/conf/rag_indexing.yaml new file mode 100644 index 000000000000..049afc1dbbfe --- /dev/null +++ b/examples/nlp/rag/conf/rag_indexing.yaml @@ -0,0 +1,19 @@ + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 'bf16-mixed' + use_distributed_sampler: False + +indexing: + embedder: + model_type: bert + model_path: null + embed_batch_size: 128 + data: + data_path: null + chunk_size: 256 + chunk_overlap: 10 + index_path: null \ No newline at end of file diff --git a/examples/nlp/rag/images/rag_pipeline.png b/examples/nlp/rag/images/rag_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..810ef254e857017679567c1547c9a51f29b6cddc GIT binary patch literal 86786 zcmeGDWl&se8wH5Qa0?J5xVr>`6C8rOyGzi2OA&gb{bUYi1=7cZW2 z#f1fwTyzc=5j=)3o?wnulLobFlkc#%*k%?~8JOVsHWkFlh!tXl;Wkg55egR+a|*O# zG;@-e4b1J5?-l|ZQs+zzO?JGOGR`>X=cY!=w$s4rt;<)oEz6LGgEXPD2QQuF(YuFP zZ)hTtzb}dGPqzude_vG6*bHw`|9wS^{Qp1xyBMhdx78rH8|5$84C<6T9#A3#%OiQ3 z%JMhGxJ_vIU+dlfBIn3u`@Oop=_#*SI{)GfQfIC^ls=0*eR+ZQ?>(a(*N33~ZU;#f zZ0G;`$AE=AYh60smWBF?ZbIPQa7;B=Drve}D%Kn~!{66>8J z=M0N3lEdK+kx*RTM%X{{s$HZK%ILthe^=}LjvPUyUiz|m-a&+$k)2~pS=Kcts+0G4 zt_t6)cZB%9Oh3r&d!9?};F+1~%yG4##|N(+v^0fC>!8TQD-o-jbYB=}Q2+7Lw9Q%f zSmxVx3mguPMBxjKC9}ms))Z9-wj-$n0a!L^>~F^T7qvq6h?*a;rSUGD3_=KXYp_fr zy)nt9GZydfN`GTjqbGhIID%jzgDwlgr}5oaIWM?~kKePulbpPhT^-H%Yrl4Jo1boU|K~E@cr@5I*VLMKbT!hU29L=DcXH}G1zn7{;N>3;!Eq7<9UAY zh!b5h*E`QV>y8F7Fdx#<8>@~uc4)n>X60jUOhUTg+2gl(i(vRO(bjNYLzPuCyYkdJ zvUL0ftSP*6rxL=JN8%UG=OzaEUN|h#ckKgWjo&7HWFNINUVY{gTvTGpy`|38n{mn zqqP5k{5yU+bvLST`EZ9?jqqv_M`yT!Ers&YTBz1=3nsR5S6%?>ukCG8r80&Ztu5p* z^5XqMj{mlYKY*X6$&SD7%?mYA;dJ&xl4+LKOK*V+&npAD`~Jm4fp5*2#n6XO`_UDb za($2FNu2XA+iSgjND5S;4p!(rD4Kbal$@D4s(c>TP8Rj#k7L9y)lR#muG3~1Q|iOy z)7}Ex_(D=( zo)xyixrV}~PN0dpY&XVy{=+7fQZLUpMR~6JKE`LXQG(zLgaU!N-0cg~`#3 zx!spaozSz_E7+;~;&NB{+Pdz`<=2K7vn!|a>r*jy&ks-e*=~AI$gjMT-eb{;9e*fs zW^NbcGP-Zl4l0*!_w}gDkP$$7bB-1nO6Uop^rdj;pqzBH_%; z>_aD(kqVcm;9Z=B?CY=0o+y7xm7pf>ZCo5HSGS}7nmp(=>!4Q7%_7=LP@_n=<6IK1 z15Vu!S9@c*7`wqkLq4xR>vL~%L&Qbg}%gKZcVeWo0>%7{f z#xSqh_HVQr5OjLWz`}p=&5ac*VkGxm!%l{{ZP~#X{95DFV}Zps{#Z@`HLRFU-zv`u z&-)Y>kq4HrMDEF=$a?+TiI*J7f)i%j36PcV^&nEb)E2_XUESO6cnUlBUBmOzi{N9@ zVdmRKg6>vsDaa7&lI$lp9l;uvQF6Ko;lD4H-pkH_ace5mJ?23c?8e;;zwl8_4#`JBn_Judvwt}VkK`=!*ijak3a@IsXNFs7s5-uBs zJAdITAYdm2}`qwmyyFfo&jFs3~MSD24WCmyNyJI;r zft&Jk)?NFGI3uUC?B_$(eLn$(;Zkv2t zXN_p)m431=?P}`=VMmyq-g28+_-?E^N3UTeAHz|QiY}O%BQ-*F9!BI#Zhupj;l&75JO3IT{G|TD&YJV> z>;2SpT;z;$)m&af63xDnZzy9UlMc5>@^rW#UwAIDd7=bDoOWIy;#9wR8Yl^4@u>Uj zJaN_73`raGAx?(8?fm62CtvwDsIRR~zK&w@;UUc=PGXgWwfS3KBAXVeRdJ2-#x=jX?uF1D_8R%?Z=CO3-Zf;LfxiJ`W}o*%)DKh z@3dm#g8tfkR$33JT}y}kCjl2OLLSQJE0Y@es)8>c7G6e>TVy+CL^8RL^_r)5aaV1lD+MAB8$>I8?Xjnq8j;_6flDVZ$d|vON>;zEs zQzCpG86GQluBrFg7>P)}9$AJTAjK#R*NJUyKN#~I`+dyA(F?`!jKLzyAVY1w{a`tI zfh0NX2;0d2c)rC6P-3`!r`itqgMoWmFmsy>dxpxyZ)N`$7V;x&Lr?<>qOFI1bwNno zajudT56w%--)1uvPmrpM>wm=QkpG?eemm%EmC~axGHffRZ+4UC@%l*pERidU;7WDB ztj&v>{hmbUzgLs|b9v3@CEMYA$UPhSnRC!isW8cCyXlAv5XI_3YlT!9O~C?xjbPfJ z2a76nvAq=mT7PRq4UEL`jy`115w}g6)uttAKmQeog$mC$5=0f^L?|V)+*s&XM)B_h zv(h?%;Vt=r?(c?OhUQVx2#Ipqei)&j1?LxZD}*mUDJNT9=ic2*68?ytc_TiJm? za-l=!nFq1^9Z-Sr|9)5aQuI~e{mq#@g%Ddm>#DEEP*M^$#(#>~=!a-MfmhIQI8+v! 
[... base85-encoded binary data for examples/nlp/rag/images/rag_pipeline.png (86786 bytes) omitted ...]
zS8pwOUUVUv0qZ4p9=-3Z)`t5ABDw(;gN2Wk?uFvOB7H5Ef`mKc?%0Gw#=TSFE%3mo z#OeoDC@ImhsRitz!LtR`RvYK>z>jrnuzDN#`16C= zo%?p~67KLW!#rZOMX}maQYECvyi(WSE@~_}f`z)(cf}n@Hcy#yEq*&U>P1df+D!O^ zAG7?jp}lf*H88yr7(=W}jKx}F5X7ulBo{(#XtO$_MKpGqqAa$HoWldxxbj?S4h;i) zU01{vPO6%~8!*}N;nukvqVuyKYvB`@@MT%CQgElr<%Rx>lR~w;NFWmVp;q~UiqB#B zPWih7g4%NepXayy(?0yR#1)Pmn-rTz6#srPa_s&Z&PdOLju?B-7mZLR+3qyEAVrHG zdeAaxmfktu>yZiz@Fl3>copLK{975H&jiO1;-I?uH;e4XXr~7cpK>Fi2pg+Mv-fO` z+_{*517DT?_*b&7drd+@8`2dx&S zGT0|nyKb&|G#0}R@q=PpMAiB6MYHUX*jDgtkz56J!&Y${gX7rnm4ynl-lC%2DSe}r z&8l}Kvz230i<=}nK(6~(eqy6rU@~oIdZrDNaSXy5Bws_;p`%5tdQpH4c70w4r+pKtRa=xqyD? z3*suEyk|znh_ZyY3TvCo`T;ko)KWQ*&ZhlREYV(^o zJAhD?_rtm&g1MH)A@c|6z({{B>3Z<2RQ&+L1Ah_$%E@0y?I3mJ&`mD|LsLp#6&H?OHOr-;5P9w(AP8cW$>cT_`wpQ^EOn%OO167}t1WI?xQm?SXYbzSeILzx zQu{l!B6)X(hu0+*`8Kf60DQ==`Grm8MJbb}EcVz)2bV@fUoBUDBxXN5 zX0%^_b#Q#UO1=>md)+0Mv<$8=uy1VSt=qxx)m&@NFK)LrS^w=u*_2H@le;;&p9XQY z5!k&aLh;Tv$V6NKH2un?gDtn4xB=#9(4?GSB_OVPd>rV$5$HG+2UCrL&gaM2%YaZO z{Qtv&y-I&=?H%YhfE18t_{+WfeJD|@UEzg&Njd45&0a?D!V zBCsBGbsXtS_el4alYIR-ij(Q38+xMF4+)eh&DzGFLmQiFncQ^{n`)eVC5sr$U4N^Fa;DrjMHE{)q?;dJo6Bn5I9D~>cU`h3 zA+~-WWg_xFV^K(7_$}Ifs)oDzgHvJ!+9_MrhZ5^^%ee32U1N-1Md65=jULe1SPM(E zw8iGbYrjENSmAWiS%DW7C`)xv_NZp%?dsOGvi5IL1xe-q@>y(W{!p1}Xc)6;(?Qc} zX(jdbX=2j(B95_ZZSQO4)^Qqpp`E`U0S^2(snq%@_lNJh=872x4R5mxZ7k9M(Io3X0MQk*(pV?L>9l=SEloFR)1Qew^b7uPe?66c1Bw{)7r<4I;^&(G(ng$J zSNfa68R>u4!7`5mGfac%@#rtv&=fA0ehl5mRSt>N7$xs;gd4fAFwXBD_FOMAHdpm1 zejR`MD*J|m-b-2Ev(U8oz4W1FKYTR~w1t{~&&hiq&~zUG@N;4!Ds@Wge=O`f@qb@L z{%34Z{`W-c|Ehd|aox?~9>}e6gpy&g?trH|6@-1!eebrN>KJFqXrwh`^!v1wx3>Tw zRNFtLoVHXs{ynPfbR>ZBUshBu8i7*Pm3wsq`DU&D*@wQp*{wo$|CwUG+I`{L8LLtY z?e4`GsA#v;Gg#M6OH<16NP%P)raHGbQSL_CW!krk0}z2&|H^Uq$GlGI)E9b3&5>aE z>3Z+>zHhHxy>3mQWS~=>;T|pR?Og(9hXdiuzF5*@m>+pfyTm`bl60HkGFzSe@N#@o zoT~@_ZnK7dn=Im>Z%60xWE2*jvH7QO?aYptKzc6&u!G(tGVkKt78j)AAyiaXkC~G) zekEULm{9c_$MoM58BZPjgvYi)Q6Zb31;!YFM65smLv6uWn(Z02D&lk9*u_8(`kjA$ z?i;lsz~AO4^qhKvfqspE`-Kj$uY8;2wziC#gVZmBF=rEO@1Y7joq@jps)S%eLFa0rP-nOeZ8ZQOm>r~f{*|B)E9jPkttP1ywWHJW__YP#RPW{74xD5s7oY3E zYji@643qI*ySDBC91~tN1ljGgbQcNF+5)hi=t>S0PQd)45}*7g|nc0oI~e?^0ucf{I?}CRufu)G+SH3uuod@klKi0$+~~7Lb;;~F8$XD zD6Ib{t)u_Hu0MnDf7BxV|NH#^8-4zFWib;v+p85{dUO=4UGEt2-Nwp;&6d{T{7 zA%8d1*7(?YA^Z*XIB&0hz+McG5;7_+7oHlSKF4EJ*C1kjD*}FwketD!R3hJAtvM-p zu<2=ay=#9^M^Sj_Ca{s4*-WI=L-50%q%|w=En<|Rm7R(Izn0k-;+Zk}OHl)*Irm1I zG!ozm)T`RNE%#XqdsFYTnwVCkGEr#;h~&_vY~UE-RaD^C&bui^Tz4bYe1~ z<}|d_G0~hk_e&-I5Ozr+HYf7r`PzIod8_I2Kdl<@Z5W62U=(q({#$!f4l^me@PP|7 zt5OHD!@#VG2&Hi^ew<+DekGP9SloE=i+_Q+Q29S%5mW1tAiHYXXr4UFeG=VNzKb$q ze3I~>xzZ8#)-7GCC(B-fIvZ}z|MV`3`kVDY?4q@G1IFKe(bt}NsyZ_RD4ls1ZwnN* zgWj@Qzl*f~SJt->v%>uA?Ahii<#BcBcMwPe6CsMB#eyTJX*J}ABRPYIKp%0Cdy~=E`CtYgYvH^@i#JSFR|olJ7?FL z8;}Qs!d3$t)&}OiXhRm>gAdZ)T@*Cz4tB^hXt3(Kz65;`D-D(qcOA^<2StHTu;t#T zHszG9@0$W#Nz=GwN8t$OL(XSC|9n-(JOz_NG=sI$#dEr^Ph;^e-!_h4Zelpfou4** z`ezYKvJ)W$@FC)7E$6SlNt{2&NQuCcz1*R5GT*kktEBQNPO(uBzol|Syi~?NcXJ*o zkJxo?inN`U6gfChFCWr%mvke?K58x5iJ8rdprGO{Q2*rp`dm`3HqqIkYq_oSW=Bk~H5=%Uzmv_)h5&1KAZamvF5~YO;{Lbem!01XvKV92y7X|~ZyFSv`Qel|i-E^%dU}1- zwMj)xG*!j8FRwS*c*Jw}FW>mSCRSx2wO^et8M&^VZ-mKCh>i87mFQmrx8QE_@Y8+! 
zuB~w>G&A{Lm5`Z5W=hxC(Rinuw(WVR$hC784Vk8MWUBq(po^38=DH4VlTzZA&AIG1 zxQFb<{JcZK3Ngj?_=10)FRNxDORZ?>sR-ck=xl3PI%XSh-u8`5-Qt7ibo1@7iC|^^ zo^Dtwc(AE)OyqT4;E4k=)%$pzO0g}``*X+bD@QWca?aNN$hR=4%6K<)jZ*q_zeGi=i~H`G=|;!o&X+_9i4a~H8ISCRm)ILO=Y#5d2z+s z6eqiK!%if5sfVo+Lbq@09DJ2?Rn00i}r^9Sn~=6mEXb(b0jLowf7Y zH1_)>4R~Ne{20QgWD3H}2u^%6R}dxE_EA&KbPL$o%=%swGxbxl7Q;4%t;*@HHoqDD zOb@&VPs#T4qQ=?LHBgT?8#gT`*Ntzrt5*xnE_pbv&q8WMf7GmF!j%yPn;xl^ug&|cb>y^i0`OSU0 z=IHVV-+|L*Jd=2V!~W~w13nE;Impu?BwKZlfJKJ!4@GNBNbp#V#~Y@Ga~^g*Q^?9@ zDPAz^#kN`gE}G0{THX_j#jr7_ZgOJRU<&>pB}p1USK4Ydr275N22XE#oQNFG>itml zdj3zPj&oPLEK8}xO`Z>cTfN)ZlK8!59>jw$W!NstTU;qp4;;<)sz0aN^Ot_*)KPAG zIKZ-YO4an;JfitCx!admuY{9quVjYEmx5V<``&)ia%JuC=&TZdYe?yI!Xf`Tz><~x zG4hWlV4WPqSV_W;yH=1}DX!cYpk6mzl?e4bmAESY!2?z=veuDBN7D*hGoKG8)+9Nf zv&hpqx2hD4pokmi$PZWLV|Ml~J6#b=_m4Y0o1Zj$=1jU>Q(}KI@izZU6GHoR!%D$p zu}2Ivs%{X)drVn-P{&DTSw?(E8Nm(XfDf5$~c%4vwn9>@apf+MaQ~d5mS6TN29o2 zbR0NYb7)xW-Q#p<+Fheydx3k~lkZrnEI4>KN?XLNAVwReVKJtViPASbvHhdzIVqkH zoJTpipJ$R>rWQxqzUCcml$qt&2KW{EtJ>0*ZEy|^AFTmQ)6BrQUzxa!#2by{0E_jQ!roils@m$d!Vt|zZ-?s(qB-T+S#ldMu8+<}>6{7#%u4v# zt##0~lRnms_RGi=m}kA^VBc6SW0A<* z{4VG)h}>c$TK-VN$#Uo4TUEaa3B-;r>NeB^_komaB@ffxA3*87s>W#w0!IO)Yi9W_uQyy}+hG!!Pf1EZRDP6fZrc z+_~!Q8Q5oY7n9687F5k#qGCVg7VQW;q45&CrZ3JptX_X@i%2VrxvKwXk4dr<+Qa;K z++n%NF|$8y8d||s75J8lCcbkgLUM>+OLOOJ&(xuo@84gztx<{bU1Gj#TXjp~7LnUc z73$?ekBBnY=I*5@OFGi%C1<2QW4k^bf8ZB)r^FLsAnVc?yXNLCL`hkoLM+D-)1x~0 zKzPNVg!j^qnWS!!XYMh>XLPS5$+c8yDna z@Uz@BN538^9g!v|!{#F{A7%4ZN62z*3(+lOIkn!+$8?C<+*+c>cX5&H{G#{#LUUXbysnZs}2XktQD;T9Gfq7ji2n7rQmOmg0HgZX%cNTxw0a#u)E*rdwC7`BZh;J;%K!@W;H(Q0yIt z!Nk0C86}ba%e!mMgQt5}qmMpAx8~w2rR6{+pft7fcEN)gVUW&#>5)vi~KlrU$BEhU_SD=vIMj^B-q>%0^cM2#? z;<9m(C!3LH5IwrsrKDA6L+^+s4Yf?IId|>J))!&aB&3^${U_E4_6} z0wB}Dh+O~tZsjg^?C@zX2Ja68&jREV%YD!D`F^mfLHTfo|Gl}ZfG3eH%6dtH;3Lf} zcwf=!u+x{)jm<`xTCBGan)!?p}_TrF_WYopJI&qZgzDM&R>A7EEt}x;klVhLO zARpEKfUNY-Tc(iwj=f{o77BL=+)WVF8%uYWWGEHYX@0mw4DIB!+Oy-^-7rbp6j9l|9w2nWl!%XUeDwWYUsLWjNraK`O^T3PSvLW%J z(g0?$Otz+dWj)yNYADeMJN`&D_#~*SE!3@fZU```o`~n^4DMm;iYD>%G6>zieTFmU zj`>$#f|`SL&=xGJd1rZKjPHk_`mK8M?I9syAob1Q;iI_{)Xs)`>8j0ZX@8<0(>n96 zw`SG1)+T&1|GyY!#ydm7`9#`m~0IhNg6pvi@yUhVwo;u1Nu?EDi9^Z!=2nJg-Bc3L=j%^t(&Q zN)#o3+w=>-0ecsxr7HkW@;ierO7FWnfnMjwLYB8gvOKdX0O$Eo$i0`m?dU(F$Wsla z(0NpE`zarb$2j^vIoWUD*Dvc(ToaVrcAQ0S0|CmZx*YM91l)Ad1RNQYkXo1Y+4H$^ zwZC*FhZ8oUbrHISw0h88)OJeP9gAuJzldA=mN4Hh`Pgv)@wE{>fU_NaTy{1662NL@ zp8D&I_oXn+mElH38+-gHT{6qpzwhg-Y-+)oK!GDENT(~= z$xZt!H{{mePQbyu7jE~w&Nt*$`E6Odm#rl2&r=N&!f77#d~YQBcu(qE;)~H|1`@dg zwFHGP)MJpIpMmsQ`8IlksW;CMTmi|6GXUs-X?D6J0uDNQViFQnOH@qMC!){Z8NB)O z6NG0Gbe|lDkT-#ml{IwurjsQ#xmw==sUFR0eTQEFwHpORsA$m&y33cZ>70Z6J)7pdC2|N|^l_m3id*0_FxUXvz254I3ZkiVG@Q^a_ zw<$p=FbZ7qGI<+RcL=CfC4V_KCAC%?h+d=7e-$!AUW#AdYYEjXu45CpnQgs5)kjwb0WK#r_kF^G$mBx2-5Y?-L)xp|-Us>H+sj1?z5lOj(Vr z-WMPZ42C8O>*v-#VNFTvSHO+xlnp9l;l~;FC+B~r?D$(C;-}n4ypv}{l#eCNdDAi! z3kD$Yo!`|@4t|A!N$Oi@cNIz2{O6l|W8n2ONS4AaGu-!T|IE#N>zi9~!gdQNraEZA z$wBtSR+hhHuy4rZ9IFBBbW;s*`Oe$~2rLS+=yN}!77{Xi`cts_J`iRem&zqP0$`3R zbWjS>{)U}1_<>CYzS<#+m6CRQwSQvq`Rsoh@ZQFiPVnS!Xmp(9&nrA*d`KV~{x_(9 zdyC)?Z=j%L6#i)R(1@mSD+Bi4xtY26w*G-Egt#1NY)i`Q3C~aL( zvYT5O{P~C2dEiHPh)75Z^YeRbPz9+Dk@i*MXD&N4IRHu-3n!-%q}CSn?s;bO-Ta0p z-MlJexe60aZVsl@goH+%1zJGKk|ACCpBrj(h|J%4c2E!^R=81I=Tl~++w;iE9QdG{NefI#+U>HypjTJ+iO5f6{r z^~IV*s`(cR-r_PvrfTrM7^FZe%xIg>ccD)|KD)ZldmO({Kp4WVQ!}ms(#rwR0)-)M zfD#8TI?ew{58Jt}{CyWz^UkI?24LtV)^*wg@^1Kk#K%&a3K6A9pcaEM0`S%Lyn4Poa>fo3-2#_^fdQ=cI(G)x zVL%H$UD>UVr9Q{iO_g0&C&*_xDGy|C(+I|+PT}`mV9=Fhexxh;5(sv%a*)CNuGL?! 
z=}#DRJK3yMeNwJEOLjVYzpF~^v0c${x|&>cxsYO)=}gY`KEV3Xtb8JOSB-B^1%ONF z1;DrtzM+*=uo(UR`3wn%hvO)`Hay}ckPpDdWDLL#!3@Bb?9yX(AI5##GzA0u>=WO$82HR*lq1Bp*6)S0zCy4}Rf8H2oGsU!hoV6l3R?~- zmk3>iqxWg+hvDa7ezwAod9>TYwNcvJIEJDYgs#uj0~?P8 zvq}4vuQsLXlp9al0|D3~5zBy-u?U%)_7<^OXAHiRSTBuFI>=8VVj|^L zK2sP=1v#<#*vIYYI18#V>bY;^2=7#lZS$e1leVJ=L^n9&qDbc4B)K1KFhVY+@Dboq z2w3L>%1f$Q^abD({26mJ6nbnx$YqQIU`e~hOWJ%+lah7j^W2kK=U41#C61=zVahS3Z4;~*73J&Ab~I)47Cfz>MF!ojUwj`Y zZ>ZTVBu9_RZ<&U!-hpL+wcZSxjW}&hns1EeICG-VJ-pGEtPx+aFD-utoF+4%Zb@&yG26x<&|{#Qf`8E#SpRn3Jdub$gV*ZF3z z0El#8K+HR?u(!UIU%Cg3fH;=(?#PzXwM<@$z3e`c$@k}t0JvRe`4X7=p#!EotneKn zymbyZfGBKqHvF%%MU4>7Z12$7YlCPXJCQa7bje)X1P4L`B;F| zz0;1KvH4%EP0)LjKLSu^3Z#0jUpX31=o^vik92_@lYx6L>y#^RQ10sIY=BwPFO)4Z zOC4dCnFa110WuR@be{s;LO>pQ%tZ~f6%{7c6)*Am_c5;LX=* zV=X6HVNS4Dq{2Jdfp(8vF-hAAVNwI!uy$bo*#wZR`f@+Y2j|=u28481oiV*yu1x~4 zzrxID#{fP9WQe+a{$oD>4PZT<;)Y-KU)n+4pM#3o1N!?o?ImGf0 zf-&C5=Z(eMa-u7r15yr^WjshJ4V<6@2ang1vNA9kFbetUoC=Z4`Z)RhniqXL^X?-iH%!?;l%wmACTKJ6?4cs-~irJE{KLKo?0;+>MCcYI*KGn@6QOn$=uh3$`_ ziFunr3Tug39J^O;+MD%r20lFP9*x+$6aGq`xws-bD>AAR5R8i^uvxBy7@XT7v5IA9 zWwD!qs!3; zrJ4jVSl`(gw;X<_@o@V6Rn5(%H11+cTjOX+^*AR1sp(WKX?xA)>(aKLyuSJZ- z)Pl{vRWd12l6YsC`_#W;fNPl{V{14M3B{aF70$LgXj#3cNgBcPHvP>Fy@$Fa#?y0G zKjO>|Y-QFJhjvK3Y(~G8`}+$#@6?;>EjDxzaOLIUd3GR>>Mnl zyhz8aa6U_2nO%L8V_gG6VWo@leqrLnV~|**N3V9+3M0eIFJWE}4^bvWPn|0=-P~mw z3-w0_>Dpedx&Wt!_@0+C58=6F{%B3kF1Sv}!}Vwee|^a3IIAc7xiXm<46HATs(bA6 zwJO~s)WZ85GZG_tjR1%}z%GhUaaJH*FSJHa{no8C;^m(-!w!!cO=g;oU@)634Xk<> zCZNtdMD8e08>Xy7{5jpK%;ZIQf0|*KHtG zn2Ub459)Hg!>YyLtk65HnpX~N zvT(-QjsS$Kx!$^q5VQ9Y#IcYSuTGIQHB5(PJD!ZZUF*2nw!7=_N~ zy%O+)_COQT>sLw=&m>D&p#rq!%2=fH9!RmZ;1AZyH9X?FXuz6`N|8ATyLw}aO|zF~ zEYTMRKy=@j1AgxEGIa!5Fh7n_BRTIpedE?M0iRq9wKSE6xu?(Tlqc*yLBH@_Q;t@c zr|q`d#-(s!l@iUJ$=kA@_T9}9+K>uY?N%KB5}6xe_;1aG>0VbZrVuqRG=_IAH@t98 zJN%P>P`Z9rE_ZWv&Dsy{xq+tgDVo$pu;xQZx{YXzpAyRbajbwvo-W^jnIz4>U|9vKdjHfztDcka#qD64xCe$~tj zsn^@SZ(4NzE9$78XWt`bjW0svgm1-NAE|lXAlDqZxg_$8H}x@)Bdx}RK`n1E6>k$q zyTP&G3Q$slJ4wQVk9GI1Rc2H=OvS%vGR1cK7nAH!BibA( zvvh0~3*tto|Cz$i@kuX4F4?$e%gpL?p@P5Dau%T<;mDlL2g-HIF-3uDG!x^DDp*_hHr~MHf5AH6s18!9(3v^E=9O#~XKlKxqk^+9a?Vbw;T_3P$RnAdiYvESz~o4wSDT}r+RMO%IM zyxb2{bFHmP@Hy}Izt!a?kMYw-G-^FfYA-rIvCg-Q4neB=E_yhLMaNT=aY!v3*7kvg zCZDO|$z7ZPY{3*|pbLhAUQHzWz5`Oq3|72-7P)~*D)m&i&ed0S{<8nn8oV*s&FiMQ z@r^B1Y5w(QNsD&Ys}>{n_CO0<(i2=QHPjb1tMd-6CTj^bax4N$$H&ISsW~H2Lu#3o z=gLfv&k+7q3U6hP=yOV{n=h_(KU17#K&wVy)ysMuhz~r4H{iuD4pQb|a4H>C;asn2 znuJhfeA~dg>>vp1GrJYd1&v z;ffl!*D56wr6*I|pwkr}QuN2?{-&X=BuprgFY?u}ON;{G;tctqw#W7)RF1JATNrsfQGvM*s zik9PG1Z*v5`JaZ;&XZ*Z=|&A9E-5qTxl(7If!=!(Ex$EOhPIt$qass8KV{_qhJKlr zBaLw!MDAu2dR^ydKb**Ob$?V#!l?&4z3z}}Se3@zCfk||*~tVfSvLqxV4_5?i_L1} zj!+j;qZ~XYykOlp^W9=f(jeqZiBNj%-G}$*6yxC2A4+vCMXs=`osBotSXQo7+4!f9 z@yWHVl1tU19h-!vpK5wNle`Ofhwf;Ytxk(@BfTN{c$9C~<)Hq~@~U~NpInL`io|L%r@z~7g7`IDuk0?S~V(T$HjQUOnknr)(eycRJw!untTOCd5V#fI<`(T7i1B4 zi>eEn&<2{%tH$^v6mhL}$#{vKWE+ukHJ< z3SG}-CE)f{@5k?2(izRPwOL|*ehuC;ng`pykh)*{O7zBmDx=A)%%LK+tt9%H#Joc z%BPwitvx&FL_;4NfMh!oqQ@c+BvQX~G00dPP2EpAb55!`Zj$@9^Eya%OyoT3yJm|N z`D*?4z%Z7r5PJR0W7o7#D*4r0O=T<&6Y(`s-8tgT+m_hcv^WHAz`_hG#BxO-(n+r- z()GvQJnl^0NR60*UPVr;2pVXegWRIlCihn)8LYp4Q)kyFZ#m4{`t4~d=SPJ;pNFM{ z=lSVk+{{N*m>+~+pk3nAJdBzBR@Rwhf3e?f*`L6rm`s}u<4}ToR!N<2#|^dE#;p@V z_!T#;@+dc_RIa>s)4hza>1b7Wst?Mx_BN&1MVK+BvdZS!%Q7-lx*(u4H#C{J=;$oA zvZiQXRLLnWRR2V^79deE)!3`@U)?V1oRF&ziEuJzC8$?RdyPkt&h(j%Bp>#fZI4q} z^-uVAb|7cu(+OIx_bDay3!3>=ws-=Zj>YdOrdA(x9yM#f5IOB8vmstJF@yP-Sd2ou z&6W+Mw?+iw*jm+32VNqC=Hl-@eftYPEf(4nAWn)m@tfasFVd>#?XQQ>N_e*ToM$RS 
z+m=5nJrL(oesI4;)IjC!pcMPQ0_n1y<6X^ft3oZ#^gpLr3VMuC0DiaIQ!cBsuquJ*#S^D_o>-gZ$J`GEE9l=-qxWaesq zEO2A)>y%adUSQry?k@vFN5mbXm+q*#Hc_Xa{pc)A-7o`_vYs9KRlqehDo_k%b_LD z&R*v`2E3Ht+-y{hVl6W|KRL2{0z5?Y z6W#FYUxBkV=;7yKCe39Ce-;X{^a7RilTNnXO-H**Nk4O!P4|J^JYQm0U@YWu#^Rk8 z+9aRS>q`@&Z|E^rvb@@(!BtpgbT_cYn8QIC>jzEfk=Mv`a+K7`4mX@=npKqFjIgsy zWjazIMaO;#?5ZezD0a6nW?f~}!n*X#t7)M-V6+rE<&CwQ%!(jlRLOruYCGf52usfj z44m=}Vp3R>uU3o2m56v@$3r$}!25R>gOlg=!Hm74mR|~A^KAR;$bDgsdo^LiGRP6) zuC+<){C23z4NZoc9N#t{$uIo0-emOcC>|d_S5T{KA7)*E64Rw}GFe(6Wk5pxk9HPw zV#HXed#th!l<&g|*5lx)@e2$dC`CKjh{; zQ5dS&oO3TrojYiX)A#$3QFwsX^x*8a@AdtP=zs8nU&q_?X=kAk;uYyJid};5RCOyF zUSVHp^-S~S&&5!qNy5z!SQqOf*{?-480+f6@?^2anWFZvKa0+k2h!Wp`b zY`!P4JR|0A=_BZMQInr1$T%gP;j;HWMy?+nHz)RPsY1$CNHGB!;YI>GCaN}D-)J}v z?jAkf`z4e8;>JYMb>fdaLw23-rbnB!Cw;J8yZ}a)D~|6=8gm8pY!8V4kYP z656kT;MPIj%xeD3i30XQEBPw=L$cy=-jn5D2J&ykuzZ~A{)b1ljL*k1csQ7@tBDec z&9nv7p0$@&FKN{9b*8uL&1$a+zn?9(&ld8J(=0+iRalnzx_(!cMUKs@%AYApeCd1i zq-7A24#Fs0!nzFFkbxRUk|U4Oa-7c}c}kB@{n7oww7R}Lq0VV{b}L3VZ*HzNs@ z8|ZK~0)wNZkhXh@2_hKNvnzrS&yl(LR432TZ)ZO5sxG$;R>NSWDiE=yyi~_8zD2B@ zoC<@3@ktdU-zb0chwb+)WE~imM0u`F6uzTi3s0OZ2n?(}^+*<-dn}E^lVi_S6DIU{d@6Fc8}F3q%LBc`~q&@L7L-;0++bURc$jA)L=>9GhSAb z5fEypH8VXl0eTdLYzqz*f)rg-TN7yXBdm_+a^;>Fh=F4>xPX7v!QcHiF7vpEEv9o1Y9hC%&AC z9bW%r))?^Isa=#N>4|?rdZ3fHJQB^SRTo8@H(LfBZ=|G1{5(5_U4Ey2$>T??f&GIV zLdTm0OpFRlh!GJPiOW$#_0JRn3kkStV?MQ*UurSD6gRk}EFj%?$Nv2vdlYU1I^H&!KYGRyM0Ubvn$FUNy#inyE$lTfTmtQVL) zRbQsLoJ;c1n6{88!$;%bzj+J3gFdlB(lY3w?LD@{6xyu7m@%unj( z=C*sh-T&68;_DAw`YXmzdM@5eN!V_bD>v7>znOIO0V4}dS2SKh(iFc5VjxzjxaUV0 z@g_Z^y5+3w&l3f^e01T*n9^-W-Ncri2)VR1O!S20g1nKSWfmN~1L&$3kjl;p-<2g{ z0RYC$20VPl8Bc{%R4JX*?MJ}`{Pxa27obxCxbSr&gkT2b+qDc0ox_R4Q;wU5Vr^X1 zu4I;sI_QsUzv73Ko`Tt2HxgPel1JOZD|IU3B=U;inO?{$HCmu>=(Y-K^fIG-U9my` z3H{il&iwF^ zMA{}xz%@&V&x$UE|C+zmp%b$D=Yi5ygz1})xaP;g*vk zvMi1aMmM;CDpX=VIT!wQ!Z5twRBNrEo`ZpSO%_&~UbwyXi+VZj)p&<|Clw)yw8wAZ zyFq|Q!yC3GvPG2EbA5a%zBBMu9cydt-5#mNEy9kE#~Vz5H-*bbBbvRp(}X(R1s6i; zO%QL3FB(KVqbYbFp5S&cMI&z8DzY^6A~AlVd{8~HWzMTfJ@;9nAiaAk+QDwUfI#ya z?DmDwhrqwney_FF8!KqhMcMyQZ)Tk-w^gDoOZv&{L(wCvYs446-qFP)?>pVjI1{mp zUc$qt!2LNfK(!RyW}v(%O@zA|Ttd7r1r&~2JDi^5W#PmdY{_vy7T|`=X)a2HnRy%D zmzd)~jaT}E$Ko#;kUf~oB*%Ny=)RWi)-RE03W5*O?Gk=bP|{-ACnaJw1l1<#54($l zkLysL^tg|58EmIninw1^VSq<2`O@6{UvK@p?)v}Iu+XZu>`e(u$Ot(6Ie7#3K2Vjz z0lsQa!ECOp*506+j^W1PWz3Jj<$+hcfZx8e5y*mH;05kt9TxZB@TT5@-vr>I&tPDc z5si_b`q%dj=cHOFKe!ctzsSk|fBsoI7eF;Vdg~L=aZtDKz&atYPJb{@&8z3MgCFCU zrY&MXdN^*mf$H+#A9WE&#uW(2@i0$d`gq-OI0Ej44`91C-11kpW(6*UxbYJ9u6cpgr$mCr`Qz zH)ByGp>J!@j%tIamzLrAF{nSyU7SUIylJ)(SJZTM3$nBCG!guECMGVb+us87^|BDg zzRCay%5^JWcEVoF>ZQD$v0bjJRc{yL2tBa$SR80Ie&GQyt%&I8S_A0=?Q-K%z;F#Y z5?Jmm4lWPnzDLQjr;4~_kCy1=j5{_dgC$bqdtF8Hc;j&kfQmeemwT|iU`}c|fW?p! 
zR9;w=mmL^y*tnXfDQGkLV~wD#GkU_vXQd%#7#0|4NXIbOJ;HDVNN1pv6N z?qb8zqW%5XR*;M)ICyfb{iov&lg$l)zzLNCkL3<9(*^a*ObwHW2;|!EpCX6atDP3!c8Ye}t?M8wZHlbYK3yPT^~`dt><6)wcdu@5+GUo!9CLZ3!sat%_n;rDX z3bPP+83z#&k=Vg(EM)IcOAjn}nicKS*%DQWIZZ24yuU!!7%#}yBJcTOx5??WlCb%_ zmcLgQ+u`916%uOkw&k)K{};rZb?a*R?&CQpx7ujN4dTjMoKFsZpX~93ihk$&m1ruvi$-r9`i%MgIs?T+$Y^mkdx`={pHIS zP&|DExWS&WGK0FZ8)6;ZTCntGVP_7pJZSdvQ_;i=BW5sP*dE43ynC|EzsopdX4MJ6 z65EgWf=PJPa^#}Y{9B*MSmBcpK)#U?Kv>=$j2&-$BNNY|0+U#9Gt$QtIm?O&ApYq- z%N+h-)h`biG zzC$XMWU5pwgdj)k8^H`1*f*T&1YN53g?I=1K;mbs-{f!$i>qd|94Wi>*4UM|jM6F- zXZir(??22Bp44e;TNsV8?wslEn?1+70IPICDxP5>lCKZ=9dscW`%BLr0}H8t(kte_ zADAK98r(fMlySPl2F}F4O#imjxkTN6cd+@+AEaf7xBYyK^0W=k^^w z?i*?@O7HRUX~4hfCQ)-7^Krk}f6y9CTUp^U(_u?SUF55OH_)-pz{bYr5KI)S325<( zJ~|{dv~^dCq-z(_jLnMzge`iEO!I9U%D4J{xpdQ(gyN*5)o?HS|Xa}vQ2QX;u5$2_ivuQ#l!pvfW&8D zZMlG@@s!5sGYL=qSK}M?Us~JN1F+&JV61T834O@rYv+t5?LUEMP?MI6N$5hMW6xXzk33BV^?^9H*v)QTr;TOlmKgiHohTpW@`2|=t3iSXqbD1 z2DSFd^L>zFG$8p@t^<;d6OfSqMUF+<2$<^umd@-BI4Oy(A1s1QfL2#C!d>PvQt;f7 zxZWM9yFox=$@zJinEF#}ED<2brAtA_4(|1rwdMD=$7B+kk6vaq+H&1xBO3ry9#`*y z|5bxyJ;96lWwL4Oukan?@{fC%aUBM(<9T6N;k@aa z>%bGr^!%H^d%{4$l|)AZHN7n;I0Akg*nxOKA%7e?ygLdN!C}C0gP4L3`igrr;PoUD z^a4>6l7tYFm5rL3?Qcx&;oL(U%L!h0z*gdOoT#5Rooe3ht4hT0lhR#8!=ylLh$l_^ zXKM#l=y;z0+Md6{R?3?CjB&r?)*ru3T6dd_I*FJFT%zUQAK9ioMn^It`trQ#=l1}( zow%3TPkfTrf=X#zUyk!DoWz?0vps07`5W6$15%me7s;>w*Ku;54cIE^fTq~gUArx& z?Lvfc-x`ArW(Hs=DJ-zJv>FuXg|+z+-gPe=)3dWs%@}9xWmu?cL{wT1*ZhLuF=Z72qftx1;Qyy1;m+Ws-&~nVnKzti2opOLXAreuMp>o` zUwzu(NmAhLySI8<7hJ6fz|O$mg03tW3o7zc&yAOI{;H9&;f)@_#Pn!(@HCMXn1QFB z6C=5dPXczZO$`AFVB+b{Zy{b1HPQ7qpbIdma}J@_}ltUK_H+49uLCbrvp}#r{tig&87+MG7Pb}5ds-xF`8_zWVx!pPqzgQ7+)`=78OJ!_ zkAchjOS+O#!(+F6e2hD$CN<6>6|@GCQTJ#*AEDyndkNl==?eNz{sOUZ<}@x=rD35_ zqu}((^V>_SY&>H$pwDtCT1(-3@(vVBvVq`kV7R4!O4h0oPAs(kyQIR&ZkWQ-yfEi& zVda=;NTI9Zb1#M>7jq5Dg+RuBs4s6nB630UMczVyQPhmU??ytw>dt5u>VFG}%DJ}J zY77N!3(V@A#QpZlHlC-?*=@b@r*3yEH7c|iM zcUTH;%*VVRr4t@*g8q*3gBCh5Gb5w5jG`pF$0&0_n?6^k3e$R$_$DzU{tT$9xk>Mb zx)taR*Lm9^4L68s>-LW#1m{yLFE?*q@z1greLm0Yq#hbA)qX=eFE|ZsiE4P9{UTE6 zt`wKJp(^_LK`rDYY(el>t?Fsrl92Akqma`}!}_?ZYzwU3JO<$=9^Mt@^DQPVa^xNiI7W3Jpt0N$om#47(%*1HfW4+Jq6z!{iy*e4 zQ2z_c{(BL2gHYh$fFtGMlribrtFk%mHMUrR8G3r&@41U*93WAsH>vz>*f42&Ot?K# zzv3tY)1?zILm8FwyTPyPm2xwS77=KLeawDr8J?b2*Sb{iRiivoI@9ZrZ#_YJ((G|( z&(eC{%`F~7UxIv~7lQodIm(bbK1gnSpkK-Ut=2rwRqm%9*}hApN7c?^ak7x+Us^Ew z4sUe%&l&d+PmLnc8Rqs2FgvO&TSLtA2W1E&t06f8N)-m|N+*um7&RgIP2@lY@svbiegX{v-GLtZj(`s zP^UEPiEDcyx<;5YevBx%4VZ}4FWul|tUo@Q2PRhR8p zLF)&M=e_mDMDNuwXRB&vobC~PWWnyp#3{N+yj`x9xd=RTCRwA3+2l+kb+p}OAzZ8cFHI}Xl)=#4MbVR?-YRI(J=q5xWY7Yt%p&nE@tzOxd2gi z|FTA*$vuw7nzRM=<~VLKYR7bJN2Pl#Qof4)Xt9p ztH${QWhZw(gydE`NpdkFL5cqrj=_Nz1~Vmj{Rr+mx-UVO&CvU4IKTe)&Oc<=8?Axt zItNGfYJ4Ws31;}&nm})Q?21FvW%dTEWc@4Pwu^X(WqsysPrQFZ{EQxy<0=bQ-DESB zD)<79_iZzXmxeV|@15HaKxl*8!#wo;uFXaTC9Gh;;b z)zxk1y(cX$U5W_}C{+03kdah%#KFBfE~{!b#UiTcA|A zcRZ&LX@==#lED3MxBlF_KMh{e_z}QBB92sJs*lkryHSie&UdS;el(~9VKrC z4m+;XJ=hB9)n6ecdD%~wOMjTiqf3q`({Pa>R%FqxJsLZp44 z>5iDTywz2BOAIPE?sd~TGtcgt@K?`MU-w4_Y;|8SA)0G0(|wIvsuL1DZmGIQBI$_9 z)@|b`pC?{Mzb3~*zU~u}jOaBdwR6?)MAt@cfY*5bia15Hf`#o5J=#hIC3Yw_?O z14Y&BoE*^0nE42V_(K$`kJK5sG<7yDfhm^$Z1;CJ!rzKff~>l&Z5;V&vN)GaE#x0w zg7Xe$qCvx;&QU8M0J^ocm9`6z(X^TaE`A742Pkz`TlF*Hw0KGB1U`UPW+ad;cSTlK zR*KChnWxq0fi{Xf&|A0Q${+z8t)R#J=r^j82cIdp3@Gl}VzYM*3yZTj~&5zI&<*&c{ z?T@J8l@1r*3OOLzZ{Kd!V`F6niuo#iuB(-uMUF<7c`oB}pmF_Dx&?jLNpP){r~92! 
zNJZc_+5bBf(<}!$P&?531H!6g=)pWaw+VEDh*F6#f>&gNTTa;@bJ%wItd@wXXEB_| zsIfq&+;|z)R9EE)=9$B9M=t|C06an#upR$I*8gdzdseR|IA7Hf`DA~!s(%N>En;ti zlc(7Yr9zZ`o5~6A07u^n;*kKth>;?#`)%Qzz@7#T1q(i>wpYNzdoeHnL`IY91~9dm zLU;ZZ>SRs#=*+&kL|6?v2(4bV5#I9qjZZ9fx}P)QSl6PxyBQ@=P+;m8hA7ri1qnuv zKK7H{^1#y1pYP^Rh7jGp=pRr+g}XDL6&YMHuK(wMSM24T zlPyC;!(30qvVmqk3a3gV!L z--BN1ZIpMqr7bI9pWmB}vfkW8)39&v=czolwJqXlT)dO-kWMD^Z3;v$K{zN`z^n}) zEL6NA&{O`u2HH@PIex}XOM&2}=6z6N7XWpf`~X?z4DP&dD0i;ZKCh~zTB2)8${`?X zDt`R-n(cUPX_Z3)HMhRV3cw$<8>=QCIe}*Vi$fqX??bvbC-Hq(LRXu8yKIGIrI1cS zyrg!cH`xFwQI9mA<-*3TuKN&m(mc!YiQtRJ{FDucoZ!12_rjfltE(R*Oyb zBAS(x0zG_1u~SRSMOu@KOvxUwqzCPL3I>+#bhI@0;8npM?~C?u_o%n!9-=L^yf zi5VH?`RP6pzz_s6|2y_=?WW;OQrV;Bx3u0{9aFLTF_m;}$7eTtwrJfoLA%soF*LSu zIW*lU-u6mv4HA2Mdn>4_B0wc39~8cSDPFUbV;A_#Ci7Rl`EY}8 zwhH5bqQsFWqLMrH+40!H`~b7>j3xJpYZ&y@zHqd3ER)e&TuZm&8DYdV36CM0d`R8{ z^Ns?m{&bZ$%g6}l3tBJYiszjCSP)v^plphcFrts!L)-5jE5m+!WK-}wp~PL6?Paih z=VKF@3ka!4p=Cl$6Ja<;;Z_~8Iq`_ zo72Yx%zO=KK-mb#jC@vg#iZwC7O2rndK;Us`#tB^c@V|K*&(IN7j}kAWZ>Bnwt=JK zJ3pKm7(hcZ$^Fc9|2;j-WXrhmw1pTvhkjV6TWE9H*YXx>2eGnSC-1d5WEPU9vWFyR zXNd5|Fg8`g1;LfS&6_Bdn%ESF4A(VLRL9XgT%D{jAd}en!1Vg}g;F4y-S#y5jXtli z^4V+8bfMLn@n_x3V?K)`_qsgB9e!Ww`AhoDbgGBHzd8{g$TNT=xfz#TxPt?2s-34~ zQQ#5c8tPF7ZSEiHto=(4Gu$F5I3vAxSXGZS1??Hk%P(YfDAO&ak@P#u@kiEg>+g!T z z;A#KPST4ChIvi$okAtVn_~#FKcurtyp_2H;+d{iMQMW!VqS(8%kg-^sdHwaUO2GFM zmcK(Jf^6u#n3WFgJ36s7o?Ok(KcpSQhvxf6n(s-mdnGmmCcYHE%W=?FELd-qEY5D= zW;VRp_1ZVVlDA9k)VQCUo{fE}<3LXohm9W8=|#JGvg&<0LL62)wA9| zg=M@upNJUvA3odxHG*4|TuCo`i8^bB%X!b@<<1;J`G79CoWyp^mSHExlG>^6PW5%Hi_6LdhbS&WDp?4;fH4WY<9kN@Pc=G7wd5&H`)aYP+gwoottzS4avF)V z;+S8P-*&#f>YwXZ9d3ci$8}frLsIkP%egp4yl;8+-4ghmgHp@0Y{4}6p|~Ut!zC`6 zR4San)$WOo*`~ed@LAiQr(C%4R`fC2n~CaX&2R7Hcd_v18amK-@W``y`#3-HR&Jc- zWbtgEm^^<1Lw0w74wM*4Hfl7}JEJ`KU7aJ{YGKY;+`wiox}YHCUPhk$NjXAfp(U+^ zkOvD*lsjd+WwU7}^@TWobNR8)Dam>u1;tQgWB;}>b-uuu{)Svinxc_}!0Ob`N$kwQ zCY?uuzlATqzzpi%vuh7zub$sXKU^;Fdgrhn&X?Hf6JP%&Va$`L2x;^f9tmvhNhU4~ zs4k}`(z1}H*T6eUp?@o477}>fUc=1{c%z3P?Tujq-Jx9r#|>kiIe(z~gZ+pI^(GnN zWm|gbdK^@*JXy=asZuC>mE#)s#Czi*48^>+c(cTf0Jmb3eKo;ko2vKurV9JLe59+^ z;sfTs)<39DnjIQyHmcg+-fa2sx%f;(?cJW`1BuJFrfg2*?z82n-@c+do!yD}Xoaj~2y{tbTc z+fDp}E9)$RFNbeR?5w)?p0H|PVjIJncd_r1#M+$a)?_Nu+y-g;evoU7w{_x6fCFI=&?caFn@W3;M|3<8r4+^ZK$hh= z;Rwxjo9Cb?A{};q`lXMLJWH{Oe0J|Q9_oQ(4>e2L_?zg&S@?UhSBV6P6%JtfJ$^Et# zB+8SEyH`u9U$@uPC|lUVEyp8uD-~!D3RovoY@e~#*BnPRbZsb$&heJLy^P)oVwL3A zqqxUo?82qXAV2dA7rWxBSYT6s3nif!vR9Ll zd4rl6sdq7lymn2#CXA0NwA%SndLt|>Odl4|g1V>%^>;n$$D(h``}?&Ft~{(B4P=F6 zEA=*x2tOSF)@vAgej`|WD8#Qdd(M@O;XltVI=F*SonB5W50zm4?MXxgj7 zhwC0HEQ{XcHu$M?sz)EVahZ0ljm0yFE$^y!b_;q!G*-alW2x-#zLB;%OJC!UI`#?? zK1X>JMb#oT+Y(8g*=Iew&|6Hv?HnHVx>hEBodHa#`Z=;}mZ7jG4v)7qsV!cRCf%a? 
zY%HvJmK0m)hxSd%C=65d>mSSwK!Mh&fhnn%Q_IEWtRUXPlWzE9u6kBX(oF))($7+` z?~M{I^PXK)(+);@R4LEduCcHPOm}=tssB1|0D1k>hz|L4{5=NLYsiuZiPO8+1HXk6 z_P!8U4iC6T5PF_a*W1$4e!kyZN{1YAJ4%{tJj9+v(pZZRtkYhAc z#^U2g0k4^yR;z5f*3vX@e84xbmooBCULSl2`Yi8FR@e$^y$FAJfw73WD9t^Ra3O*R;Z8NCv}szdXYiay`p zS24d|(nETsU;Bl3I4cGM1Qpd9no8fyWag;O~#{rz6#%B~OxHIoU7pVN{ z__T`A<2>nm32Y*C5iQnzD8!MOE1B;dGPtMyL09ZKq<0A`(sT>5@D_((6Sr}8w zOKc`<&+94W6W^f~&+hJF*z3J9E00zJnqLwh+0GGWt#m8_o-HNL%1yN%=UtQ~z@@{k zpO@MFPVYFNQ>?JeziJ&P^Hk5a8&DVVLLZ)6XH5{c;W1Aqi(N&RAIfAj*GaGW&mmq( z?><=HJSy*lhbxJ*k>D}EO1^V+e&Ge@E?a2@OJKgFpDz^p{7C-@%O9m*k;Jh)H1C&3 zW_h9mV=$e6ig(2V6!?;h&chtMcw8c>fa)40;{XW2mr8BcSGrZ9miUj!ldwCmb)Eq` z-jhyCSuoNrhJB2PRuyH&4s1af_Jr6$pGWyb3)A9B#NdarJ+4A5lnz9BmR+K!>frPy7`blV3=M5J(7Cnqbl{P#Og4elW!g6)eS}5 z@9!N-g-^o4)oce`l)i$?ap6gO;6z}Ci9MEwE4iJFf>xta1s8T>cDJgW7IVDj?N;U; z_FiZXCuF$oto4xXtjm`#eXARaYqAh$p~hnl1$sc|DsY*|^PtWc2o`yCG@MXTQCVPr zEa3~o@A$WvzB7hAG=OJ#E8DmYL9V$nOwN}u_;1X!rHGk@PbM7#Ej9hZ7T-OjV$1Sy zX^x4JUmkc^3=XWiNsqiAo*!aGpqmOjGC`+FL)ZyCUZG{jzFi=q(zj)Ao!8YC-|sNP zz1MF_SsAhs;YJOm9s9Z?c~41Zl{8t;$E)M5&(gayxZPP&=}ru_?aI92!oxsP`p(5K zW;C{oe<=bF;p-?T@-rhv$|N;5 zXaz{)z2(qYG4R#3eaQzOdz5Hu9Cu^%YuuwWHpG0tSvnlry!Tw^5t+4)zWiWNY&-Z= z%-8e_5MM4@ygb-cMG~9JRq25jL*!bqP0=dp0nvN+=b**7>LYSdb!3eCPP7XHq zxwZ?--nPQ`7!X5+b-!`A=`%vyI{65Y*MAZqyxY4v6~$^&WvE5UyQlV(vZZGMGuNdQ zl|BqCCdt_%A8A+irWRPNykYf-o-)|4ukUh#d%H0F zL2t@pQeJv{m7}W?i=h3)K*NL~qGV;bbR{x?eca3`eP*)dNV6LD+?Na{(t^j(l7x~3;SrKV&$v{Fssu}s@C zvbH}nT%_E*O-Dw>xeu(4bT5wWVBY$_Q{OJ@>HbKlplWJhpUd(CuG49QQlpwaj>|@R zc@&dryX!?xG&&Mij9R>1w1cLjc^XK{PeLcVc?qpDQ^5-jj6hnrDuIG8slO#V`;yA| zurgl`S&vVKm4h62lBX7QKE?*AYvk*xodNePa;uBB{pei6fAL%7sK|?g^ED}m)6oq= z0{ygqtjVwlaW!{of+mu9nE%^aM`T_Nx)dJa1cj$iNm=_kLfyRYK7dSj*4nNOzu@zr zywOza7NMJ}PEAVR9>+Oss2)m!NA8E)dBaTq%G!=Pn{|m(XzEsXuMwpY-ROyv%~N?( zIJmJLu6!8qG!JSLzeE`rOU^vX*9aA|8Q9H0NS}4lT>y2?C=Brb-pV8AVcIXsR;U+jcB)9a*i%{`) zzw3)#9dS~Djyie+2U&U-_eeAM{>e#}35AnjkTviVip~P%L~2jfT{m0CRS$83L{xfv zwHIar6MD@cV6q5Ui}=(kRN-6&H(P&$z0Rp)ez>ewGT+zGi&3cpnz6q+Y^l+jy)nC7 z+$r~=7H&1xjOF{Cnvs3uH;NkrG?O2mV4@5Hp5Ns z(00k~1jyQH68@8 zMItwz;XeQI`R4-#I<*%yoKc{%dVxZ$-;QTK+p2rw_rY3_q8S9C;6RM!tFB`CcFzHgH`ydqa^+AE1&uJ zz0#DLHXFSPrxYhIftc@Qz_KX<9Gq+D*kMSvjfY;(UGSz}&@BEc)* zJU{S5IdKPm>C20kHD2w{{dBS(3gNX;^jr%5cExb7qxbM=k;un?SsYp8R>!Y^kEiyx zGiw1hNsxZ0d%$`RI@jB&A7OGPpmOH#^k(8{0_QM6TO3=qy8_*hYOr=w`DklkiU6!9 zt#w|UW*$C|0*ygbINW5S+N3otJ$T`yeveJL`Rpm`sMF=pw)F-$3F{K4)p~q)U}Vq% zZXI+nM@&lz7{Lv>e%k`#W9bCsdGUjRi2?3!5H8=asbA+<3X^;ww?HbX=&{0dJYv=N z)uh%s_f+ywKI-J#H-$6nER7GpLdldiXBd~vCf}pPNzw+H0vom*B?1=io_wf;PBc;h zZtt*HkY@2z-W<3_(og?fF|NA>&o<25;QLaH7b*NN@p3KuDsxTYqT>%{2Rg+&KO87L zwnwnbHHGWeGmROLlRSzRAp+9*niPb+aTC$?Jc(_P^Ky?&hn_vPC_7a@!ro%wOu|;% z7?wER5`Gcb?3T1V^8QOAz&QW@O+e;7?bAD4=Z5auSQXwiZJd%M`r-=zZnV!|j6C3~?kGT~3HY@9`9`gN*mb;d?% zU$Zpoq9g1E4F^e=#S?+D>O$zXIc1n8yl3T+^Y5UDy;KVr1^X(h3j74u>b)i<3oe7= z^@P&RW|{`7LoUH`jz*d4gM@Fv=UHecIedL+BXN?t^}H#%yUOL?T)yF>%t z8cNFgs$hjHy|%n+vT9Ed;$(Le@BjNNZ+adP#Us1^m&VvK#-{kh)N%)d?P^q&Jy?d^ zQcfOue#l1eRQ?xP50##IhlCH>lEbBqRFsC3Y0}Cx`d4#7J#5U5fe8dfaX2Rv4;Rw@ zv(8Rsele_Pgnq{Yi8$OV*6NFs2_pzY+Q|v5mP7mLbxwRDY<@|TK2rQ9_aI`E%hLAV z!b9K*RRg1uEy z6n*L-UMpFU*Y*(@hp(;Y_s-(qZWwvJk+|3W?WSE^O<0>-FQ->(!8!&rjIQf9T2d#? 
zYI6@qIH{qDC|=UZ-OqBk?;#~B-*V`w(^-arWM+NA9lZkQI|?`@|K#m8dx3$qaGLlW z>4fk}SO=BoDCEaFMPK4QTh%G1eCQim3-Th9a?8Wq!(Ag48B2cIw#X!FI-25Zka5mJ zE35t?d2WMF@f@0SE%bAE0#EDz*fF~gH7Id05`WlGM^DBM{LnuCwQFmcESvQPk7ENi z*#f5FF8a%)CC?JjI!0E+ELBOtL%TUE;wy5>bfO-&qE~+DV`8LK7#pkIYI`kOahm7G zxP6c*hHKgQAErU>XW4icIGVv-1_6C8$OeO24yW;Jn8}85B6=lM_?^1KoZnz|!+M4l zSYPwmZTH$E=N@FesVbwHqYNxt854qxS;A-?OsOgE3OEj_^-ia9@<5ZPDWW|thEkWt zjDRAzcn-BKF<36R12t$FNa_Et^948yjqXd?G?M|cN1E1H1_@T(UmxopH+fQu)G!cx zQWu!kdBuI3lt{M%_3|x0{#1^Hu7Jm>98q)ve|cUc&>(CLy`nNTCr3J5;PO~dGVh}i z9ihWrS!N}*hZPHI=T1sy?Kyru^3U+B9(j}JE-S0;aW&yEdl?iJl2XBK)l?>Vr0UrX z@Of#6bC{_JjKGk6=j8~D>YXGaW<&rs1SPAiSMw^$;es><=bGRJJ;%8zf4vs>S{rUV z^Upb1JMkn7?xn0a-3b92lX%0D{FXg9ItEYr2@bt~(u`{IT3P8&WH&$1Zz6211!i?; zBskZT1kX03s$X(kFZBaxazoh=;@}K`n&V(a@=Yc0tllw$ojuqBJ3tAVXj;EK;0K*G z6ZA-fdc@j=AY`Z1)dHojf4ergODdq+TN}%_AkkUI1oq!Ey^@_VdQy_CF{#~AX9rOu z#IvpHx_lM49`LRZ0vj2mZ;psYDWq;EQ}C4-RLQ^?GPj!=ld!J65x3Gzr{HIPWtW~3 z0tQd5DW3EuHJ9%1YO+5ypeaC<;Hed_0#D=&ij`^+n{(hcs+SFGBEj4%C;^9j^#;7% z=U@!6yENfWh2h1WvRBVO{036^TKf7aaGv*nya-&?LTx3fZ-ZCzKmp`yw6-n#y=?KF z^Zb8<(m-6p;j>(T{;fxE&|9TB-aojhi zF$oB;Q%j9cLoA^i3HXOhRV;P^=5EW8L@8?dQPuqc8<*n*`^$9pHy8tSgg0wCZu z1H;e1NeR4ODXrU?dV5RE0cQm8LURgW-oJ#%Yr#YY)hV^iV1P(DJWU5GU)JErOcD1g z1NDM07K)^_o*!%NyVOr>89mDppni|scnqY&_47*&Y70%hva;!)6r0T?ojI)=f6d`s z_m4Saj-`Kte}g&HE*M{6f^t}QY-zHdM8EPSfodwe2xaWtR7e${u8^VNmBXz)2+@{c zk8qe`M-gk6n8pJ&|8UJq#5cY^xpI^A!b54Lhj){ICwuj#ylNAqph)V*Za-T2*)%#a z`h@@Xy3+|7*676+g0(8Tp%mX4UiW?%t1~%*)`k`ieS)|OH)4FN=|0A1**P^U&=W^s zSd1S$cz^?$1c5mA(ewQWXJ`?IYrbT7NIDVc%o-{h8Ut7rNQ>lw^a(DK5Au8vsBxhc zEC@3YcUr@qU!}n*8)x*ul!^SOHFXTim;DLH4K17`rIe}`jKueJZ$GwZmt-uGT=j;Q zQ)ZCoC;j?y+BrTIVxr%hB24LLs&&7_|A5f}ZOEyEy529`4~^w!71E~(bPI<1&Ip`z zQzb4pJiWhaa2_T(d{xueFG3>qagS>NP_|cC!Cxexk?82@%ld&jCzIa#P4Q)^yH7)g z;KMDx!gg&FJH{IeB(z=Kk{U~$;x$$a3q{8zK&*wX-{FN{jEHD9i}_GJbToHssK46g z#F7dMHFhdD&i67wXR6l4cWVXu6_|b|l^=l_pD6od!|w}vws|klA4sNJ#2y1LJ18;m z&z{8fzHeSs#JIDhjSY z|E>Pz4z{FRQo?s{81T|%_si4aNJD|Gk!%bAyPLsEK|-sTmZeGQ-! za1XkCjXv{2$YC-1D^%h6;`N7SAw+u6l0x|D|JU7DM@9X0?;-*ssFZ+$gwl;t(ufFB zqA+wyhlDf?Eg>i%D%~PALrF>tC`jkfDLurHLkw}x;P<`veeb>N{&j!<{LWgeHO!pP z>Cf5w?6ddtbX*?S|E89XDjK4OqV-e@xwv2plm1UtBXf(K@7Htmh)WOZ5v&F|zOQJe z7WlRaJhH(1h$BWahZ_V4GMWKFMxS3Wv3+88jMP-6vcA-4jw)?sIWNYAW-6-&u5B80 zoRht2$RUUI(NXk(Qgh70bt z_2ZG<%&Q!RkbIMebc5^1b=!$z=O4e(2+SFzx)_CO&b__|HcpWGu+eWS{~8mZvZWnq zTC)y!N{?mHC3lv)4c>si+-|xYHZj%vi~F)U>l6j_KwUju&dw%74g()UrG6Z8uCp=? 
zBKJ-#4AZ39DS*Y3VbeD~&@hts-xTlIR=T-9UI(^63k^tiOfd z%{UnGkclYmVaV^N=t1OtTecp3dpB3ozZtt>;jBpf`#V6!o062TH;U1Xb&uVAq1(=D zkI92Tr3xPw9l1ag3HoxpoA9n)1%ysgcb~o9wtS~qVD%0~22U8hW%qIVKuD}rAK!#E zKkLr4MW{;e<-wBilW_-ISAFG&iP@<%w8=xWwk1KhWkTPhR-XF4L9Q8SjhY-QewQ$- zsx?cGssHHBX*TpcRh(CYsr13;G|?eF0;G8?Q5!fjfE3UJv;q^r_fZ1$lyZhSSxiFj zO&7Ow>U7O~(-CUyp&1e1v}eQr=rIT~7gT0_q99&C+C2BRv>~b8X2yQ178(x$RIsa` zOcFAF1Ych|^==fdnnb%La^h+D#-xevwKd^29r>3rDX?je=V34s zeFTXX#bTBr20p@Vc$0x4_v?Zj7x&ZEnYMB8s*EBC>xas;gAh97+<`Y|n0RLwqy80D z_@6Ib3=J>0;pyia!jLhAnz($uho(YWQunk|m*G>ds=sCS+m;)+tSG}BAJ<#Kch{3K zLc)T&*OR*E7mm7pesWj1t{1O3Ox>iWw!^>FK+tEo7DJj#FA4QbmWNjU{F*kbu5dpM zlYnX4-xB^HOnLR&_I(~nx*oj$+v*O1e%}I-{;L2%u_D^j!qU=q%s8z-(*e4{WEHKe z1^5oy%c>DWD_UjCDO0N5S}jYK9%ajA>v;(7WAk(y%Rl3ShEWm7B6fw(h^;YO0=f-* z0y;W6d!e_RFr{FbgdYpYtJ38MbzRcI)I54$IW%%c6+jR+_Tns{2rS(Hxp<8qh7nr^ z`2k?-i%?M}Bk*bPixs{0LEs!_`b~ciUF|-@$Qc8)m+Suh7kqVFK|RvvK0vzuo!351 z>}Jw)W&w;{uH-IH{Jp(v{>;wX+&**}%ICHYB;{iKh+BI4b)U2*|^ z@<8z+q-8Z&LBQj&Oh1^n+N!FNXXvU%j-2~VQb9#p7>m^5hLg(rp1%XbBUwyp=V6WCHsHaPD30>>Y{f;d({e~+8E3Z=LsHHpXhhu95 z%4uX8kgt0NIOBXNP~KZhBjW&dzX%1?F-|V7vS{y-CV+Gpe0I8->^3cZy)60*SKZNw zERUac{ta=vb*5?9N8tBrz1R`^(zsft7eDw7@*^%~UB$Cs;#``B6Xta9HWR;;&y6KL z@GUJ3W7cg0RXSPryIg7ly5e6A!-<6YeQT?{9XV#R7FF3yz~HiYveKWm-{_d0xhNT3{juU?fKy`*d{cd7LN=vTaCoP?H@JLqV#$mIHBP8;$97_eRBY7~G(nO_wS+tTu zG`*&qTLt>FpM$D939aZ`7H_fetDeS2=%h4%=jd&*k<)zv=&e1`(3xgUX-(LB=) z*T9P9JU5T7^woiV57xO(BPKJIm1jyQr(yQ#tO04rqqn7C&ofv#6Gw`7Q{=m)$falDG7Fb`=dc8!_Y zX~QCVYSm%UlSm+so#dZagST-Js0FcD%};4DDEg{~u;X+A;HOwV2JU+6S%qE9PNwR7 zsUmkZX4>lw%vjTwMTf-_L+7ify0I#k77tX_V!}PYsM5%oPVy|B0<7x>tZNg9RADy- z#3mj1!CyA~(+U-4c0zmmsb`8psthtdRcYi6(Ka4uYx!(W(Vm^+*Bq4Iv@t zoI0#?yIPOmv9zx076LG>s&#sJ9h@+Xd~uH+KD7ujI%{*MbJ$vMW3^eyQ%u^K>Wr0K z85I~WS}dd23=~Q$5ciWbIQM*&AA$Cs4qJZB#L5hRRex%lcCZ(`EJ>&T^4qCsFUGG8 z)1c-l1s2VtQ0xrx_T%9M3k~YOJUQGxom~WBaO)e3tB4Cq< z<3y3Up!>OmRPO~<>s<3B;{b}=V5!Q$$Q-AB<`qA5z!}8)3bB04%Z;%*d1XH#?dS## zdj^S^NMMc$7tz^z8%3MNF;g!))*v{?fU+?wltkN%esw#a7vFV>6^OYw77~IW34mam zhztgo(8$a%`T;tR$xILxgrVZo_-Q2@`ncz@cjMM$NzY!C8>&$LvN;@wWDR1lAS0F(QiIUiD|M{g?H*0BeaNomv0KM#2 zEaYDCbNj}c(}q)b*RDkC?sqo&h8NU(^jTM_kEc7SPR-tR(nEz}Uk%9{SAK{syswD9 z1?EVmwuGC!8MpJ^r44GE&U;4!jXPb-zrcA4>#MBa$&&g$@uo=U1n}BfYktQ!6g${Z z=xE>_u6``v8e13}3)MVy$Q-UaPLi5D%tT~0`}CwC)2{=NrlHFu5*c7;)uCjHiLi(v z^*)KGrYV}lKd{|dl|m}T*=aQ#kezihxVh>wpSNYJU_z)=91Gt+A0wJN>Z5(oFS94& zpncYO6eqGH?bth?bFM_3lRX9%ZfxURlb4jR`dD{c_LK1G%P6YCgB;?qD#^@bHu=qE zV7w%x<9-*6efI!5LBfv=uqO`hJOyrlgKGZA8e9?&$&7_sz@_TmIC5m~)0l@X(n5Ph z+Trw@(J?AsQ}df~e}tw1B)Vkuj`1(+-dxIKe}7LS>EkBVV{}!~P`%6YYHE;&>q=jy zhH#IXqGCV5^eP5WO8v+(()K1?(&HPQZa?YMlw7hTSFZ8K+E-k@8vQ!auPNyM!kx&n zFjajigcbTwXZ}IdRjT)7pIp79p$6(-qe&*_L!&GDZ&2M#Exc-WIEfYMxY*0Bw+~s( zpQ@@N05(!K7A_)8`XtkBzpXuP>;5O?Kl?p4wyYM_pbsunN@XLAPySF3yFefio$jn| zUtj%_ZRsi?o_xVhjb*uT?Ug)ONuD{$;jLYNum)&G&=Xo2Ceyp^(_u$OEUc_ZEXfi8 zUCGyXsQyLd@C+@9N7{94BNHkqcuKwAfS;>gZB;9|vofL|jxI4nYNAF+G8z5jEW39m zHg&$dQ*d)AiOrSD+tkda`)0PdQ`+z65En(FDkQA-LMQ}7O_@Ls8L@aP z0dEl?P8BFi$fhd{LjZ3@d&+Y@5;@b@h{&3A{j{*8bOzY&Ruiq#`t7TxasZ#sDA65< znpIXTriB*aH<#sScDFex#1gtCTDf@^lFOyXe)=jLru9lxd0pStYZwtyrtt;1v08+s z05QbS|8&2jr$_7AvnaovPw6Yw%V}jj2#F;~*TGL8M-_enDEbu_P`dsiP%~q#~X}i~bO|*`Jxb^JO;r;&3NLN;hJB{kSZMJ5cI)4AgqHhSgt~|GMmg3>*CVBwyzzROJ zsPtf1S5DcGzGmkmB4#rUN47ov5E_W4rVVhb8?FQ)b#`2Qd?Q%2ubLXMwX?tn#g4f6 zWr-}&MYWq^zL<#_Es${{g-s7AHFw(^MFKwx9z{tkMnzVk-5We}=?&KES-gLntKP1A zVKU(q8l%{VuRgU(cLSg+zeC7BH*O3uFVJV#066&p0Kd9&a<)XasmPT-NN|+~ut6 zs+0l6I~ipOiq;^k&7U!;*ixXA_%<*F4R0)IyFGsVX8(ApqT9a(AmEl-2rSz=tZ}oh z^HVG=?T1~v{RJ@kt%=VZMaf6gcYpN~2Zta*P55l)k)F{KhC6(?kK?>3xmZlwCW^+M 
z&-0P8)~kRXC6>kfdne7Vx-sBuv(X@67<<@sBM=ewh=-S@r{MdeK+u)JALXvMS} zwv|tvpJWS3Rc7waoqIu~PqRF3=X?@zm2zW3=BO$QTD^$rQfAv%9GH7E?LjWRZ*=Og z=P{iyC7;Oi{PbWSpgtG^#xJct6K|@7tG*gSq{v34(-yqJ_hS$p1Lt;@ znvj+4#0)8j`;ErnHcCi2VjFu&wdJzq-y8a;*mbrr}`oFjP&^zjm=$C;3b^FnhM_67;5U^Od)f9Y<)# z4`{|GIbTfWeq)K$ZcO1cuC)Ry>p2!l>fcWB%MWIHf!ml35COwl>guNZ_WbF}4IKnC z&Pol1@I;&@6{qBgV6!)PWeXp5k?dSt%4Tg*6~bi<)E*z7p7lR%@jP zzoCJg&C3_`hEd@^dov<1GanH_#O2CAktPW(uFm)ON3nwAHJ&f{IKa+E;N3VId&h zGnp3jnR^7kN{KAXpRV7)nrd+}1c1t3BC`mrco1w!9T(Os!;rs+4D5s0BqQ_Jxbk23 zxyE<#Wpv<&Md#|F$;nr!OAlj@kG-KcrBYB!T&QekC<@P%B36P~W9GXn?99do_9tt7 z{vlk+I0eo&T%EnH+tJ%y8Xek@m&g9AwRIedZD^mDHyXrh1e8UCPRShF<;9d0=z#Ni zpKo-LdUdDNw4wo&sKPsIa9SK1v?wv2tF$OSIn3_6VzPEIVnC(3kkfz8>SUs&4cV=% zZi*u`^R}YBYEb>LoP`z9sn?sF=>xY?c8e}sekkZ~p(uQetr%cWVVx$LJHL)~^!7SV z(d*{n?^@dIJP-4|(q8WptM#hj+kse-YqxCSNgoGl|G-MEBt=2S-Tw=@-G}@~r-gq_ z1B7?6)o)QPKKHSW%b7WHQDv}X_@vL`ND1$lZb{VEe1W7hkAx6N@?TU+C@SbGual7} z(C7DZrf&?wYY@Lm$Iw#mmAL6!Ioi-HZ76W&66NfzFK*OBUT9P;|HOh4X0t`_%*@JV zwzrt%r&&qB<{90tn|D$dEB2Cj^`(H|dC?t~iP{anXX%#loKl{;@L`9+qRT?lE^a@$ zyi7t_jifXN<*OuIOqS>7CST-@j8Iv-RBQ0|QizjFEZ;}cCix(4S{%*vtHWYl(w=7u z_4F4Qe}x~eE^-#_Nbn24NYw5fURMNhxwh`Gg?&oBicJblL?RnH#qs!|YMOJkvpn6w zPkJHFlT)9m6>lGkWa5CV1eYcx}v9tTrDdkgk3|fbCMCyDXNRzriAKO#bo!3ox`MvHb0I1=Vslf@19%g7BE>^k~~++N7z444UT5IvV&R z&{PuYpGMXA;?H%eYc35IPxPvPyEO2{BEO%FbyLDdZ3u2e(o-2VVO3k;sEWO@BNZhl zk2NKUMAFQZdqhSU8RR?LIY<}|`d9W%6yHU$T3|SdZ&2E{2t#)yL={{ls(RBFri2EH zQ&GMu$`vvJy{mK6HF|J&?*k3KuXx9nus@b5m zxg}L>b|~20zANQ8Z73kv1Do8MTz=w`K5vSnA8BhViF%plE9F&9LyZ!hG{~+{jyj=P*_oOfvXtU4sXZ8)Oo2{8)vDS23Tu;ae3CD$|0YI>;T14OfF$!4Yde!G@Q#;U z-8gx==HVHZuqvVPHQ)_*}Od{!WLITC+{St{Lq70AL+vY$%k@U61=^Q z>h|aQ+!3yg?eD7{-OplM_SBxD^Zb060m~D+ij!4*Z`;%Z-b-sgTw2=>N)*()Yknn3dlW4# zN@iNXi8Up?qpCQ6?D`NFJ*z^wdH2vURDfW#l$FAVTl2SsUc{F})Nk4)^Kn0H4UgmF zoY{V)eCwmsy_OpsufkVp#zyG1<6vKU=%Ms4&hka>yf)8%_v9~E`UP?nfR@+JvV~e- zTeUN8c1&pPRO9QLgf7`m2l?@(>ss;pV(o=qh! zVeH~Btaii~pT`<-wvW&()V#6!hG3M*KU@hNg5!+*`RjJO(jG?ZNKmMp*8_5FD#H;j{sjYA+67euo@#tLBky1TozEWh~wHgTzioS{t@c{sFihy~q(} zziq$xcyr_zDznG>H;m99!DGs@VY5FeiP#qPwK}r0WA+p)Z)epVxS%-VHI?Un_szyd z0KhwtNNusA?Paf_M&_4A61!D-!wF)i7&O5v` zJXreO7J;)`Wu`_K6bK=f7*AT*XH?hIAUPMQkQtEua|flnUm(#>*7NXPws&xB-H4f5l~!+*C`ARUa7*PxM(^+q{no$&|kg_0A}*5MFVc+%HY9^ zi>nmoT8W9Imik{>R!OL}v-b zn&QvU1>rOYZ$xWW?=}9E&DIlLzDlWQ)@#>uP?vMwEpVg=~1#=mU zppej5*4Es3{Ug*X@h22PqjD@(DaBlk9%T-vJ)Zkgm8*|V_zj)so(uUH!HBHhM0I2y zF1QG8CnfJO%+PWyizh>D;%H$X36|b#!%*A4J&4h>E1DH>r5*y8D`DiA9NUE_X0b`5 zG0X3@P%q*++2GMp!*@QG<;Y*xatpwOG) zJk&e)+2#BZVzihc-Mxr)dAexg%)-eoI(+A#e4PO*G4pP_X1zA;;7(39F>tAgZ+T>h zHP{FAP*YHt{nfXE@j1aAHE~-9K4Zf3YV6o$z%OjWy%+i=_KW`~0y#DVNdMpRd~h)K z8Z~1yuQuQn5)qkz;ylZ4XqcwIYtkdTxLgRSuJ}}Rdv}?gZESp1%-%>qp={TReUOGp z2b@hggoTH(_wS&8gJo7~P`nghpeg^8E>PTMf>(zNlO=KcC=YLAGvZ)bopIjsPu1@B zlen^g&mhS#n}q(|alk)#!J$2b#DcB_>~cfCF+~Ps#a|vnlj*4Nl~ z64Ac-5MI;^<(;pzg5A`to@o`c*|MVDC1d$P77ZqklunlRlb)6Iz2ug}t}2|jcqjYB zIAo#wv$fMpZXp9tM(oa-eHNulpYIka#-qg0Ngl*|FH-Bx1mo=x^L`W$|alSU0<#&GwEHgC0I)jM8Es8AoJdKY#T5(R(;Re2F508unZUzVjWdf4-npj& zxhc?Q=l}^N!1yO;5m-BQI?Fk<7e7%!Y!)uP{dNET+V}9&z5CQ@B@r1uZC7}rl^J3( zg~{YhYvj=tZL!&~t>VvHPkX8}D@J7B)b4u~RjXP~temt6BcuzKu89 z{900xfw70J>n&;E$pv1Tr`U)nvbV^Pr++?T-b(Jxb00mWM33%x1qzpo21I2f3ShG_ zs&7&AFNFh_O0xgD5)z%6??He zaf=`7Y$01=v7GiL9%TGwcgX)$XOr;{9MZrpW_AQ|(_K-fvI=8>p~rc!)@JhAe{x*E z$+-3HHjgr2Z2X4$+C4>jac!G4$MCN+ROR+7lk1SVT{y%++UCy_(WHb~y0zz-arNrs z=;}TP?(mDbc*_h#GHGT4;dSyBXV>iFYG=d1v{RRFDVBd4$9R@oExgei6l(k8^2_YG zRrO_F)oH$PSzZd2?{LVLZ6+P*!_RJGa@l%jd{%Sng_AaH#Pu$&r(L&`yrzD}Wqh%m zdQdp_sIHF;0t^Ho-Mi|Ke_1B@0Yzp`&?PJVQs<)d%q|_2Q8%XZvxrC3(j?%QQhui? 
zEm3bjZ;_tos(QYCEn6S(PG%Q#57j{>qLXW7;OkqAuLtaF&c4Xmy~;O$=Nj==d8i9* z2&2|o)1@^;RIJ0PA>a2bVn6WVeT7zSLM~q%LGNKbN9EWlxUoL&(1-BjC2RCab;qfE z4*AB1rkIC1x*AqCXX`dJefR=YLq6z%E*>2Cl} z)3fJ`S$KY2QDvi)R54Fh{CZ@8v-9Al+Y@i}uWs(>n*r<>v3y`Ll>WQkaFh_zajE>> z(b1gtrM}lWrgw#hpM_utCWCl?eQc2ak`+TNn2x!XdK@1~?y$cz1n4utS1R9SnDjEt zR7zxIu3(>dE7!$(NhVmJo;u+UzF27b1uKTkSF9h54#ijQ4P3m?bXsgi8D>CTDTdKUe0a228?KZ=U~#WOAl-$t*2CMo;$i6*dV&?rDLL7z zLuqoHzM!S@AFGKG__Frca;<3m!x%@g&M6-zaZ}q=Qpovn*ep4zR?4uy&b|URr;?ZQ zR5(s|DERpUiOMPI=(;Z?W~-`_m=N1uw4vd zpT`aeUe;eJyMP!OSp%*BX!?MnfyqhkstJ2v{N|9Owwkd#Eo^2|m97vqN$p6oraHM;{lttrn4?rfL9-8W-hJLcRkLQTwr z@HA3qF&78IruQr{q+QsYc}4)46_5+J5X`>X4!MeV@w4o8O=3bzU}sZ?@ZmJI0orEm zC`W;lwjU!ei~ejIwwpC1kv91=V5^mlJgRL4?`jh`JG~4?{-fUnmx7B@xY`}k0v%ca zoXzdq6vl_m4f2z@=o8|DqV)?DF4hyU0{*O{k_EK9&2fsJM~oLz4}^w?hlgj{Ub+k1 zIbU|W4vE~oP%XgVtz%gS@ELD{Wwf5SBr;XwR(9~W=z!TlaHq5;tIG(v_|c>Z??P^Y zku3Rxs~b$yj4y{*CP?1?Pg8h=w2zs!Z{4Ga@KQmB3Rnx&+_*WaXSAZJzg|0s$Jn~yK@NjXu zw8igxO9lV3=07=BUiJ6)54S#eO2`&W9Kb;FKu}n?W5azy;h*kD-3O&V56tN%R0t;i zaWz4{m(@ID?n1zTdqs<8pvxoxQ>ixikt0Eg0C?Uj5eO``{WE23ra{;Yk&Dq-85n?* z@BtgeVa=c16is?jfy@GbWMa2v0%;R9=V8<(tSx5+uYrNdc=n(Ax8@kwN$=qpfz9Wi s#K_2f_D}Jk1=zT^f6{-e&y1)33)w+ka^ch=*zPGlc_vpRYZCN70JI + +

+
+### Indexing data
+
+
+The first step is processing and indexing the corpus document(s). To do so, set the path to the embedder checkpoint, the corpus document(s), the index saving directory, and the relevant arguments, then run the following command. Below, we explain in more detail the steps run within the script.
+
+
+```
+python examples/nlp/rag/rag_indexing.py \
+    trainer.devices=1 \
+    trainer.precision='bf16-mixed' \
+    indexing.embedder.model_path='/path/to/checkpoints/embedder_model.nemo' \
+    indexing.embedder.embed_batch_size=128 \
+    indexing.data.data_path='/path/to/data' \
+    indexing.data.chunk_size=256 \
+    indexing.data.chunk_overlap=10 \
+    indexing.index_path='/path/to/index'
+```
+
+Inside the script, the following steps are run.
+
+First, the document is read into LlamaIndex's `SimpleDirectoryReader` object.
+
+```
+print("Loading documents.")
+documents = SimpleDirectoryReader(cfg.indexing.data.data_path).load_data()
+```
+
+We then set up how the corpus document(s) will be split into smaller chunks by setting the splitter type, chunk size, and chunk overlap values.
+
+```
+print("Setting text transformation.")
+Settings.text_splitter = SentenceSplitter()
+Settings.chunk_size = cfg.indexing.data.chunk_size
+Settings.chunk_overlap = cfg.indexing.data.chunk_overlap
+```
+
+We then load the trained NeMo embedder model. Currently, this script only supports `.nemo` checkpoints. The wrapper that adapts the NeMo embedder to the LlamaIndex interface is implemented at `nemo/collections/nlp/models/rag/custom_embedder.py`. The embedding batch size can be adjusted to trade off the number of samples embedded at once against embedding speed.
+
+```
+print("Loading embedding models.")
+model_path = cfg.indexing.embedder.model_path
+embed_batch_size = cfg.indexing.embedder.embed_batch_size
+embed_model = NeMoEmbeddings(model_path = model_path, cfg = cfg, embed_batch_size = embed_batch_size)
+Settings.embed_model = embed_model
+```
+
+Next, we index the corpus document(s) using the LlamaIndex `VectorStoreIndex.from_documents()` method. Under the hood, this method splits the corpus document(s) into smaller chunks of the pre-defined chunk size, batches them and feeds them to the embedder, then puts the output embeddings into an index. In this example, we use LlamaIndex's built-in in-memory vector store to save the index. External vector stores, such as Milvus or Qdrant, can also be used. See more at [LlamaIndex Vector Stores](https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores/).
+
+
+```
+print("Indexing data.")
+index = VectorStoreIndex.from_documents(documents, show_progress=True)
+```
+
+After indexing, we save the index to disk so that it can later be loaded and used with an LLM.
+
+```
+print("Saving index to disk.")
+index_path = cfg.indexing.index_path
+index.storage_context.persist(persist_dir=index_path)
+```
+
+
+### Generation
+
+After processing and indexing the document, a NeMo LLM can interact with the corpus document(s) through RAG, for example to answer questions about details within the documents. To do so, set the paths to the LLM checkpoint and the saved index, provide a query to ask, and run the following command. Below, we explain in more detail the steps run within the script.
+
+```
+python examples/nlp/rag/rag_eval.py \
+    trainer.devices=1 \
+    trainer.precision='bf16-mixed' \
+    indexing.embedder.model_path='/path/to/checkpoints/embedder_model.nemo' \
+    indexing.index_path='/path/to/index' \
+    generating.llm.model_path='/path/to/checkpoints/llm_model.nemo' \
+    generating.inference.greedy=False \
+    generating.inference.temperature=1.0 \
+    generating.query='Which art schools did I apply to?'
+```
+
+Inside the script, the following steps are run.
+
+
+First, the LLM is loaded from `generating.llm.model_path`. Currently, the script only works with `.nemo` checkpoints. The wrapper that adapts the NeMo LLM to the LlamaIndex interface is implemented at `nemo/collections/nlp/models/rag/custom_llm.py`.
+
+```
+print("Loading LLM.")
+model_path = cfg.generating.llm.model_path
+Settings.llm = NeMoLLM(model_path = model_path, cfg = cfg)
+```
+
+Then we load the index that was saved to disk in the previous indexing step. If a Milvus database is used, it can also be loaded at this step.
+```
+print("Loading index from disk.")
+index_path = cfg.indexing.index_path
+storage_context = StorageContext.from_defaults(persist_dir=index_path)
+index = load_index_from_storage(storage_context)
+```
+
+Finally, we retrieve the relevant contexts and generate an answer for the query using LlamaIndex's `query_engine.query()` method. Under the hood, this method automatically embeds the query with the defined embedder, retrieves the k most relevant contexts from the index, and adds those contexts to a predefined template along with the query before feeding them to the LLM for generation. The number of relevant contexts to retrieve can be set via the `similarity_top_k` argument.
+```
+print("Responding to query using relevant contexts.")
+query_engine = index.as_query_engine(similarity_top_k=3)
+response = query_engine.query(query)
+print(response)
+```
+
+Below is an example of the default LlamaIndex template used to feed a query and the relevant contexts to the LLM. This template can be modified by following LlamaIndex's documentation: [Prompts RAG](https://docs.llamaindex.ai/en/stable/examples/prompts/prompts_rag/).
+
+
+```
+Context information is below.
+---------------------
+{context_str 1}
+{context_str 2}
+...
+---------------------
+Given the context information and not prior knowledge, answer the query.
+Query: {query_str}
+Answer:
+```
\ No newline at end of file
diff --git a/examples/nlp/rag/rag_generating.py b/examples/nlp/rag/rag_generating.py
new file mode 100644
index 000000000000..952dc2532102
--- /dev/null
+++ b/examples/nlp/rag/rag_generating.py
@@ -0,0 +1,49 @@
+from llama_index.core import Settings, StorageContext, load_index_from_storage
+
+from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings
+from nemo.collections.nlp.models.rag.custom_gpt_llm import NeMoGPTLLM
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+
+
+@hydra_runner(config_path="conf", config_name="rag_generating")
+def main(cfg) -> None:
+
+    # load LLM
+    logging.info("Loading LLM.")
+    model_path = cfg.generating.llm.model_path
+    if cfg.generating.llm.model_type == "gpt":
+        Settings.llm = NeMoGPTLLM(model_path=model_path, cfg=cfg)
+    else:
+        assert cfg.generating.model_type in ["gpt"], "Currently RAG pipeline supports 'gpt' for LLM models."
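+    # Settings.llm registers the NeMo GPT wrapper as LlamaIndex's global LLM, so the query
+    # engine built below uses it for generation; only 'gpt'-type .nemo checkpoints are supported here.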
+ + # load embedder + logging.info("Loading embedder.") + model_path = cfg.indexing.embedder.model_path + if cfg.indexing.embedder.model_type == "bert": + embed_model = NeMoBertEmbeddings(model_path=model_path, cfg=cfg) + else: + assert cfg.indexing.model_type in ["bert"], "Currently RAG pipeline supports 'bert' for embeddings models." + embed_model = None + Settings.embed_model = embed_model + + # load index from disk + logging.info("Loading index from disk.") + index_path = cfg.indexing.index_path + storage_context = StorageContext.from_defaults(persist_dir=index_path) + index = load_index_from_storage(storage_context) + + # set query + logging.info("Setting query.") + query = cfg.generating.query + logging.info("Query: ", query) + + # query and print response + logging.info("Responding to query using relevant contexts.") + query_engine = index.as_query_engine(similarity_top_k=3) + response = query_engine.query(query) + logging.info(response) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/rag/rag_indexing.py b/examples/nlp/rag/rag_indexing.py new file mode 100644 index 000000000000..ab487c035228 --- /dev/null +++ b/examples/nlp/rag/rag_indexing.py @@ -0,0 +1,44 @@ +from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex +from llama_index.core.node_parser import SentenceSplitter + +from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@hydra_runner(config_path="conf", config_name="rag_indexing") +def main(cfg) -> None: + + # load data + logging.info("Loading documents.") + documents = SimpleDirectoryReader(cfg.indexing.data.data_path).load_data() + + # set text transformation + logging.info("Setting text transformation.") + Settings.text_splitter = SentenceSplitter() + Settings.chunk_size = cfg.indexing.data.chunk_size + Settings.chunk_overlap = cfg.indexing.data.chunk_overlap + + # load embedder + logging.info("Loading embedding models.") + model_path = cfg.indexing.embedder.model_path + embed_batch_size = cfg.indexing.embedder.embed_batch_size + if cfg.indexing.embedder.model_type == "bert": + embed_model = NeMoBertEmbeddings(model_path=model_path, cfg=cfg, embed_batch_size=embed_batch_size) + else: + assert cfg.indexing.model_type in ["bert"], "Currently RAG pipeline supports 'bert' for embeddings models." + embed_model = None + Settings.embed_model = embed_model + + # index data + logging.info("Indexing data.") + index = VectorStoreIndex.from_documents(documents, show_progress=True) + + # save index data to disk + logging.info("Saving index to disk.") + index_path = cfg.indexing.index_path + index.storage_context.persist(persist_dir=index_path) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/rag/__init__.py b/nemo/collections/nlp/models/rag/__init__.py new file mode 100644 index 000000000000..15434bc2e603 --- /dev/null +++ b/nemo/collections/nlp/models/rag/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings +from nemo.collections.nlp.models.rag.custom_gpt_llm import NeMoGPTLLM diff --git a/nemo/collections/nlp/models/rag/custom_bert_embedder.py b/nemo/collections/nlp/models/rag/custom_bert_embedder.py new file mode 100644 index 000000000000..e2f26fadf247 --- /dev/null +++ b/nemo/collections/nlp/models/rag/custom_bert_embedder.py @@ -0,0 +1,145 @@ +from typing import Any, List + +import torch +from llama_index.core.bridge.pydantic import PrivateAttr +from llama_index.core.embeddings import BaseEmbedding +from omegaconf import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + + +class NeMoBertEmbeddings(BaseEmbedding): + _model: MegatronBertEmbeddingModel = PrivateAttr() + _model_cfg: DictConfig = PrivateAttr() + + def __init__( + self, + model_path: str = None, + cfg: Any = None, + embed_batch_size: int = 16, + **kwargs: Any, + ) -> None: + + # set up trainer + trainer_config = { + "devices": cfg.trainer.devices, + "num_nodes": 1, + "accelerator": "gpu", + "logger": False, + "precision": cfg.trainer.precision, + } + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) + + # setup/override model config + model_cfg = MegatronBertEmbeddingModel.restore_from( + restore_path=model_path, trainer=trainer, return_config=True + ) + model_cfg.micro_batch_size = 1 + model_cfg.global_batch_size = cfg.trainer.devices + self._model_cfg = model_cfg + print("self._model_cfg: ", self._model_cfg) + + # restore model + model = MegatronBertEmbeddingModel.restore_from( + restore_path=model_path, trainer=trainer, override_config_path=model_cfg, strict=True + ) + model.freeze() + self._model = model + + super().__init__( + embed_batch_size=embed_batch_size, + **kwargs, + ) + + @classmethod + def class_name(cls) -> str: + return "nemo_bert_embeddings" + + async def _aget_query_embedding(self, query: str) -> List[float]: + return self._get_query_embedding(query) + + async def _aget_text_embedding(self, text: str) -> List[float]: + return self._get_text_embedding(text) + + def _construct_forward_input(self, texts: List[str]): + # this method construct model's forward input arguments from texts, following the constructing step in nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py + + # retrieve arguments from model_config + max_seq_length = self._model_cfg.encoder_seq_length + + # tokenize text + input_ids = [self._model.tokenizer.text_to_ids(text) for text in texts] + + # truncate input_ids + input_ids = [item[: (max_seq_length - 1)] for item in input_ids] + + # add bos and eos + input_ids = [([self._model.tokenizer.bos_id] + item + [self._model.tokenizer.eos_id]) for item in input_ids] + + # pad input_ids + def _ceil_to_nearest(n, m): + return (n + m - 1) // m * m + + lengths = [len(item) for item in input_ids] + max_length = min(max_seq_length, _ceil_to_nearest(max(lengths), 16)) + assert max_length <= max_seq_length + input_ids = [item + [self._model.tokenizer.pad_id] * (max_length - len(item)) for item in input_ids] + input_ids = torch.LongTensor(input_ids) + + # construct attention_mask + def _create_attention_mask2(max_length, item_lengh): + """Create `attention_mask`. 
+ Args: + input_ids: A 1D tensor that holds the indices of tokens. + """ + # seq_length = len(input_ids) + # `attention_mask` has the shape of [1, seq_length, seq_length] + attention_mask = torch.zeros(max_length) + attention_mask[:item_lengh] = 1 + return attention_mask + + attention_mask = [_create_attention_mask2(max_length, len) for len in lengths] + attention_mask = torch.stack(attention_mask) + + # construct token_type_ids + token_type_ids = torch.zeros_like(input_ids) + + processed_batch = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + } + + return processed_batch + + def _get_query_embedding(self, query: str) -> List[float]: + constructed_forward_input = self._construct_forward_input([query]) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings[0].tolist() + + def _get_text_embedding(self, text: str) -> List[float]: + constructed_forward_input = self._construct_forward_input([text]) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings[0].tolist() + + def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: + constructed_forward_input = self._construct_forward_input(texts) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings.tolist() diff --git a/nemo/collections/nlp/models/rag/custom_gpt_llm.py b/nemo/collections/nlp/models/rag/custom_gpt_llm.py new file mode 100644 index 000000000000..bcd52b3f9b16 --- /dev/null +++ b/nemo/collections/nlp/models/rag/custom_gpt_llm.py @@ -0,0 +1,130 @@ +from typing import Any + +from llama_index.core.bridge.pydantic import PrivateAttr +from llama_index.core.llms import CompletionResponse, CompletionResponseGen, CustomLLM, LLMMetadata +from llama_index.core.llms.callbacks import llm_completion_callback +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + + +class NeMoGPTLLM(CustomLLM): + context_window: int = 2048 + num_output: int = 256 + model_name: str = "NeMo LLM" + dummy_response: str = "My response" + + length_params: LengthParam = { + "max_length": 500, + "min_length": 0, + } + + sampling_params: SamplingParam = { + "use_greedy": True, + "temperature": 1.0, + "top_k": 0, + "top_p": 1.0, + "repetition_penalty": 1.0, + "add_BOS": True, + "all_probs": False, + "compute_logprob": False, + "end_strings": ["<|endoftext|>"], + } + + _model: Any = PrivateAttr() + _model_cfg: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + + def __init__( + self, + model_path: str = None, + cfg: Any = None, + **kwargs: 
Any, + ) -> None: + + # set up trainer + trainer_config = { + "devices": cfg.trainer.devices, + "num_nodes": 1, + "accelerator": "gpu", + "logger": False, + "precision": cfg.trainer.precision, + } + + tensor_model_parallel_size = 1 + pipeline_model_parallel_size = 1 + + # trainer required for restoring model parallel models + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) + assert ( + trainer_config["devices"] * trainer_config['num_nodes'] + == tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + # setup/override model config + model_cfg = MegatronGPTModel.restore_from(restore_path=model_path, trainer=trainer, return_config=True) + model_cfg.micro_batch_size = 1 + model_cfg.global_batch_size = cfg.trainer.devices + self._model_cfg = model_cfg + print("self._model_cfg: ", self._model_cfg) + + # restore model + model = MegatronGPTModel.restore_from( + restore_path=model_path, trainer=trainer, override_config_path=model_cfg, strict=True + ) + model.freeze() + self._model = model + super().__init__(**kwargs) + + # update LLM metadata + self.context_window = self._model_cfg.encoder_seq_length + + # update inference params + length_params: LengthParam = { + "max_length": cfg.generating.inference.tokens_to_generate, + "min_length": cfg.generating.inference.min_tokens_to_generate, + } + + sampling_params: SamplingParam = { + "use_greedy": cfg.generating.inference.greedy, + "temperature": cfg.generating.inference.temperature, + "top_k": cfg.generating.inference.top_k, + "top_p": cfg.generating.inference.top_p, + "repetition_penalty": cfg.generating.inference.repetition_penalty, + "add_BOS": cfg.generating.inference.add_BOS, + "all_probs": cfg.generating.inference.all_probs, + "compute_logprob": cfg.generating.inference.compute_logprob, + "end_strings": cfg.generating.inference.end_strings, + } + + @property + def metadata(self) -> LLMMetadata: + """Get LLM metadata.""" + return LLMMetadata( + context_window=self.context_window, + num_output=self.num_output, + model_name=self.model_name, + ) + + @llm_completion_callback() + def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: + llm_response = self._model.generate( + inputs=[prompt], length_params=self.length_params, sampling_params=self.sampling_params + ) + text_response = llm_response['sentences'][0] + + return CompletionResponse(text=text_response) + + @llm_completion_callback() + def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + llm_response = self._model.generate( + inputs=[prompt], length_params=self.length_params, sampling_params=self.sampling_params + ) + text_response = llm_response['sentences'][0] + + response = "" + for token in text_response: + response += token + yield CompletionResponse(text=response, delta=token) From 9ca10104777f14f79e626eacb0381a59e25b896e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 10:03:37 -0600 Subject: [PATCH 09/47] Pin transformers (#9261) (#9273) * update branch * pin --------- Signed-off-by: eharper Co-authored-by: Eric Harper --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 5ad2519cfd1a..cf996584da23 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 
+4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>=2.2.1 torchmetrics>=0.11.0 -transformers>=4.36.0 +transformers>=4.36.0,<=4.40.2 wandb webdataset>=0.2.86 From 286d38704dc934cdbbb37fa3b026d04e547ba71c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 22 May 2024 11:45:37 -0700 Subject: [PATCH 10/47] Mcore dist opt ckpt fix (#9156) * Mcore dist opt ckpt fix Signed-off-by: Alexandros Koumparoulis * pass dp_zero_gather_scatter to starded-state-dict Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * introduce dist_ckpt_parallel_save option Signed-off-by: Alexandros Koumparoulis * determine sharding type from dist_ckpt_parallel_save Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * read model.disk_ckpt_parallel_save from cfg and pass it to mcore dist ckpt Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Pass is_loading to mcore_optim.py's sharded_state_dict Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Update nemo/core/optim/mcore_optim.py Co-authored-by: mikolajblaz Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: mikolajblaz --- .../conf/megatron_gpt_config.yaml | 1 + .../nlp/parts/megatron_trainer_builder.py | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 17 +++++++++++++---- nemo/core/optim/mcore_optim.py | 11 +++++++++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 269aa8f55153..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -154,6 +154,7 @@ model: # Distributed checkpoint setup dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. 
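For reference, the new option added above can be toggled alongside the existing distributed-checkpoint settings from the command line. The command below is only an illustrative sketch, assuming the stock `megatron_gpt_pretraining.py` example script and the config file patched above; it is not part of this change:

    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        model.dist_ckpt_format=torch_dist \
        model.dist_ckpt_parallel_save=True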
diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 03cf5fb755bd..f6336f6bcc71 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -90,6 +90,7 @@ def _training_strategy(self) -> Union[NLPDDPStrategy, NLPFSDPStrategy]: find_unused_parameters=False, nccl_communicator_config_path=self.cfg.model.get('nccl_communicator_config_path', None), sharp=self.cfg.model.get('sharp', False), + dist_ckpt_parallel_save=self.cfg.model.get('dist_ckpt_parallel_save', False), ) def _grad_scaler(self) -> GradScaler: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e8f7009b791c..79937c265b09 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -78,6 +78,7 @@ from apex.transformer.pipeline_parallel.utils import get_num_microbatches from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam + from nemo.core.optim.mcore_optim import McoreDistributedOptimizer HAVE_APEX = True @@ -183,6 +184,7 @@ def __init__( no_ddp_communication_hook: bool = False, nccl_communicator_config_path: Optional[str] = None, sharp: bool = False, + dist_ckpt_parallel_save: bool = False, **kwargs: Union[Any, Dict[str, Any]], ) -> None: if not HAVE_APEX: @@ -199,6 +201,7 @@ def __init__( self.no_ddp_communication_hook = no_ddp_communication_hook self.nccl_communicator_config_path = nccl_communicator_config_path self.sharp = sharp + self._dist_ckpt_parallel_save = dist_ckpt_parallel_save def setup(self, trainer: "pl.Trainer") -> None: """ @@ -276,7 +279,7 @@ def configure_ddp(self): else: super().configure_ddp() - def optimizer_sharded_state_dict(self, unsharded_optim_state=None): + def optimizer_sharded_state_dict(self, unsharded_optim_state=None, is_loading=False): """ Sharded state dictionary for an MainParamsOptimizerWrapper. Used to save and load the optimizer state when training with distributed_checkpoint. @@ -294,8 +297,14 @@ def optimizer_sharded_state_dict(self, unsharded_optim_state=None): model_sharded_state_dict = { key: value for key, value in model_sharded_state_dict.items() if not key.endswith('_extra_state') } - - if isinstance(optimizer, MegatronDistributedFusedAdam): + if isinstance(optimizer, McoreDistributedOptimizer): + return optimizer.sharded_state_dict( + model_sharded_state_dict, + unsharded_optim_state, + is_loading=is_loading, + dist_ckpt_parallel_save=self._dist_ckpt_parallel_save, + ) + elif isinstance(optimizer, MegatronDistributedFusedAdam): return optimizer.sharded_state_dict(model_sharded_state_dict, unsharded_optim_state) elif not isinstance(optimizer, MainParamsOptimizerWrapper): # Regular optimizer, e.g. 
Adam or FusedAdam @@ -501,7 +510,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: # after dist_checkpointing.load, sharded tensors will be replaced with tensors checkpoint['state_dict'] = sharded_state_dict - checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] + checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict(is_loading=True)] if self._check_param_groups_mismatch(checkpoint_path, checkpoint): return self._fix_param_groups(checkpoint_path, checkpoint) diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py index 0d4b524049ca..234680f49249 100644 --- a/nemo/core/optim/mcore_optim.py +++ b/nemo/core/optim/mcore_optim.py @@ -55,8 +55,15 @@ def state_dict(self): def load_state_dict(self, state_dict): self.mcore_optimizer.load_state_dict(state_dict) - def sharded_state_dict(self, model_sharded_state_dict, is_loading: bool = False, **kwargs): - return self.mcore_optimizer.sharded_state_dict(model_sharded_state_dict, is_loading, **kwargs) + def sharded_state_dict( + self, model_sharded_state_dict, optimizer_state_dict=None, is_loading=False, dist_ckpt_parallel_save=False + ): + # TODO(@akoumparouli, @mikolajblaz): switch to sharding_type once support for fully_sharded_model_space merged in mcore. + # sharding_type = 'fully_sharded_model_space' if dist_ckpt_parallel_save else 'dp_zero_gather_scatter' + sharding_type = 'dp_zero_gather_scatter' + return self.mcore_optimizer.sharded_state_dict( + model_sharded_state_dict, is_loading=is_loading, sharding_type=sharding_type + ) def step(self, closure): """Clip gradients (if needed) and step the base optimizer. From 52364c1fe1d614d3a11e60fd68a428e1377e3d8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 15:29:50 -0400 Subject: [PATCH 11/47] Fix loading github raw images on notebook (#9282) (#9283) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao --- tutorials/asr/ASR_TTS_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb index 067c007ea3df..709f96d14ba5 100644 --- a/tutorials/asr/ASR_TTS_Tutorial.ipynb +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -38,7 +38,7 @@ "### Architecture\n", "\n", "\"ASR-TTS\n", "\n", From 0e744c9300ca99060696b3536978ff5629312071 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 13:51:38 -0700 Subject: [PATCH 12/47] Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward (#9278) * Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward (#9246) * Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward GreedyCTCInfer::forward already allowed for this, so they did not implement the exact same interface. Now, they do. Also warn about not passing in the decoder_lengths argument. It is likely an error on the user's part not to pass it in explicitly. Signed-off-by: Daniel Galvez * Apply isort and black reformatting Signed-off-by: titu1994 * Log warning only once for sanity. 
Signed-off-by: Daniel Galvez --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Co-authored-by: titu1994 Co-authored-by: Somshubra Majumdar * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Signed-off-by: nithinraok Co-authored-by: Daniel Galvez Co-authored-by: titu1994 Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Co-authored-by: nithinraok --- .../parts/submodules/ctc_greedy_decoding.py | 50 +++++++++++++------ .../asr/decoding/test_ctc_decoding.py | 22 ++++++-- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 1ef26cd7adf3..c4e9a14f6e1d 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -25,7 +25,10 @@ from nemo.utils import logging -def pack_hypotheses(hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor,) -> List[rnnt_utils.Hypothesis]: +def pack_hypotheses( + hypotheses: List[rnnt_utils.Hypothesis], + logitlen: torch.Tensor, +) -> List[rnnt_utils.Hypothesis]: if logitlen is not None: if hasattr(logitlen, 'cpu'): @@ -55,6 +58,9 @@ def _states_to_device(dec_state, device='cpu'): return dec_state +_DECODER_LENGTHS_NONE_WARNING = "Passing in decoder_lengths=None for CTC decoding is likely to be an error, since it is unlikely that each element of your batch has exactly the same length. decoder_lengths will default to decoder_output.shape[0]." + + class GreedyCTCInfer(Typing, ConfidenceMethodMixin): """A greedy CTC decoder. @@ -108,8 +114,7 @@ class GreedyCTCInfer(Typing, ConfidenceMethodMixin): @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" # Input can be of dimension - # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] @@ -120,8 +125,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return {"predictions": [NeuralType(elements_type=HypothesisType())]} def __init__( @@ -145,7 +149,9 @@ def __init__( @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: Optional[torch.Tensor], ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -158,6 +164,15 @@ def forward( Returns: packed list containing batch number of sentences (Hypotheses). """ + + logging.warning( + "CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. 
Consider changing your strategy to 'greedy_batch' for a free performance improvement.", + mode=logging_mode.ONCE, + ) + + if decoder_lengths is None: + logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) + with torch.inference_mode(): hypotheses = [] # Process each sequence independently @@ -204,7 +219,7 @@ def forward( return (packed_result,) @torch.no_grad() - def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): + def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tensor]): # x: [T, D] # out_len: [seq_len] @@ -234,7 +249,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): return hypothesis @torch.no_grad() - def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor): + def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor]): # x: [T] # out_len: [seq_len] @@ -324,8 +339,7 @@ class GreedyBatchedCTCInfer(Typing, ConfidenceMethodMixin): @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" # Input can be of dimension - # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] @@ -336,8 +350,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return {"predictions": [NeuralType(elements_type=HypothesisType())]} def __init__( @@ -361,7 +374,9 @@ def __init__( @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: Optional[torch.Tensor], ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -374,11 +389,18 @@ def forward( Returns: packed list containing batch number of sentences (Hypotheses). 
""" + + input_decoder_lengths = decoder_lengths + + if decoder_lengths is None: + logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) + decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) + if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) else: hypotheses = self._greedy_decode_logprobs_batched(decoder_output, decoder_lengths) - packed_result = pack_hypotheses(hypotheses, decoder_lengths) + packed_result = pack_hypotheses(hypotheses, input_decoder_lengths) return (packed_result,) @torch.no_grad() diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 8eceb822fd38..a42d61f051ad 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -90,7 +90,9 @@ def test_constructor_subword(self, tmp_tokenizer): assert decoding is not None @pytest.mark.unit - def test_char_decoding_greedy_forward(self,): + def test_char_decoding_greedy_forward( + self, + ): cfg = CTCDecodingConfig(strategy='greedy') vocab = char_vocabulary() decoding = CTCDecoding(decoding_cfg=cfg, vocabulary=vocab) @@ -197,7 +199,10 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('alignments', [False, True]) @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) - def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence): + @pytest.mark.parametrize('length_is_none', [False, True]) + def test_batched_decoding_logprobs( + self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none + ): cfg = CTCBPEDecodingConfig( strategy='greedy', preserve_alignments=alignments, @@ -217,7 +222,10 @@ def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, # that we always handle at least a few blanks. input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 input_signal[:, 1, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 - length = torch.randint(low=1, high=T, size=[B]) + if length_is_none: + length = None + else: + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -240,7 +248,8 @@ def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps): + @pytest.mark.parametrize('length_is_none', [False, True]) + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -254,7 +263,10 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps): # at least a few blanks. 
input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size input_labels[:, 1] = unbatched_decoding.tokenizer.tokenizer.vocab_size - length = torch.randint(low=1, high=T, size=[B]) + if length_is_none: + length = None + else: + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From 0f2874b270f476405f11aeb09d38a709118c67b5 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Wed, 22 May 2024 20:10:25 -0500 Subject: [PATCH 13/47] Alit/bert convert fix (#9285) * fix extra state and post process * move to args * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Co-authored-by: JRD971000 --- .../convert_bert_hf_to_nemo.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 278f7b879b28..a81fd33f47a2 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -19,6 +19,7 @@ --input_name_or_path "thenlper/gte-large" \ --output_path /path/to/output/nemo/file.nemo \ --mcore True \ + --post_process False \ --precision 32 ``` """ @@ -62,6 +63,9 @@ def get_args(): help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--post_process", type=bool, default=False, required=False, help="Whether to have the postprocessing modules" + ) parser.add_argument( "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) @@ -81,6 +85,14 @@ def convert(args): trainer = MegatronTrainerBuilder(nemo_config).create_trainer() model = MegatronBertModel(nemo_config.model, trainer) + if not args.post_process: + model.model.lm_head, model.model.encoder.final_layernorm, model.model.binary_head, model.model.output_layer = ( + None, + None, + None, + None, + ) + nemo_state_dict = {} hf_config = hf_model.config.to_dict() hidden_size = hf_config["hidden_size"] @@ -184,6 +196,19 @@ def convert(args): nemo_state_dict[LayerNorm2_weight_base_name] = param_to_weights(LayerNorm2_weight) nemo_state_dict[LayerNorm2_bias_base_name] = param_to_weights(LayerNorm2_bias) + nemo_state_dict[f'model.encoder.layers.{l}.self_attention.linear_proj._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.self_attention.linear_proj._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.self_attention.linear_qkv._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.self_attention.linear_qkv._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.mlp.linear_fc1._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.mlp.linear_fc1._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.mlp.linear_fc2._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.mlp.linear_fc2._extra_state' + ] + # Non-layer dependent keys word_embeddings_weight = hf_model.state_dict()['embeddings.word_embeddings.weight'] position_embeddings_weight = hf_model.state_dict()['embeddings.position_embeddings.weight'] From 9d6e4724edacb76a82767dcdd37963b7a55fe83e Mon Sep 17 
00:00:00 2001 From: mikolajblaz Date: Thu, 23 May 2024 18:06:12 +0200 Subject: [PATCH 14/47] Remove .nemo instead of renaming (#9281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove .nemo instead of renaming Signed-off-by: Mikołaj Błaż * add ignore_errors=True flag Signed-off-by: dimapihtar * Revert "Remove .nemo instead of renaming" This reverts commit b836410a2d369aeb231f00b651d9b0f22b355929. Signed-off-by: Mikołaj Błaż * Remove backup .nemo after success Signed-off-by: Mikołaj Błaż * Update tests Signed-off-by: Mikołaj Błaż * Backup .nemo imediately before save_to Signed-off-by: Mikołaj Błaż * Apply isort and black reformatting Signed-off-by: mikolajblaz * Fix CTC import Signed-off-by: Mikołaj Błaż --------- Signed-off-by: Mikołaj Błaż Signed-off-by: dimapihtar Signed-off-by: mikolajblaz Co-authored-by: dimapihtar --- .../parts/submodules/ctc_greedy_decoding.py | 2 +- nemo/utils/callbacks/nemo_model_checkpoint.py | 33 +++++++--- tests/core/test_exp_manager.py | 65 ++++++++++++++----- 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index c4e9a14f6e1d..a7f57c82279a 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -22,7 +22,7 @@ from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin from nemo.core.classes import Typing, typecheck from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType -from nemo.utils import logging +from nemo.utils import logging, logging_mode def pack_hypotheses( diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index 15e8a4e21f55..e1d1f2e94586 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -22,6 +22,8 @@ import pytorch_lightning import torch from _weakref import proxy + +from lightning_fabric.utilities.cloud_io import get_filesystem from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint, _is_local_file_protocol from pytorch_lightning.utilities import rank_zero_info @@ -198,7 +200,6 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: logging.warning(f'always_save_nemo will slow down training for model_parallel > 1.') # since we are creating tarfile artifacts we need to update .nemo path - self._backup_existing_nemo_ckpt(trainer) app_state.model_restore_path = self._format_nemo_checkpoint_name() if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: maybe_injected_best_model_path = inject_model_parallel_rank(self.best_model_path) @@ -222,14 +223,19 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): pl_module.load_state_dict(checkpoint, strict=True) if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New best .nemo model saved to: {app_state.model_restore_path}") pl_module.load_state_dict(old_state_dict, strict=True) else: if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) 
pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New .nemo model saved to: {app_state.model_restore_path}") + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) return output def on_train_end(self, trainer, pl_module): @@ -268,16 +274,25 @@ def on_train_end(self, trainer, pl_module): trainer._checkpoint_connector.restore(self.best_model_path) if self.save_nemo_on_train_end: - self._backup_existing_nemo_ckpt(trainer) + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=self._format_nemo_checkpoint_name()) + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) - def _backup_existing_nemo_ckpt(self, trainer) -> str: + def _backup_existing_nemo_ckpt(self, trainer) -> Optional[str]: """Search for an available name with version infix and rename existing checkpoint. NOTE: this behavior is slightly different from regular checkpoints. PTL creates new regular checkpoint with the first available name. Here, for backward compatibility, we create .nemo checkpoint as before and create a backup under the first available name. + + Args: + trainer (Trainer): trainer instance. + + Returns: + Path to the backup checkpoint or None, if no backup was created """ base_path = self._format_nemo_checkpoint_name() available_path = base_path @@ -286,11 +301,13 @@ def _backup_existing_nemo_ckpt(self, trainer) -> str: while self.file_exists(available_path, trainer, check_dist_ckpt=False): available_path = self._format_nemo_checkpoint_name(version_cnt) version_cnt += 1 - if available_path != base_path: - if trainer.is_global_zero: - logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') - shutil.move(base_path, available_path) - trainer.strategy.barrier() + if available_path == base_path: + # no existing ckpt, no need to backup + return None + if trainer.is_global_zero: + logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') + shutil.move(base_path, available_path) + trainer.strategy.barrier() return available_path def _format_nemo_checkpoint_name(self, ver: Optional[int] = None) -> str: diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 8c6b33022dac..2d9bd03f0203 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -151,7 +151,7 @@ def test_omegaconf(self): @pytest.mark.unit def test_trainer_loggers(self, tmp_path): - """ Test that a trainer with logger errors out with a number of arguments. Test that it works with + """Test that a trainer with logger errors out with a number of arguments. Test that it works with create_tensorboard_logger set to False """ test_trainer = pl.Trainer(accelerator='cpu') # Should create logger and modelcheckpoint @@ -235,7 +235,7 @@ def test_trainer_neptune_logger(self, tmp_path): @pytest.mark.unit def test_checkpoint_configurations(self): - """ Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but + """Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but is error free if only one is asked to do so. 
""" disable_tb_logger = {"create_tensorboard_logger": False} @@ -297,7 +297,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): @pytest.mark.unit def test_resume(self, tmp_path): - """ Tests the resume capabilities of exp_manager""" + """Tests the resume capabilities of exp_manager""" test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) # Error because explicit_log_dir does not exist @@ -428,7 +428,8 @@ def test_nemo_checkpoint_save_best_model_1(self, tmp_path): def test_nemo_checkpoint_save_best_model_2(self, tmp_path): test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) exp_manager( - test_trainer, {"explicit_log_dir": str(tmp_path / "test")}, + test_trainer, + {"explicit_log_dir": str(tmp_path / "test")}, ) model = ExampleModel() test_trainer.fit(model) @@ -456,6 +457,27 @@ def test_nemo_checkpoint_always_save_nemo(self, tmp_path): model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit + def test_nemo_checkpoint_doesnt_produce_too_many_nemo_ckpts(self, tmp_path): + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": {"save_best_model": True, "always_save_nemo": True, "save_top_k": 2}, + "explicit_log_dir": str(tmp_path / "test"), + }, + ) + model = ExampleModel() + test_trainer.fit(model) + + assert Path(str(tmp_path / "test" / "checkpoints" / "default.nemo")).exists() + assert ( + len(list((tmp_path / "test" / "checkpoints").glob("default*.nemo"))) == 1 + ) # check number of `.nemo` checkpoints + + model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) + assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit def test_nemo_checkpoint_make_checkpoint_dir(self, tmp_path): test_trainer = pl.Trainer( @@ -511,8 +533,8 @@ def test_nemo_checkpoint_restore_model(self, tmp_path): @pytest.mark.run_only_on('GPU') @pytest.mark.parametrize('test_dist_ckpt', [False, True]) - def test_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): - """ Simulates already existing checkpoints in the ckpt directory and tests ckpt versioning """ + def test_base_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): + """Simulates already existing checkpoints in the ckpt directory and tests non-nemo ckpt versioning""" strategy = NLPDDPStrategy() if test_dist_ckpt else 'auto' test_trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4, strategy=strategy @@ -563,7 +585,8 @@ def _get_versioned_name(ckpt_name: Path, nemo: bool = False): assert _get_versioned_name(ckpt_1).exists(), all_checkpoints assert not _get_versioned_name(ckpt_2).exists(), all_checkpoints # ckpt2 didn't exist before - assert _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints + # .nemo checkpoints are not versioned: + assert not _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints @pytest.mark.unit def test_last_checkpoint_saved(self, tmp_path): @@ -592,6 +615,7 @@ def train_dataloader(self): model_path = checkpoint_dir / "val_loss=0.0300-epoch=1-step=64-last.ckpt" last_saved_checkpoint = torch.load(model_path) assert max_steps == last_saved_checkpoint['global_step'] + # restart training, ensure global step starts correctly class 
AssertCallback(Callback): def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: @@ -681,8 +705,7 @@ def test_warning_validation_skipping_when_custom_epoch_loop(self, tmp_path): """ tmp_path = tmp_path / "test_3" - class CustomLoop(_TrainingEpochLoop): - ... + class CustomLoop(_TrainingEpochLoop): ... trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1, val_check_interval=0.33 @@ -759,7 +782,8 @@ def test_skipped_unfinished_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -803,7 +827,8 @@ def test_skipped_unfinished_dist_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -850,13 +875,17 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): # unfinished checkpoint with EMA part, both parts should be removed self._write_fake_checkpoint( - checkpoints_dir / "incomplete01-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete01-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) self._write_fake_checkpoint(checkpoints_dir / "incomplete01.ckpt", isdir=False, add_unfinished_marker=True) # just EMA part - should be removed. 
NOTE marker path is the same for base part and for EMA part self._write_fake_checkpoint( - checkpoints_dir / "incomplete02-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete02-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) (checkpoints_dir / f"incomplete02{NeMoModelCheckpoint.UNFINISHED_CHECKPOINT_SUFFIX}").touch() @@ -864,7 +893,10 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() @@ -909,7 +941,10 @@ def test_incomplete_dist_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() From a589828b7268dfb7aff505ba2a49ab151c5d5ee4 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 23 May 2024 12:13:31 -0400 Subject: [PATCH 15/47] Refactor Sequence Packing Script (#9271) * refactor pack seq script Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * update doc Signed-off-by: Chen Cui * minor Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .../features/throughput_optimizations.rst | 9 +- nemo/utils/sequence_packing_utils.py | 232 ++++++++++++++++++ .../prepare_packed_ft_dataset.py | 206 ++++++---------- 3 files changed, 306 insertions(+), 141 deletions(-) create mode 100644 nemo/utils/sequence_packing_utils.py diff --git a/docs/source/features/throughput_optimizations.rst b/docs/source/features/throughput_optimizations.rst index 3f3ded01b1a2..dfd8b6cf9310 100644 --- a/docs/source/features/throughput_optimizations.rst +++ b/docs/source/features/throughput_optimizations.rst @@ -71,8 +71,8 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model \ + +output_dir=/path/to/output_folder \ +pack_sizes=[2048,4096,8192] \ [ +packing_algorithm=first_fit_shuffle \ ] [ +seed=0 ] @@ -86,10 +86,7 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, and can be determined by examining the distribution of sequence lengths in the dataset. - Note 3. Currently, we require a full nemo model file for simplicity and readability of code, but in theory only a - tokenizer file is needed. This part can be improved in a future iteration of the script. - - Note 4. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for + Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. 
This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py new file mode 100644 index 000000000000..2a5a14f83823 --- /dev/null +++ b/nemo/utils/sequence_packing_utils.py @@ -0,0 +1,232 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from typing import Dict, List + +import numpy as np +from tqdm import tqdm + +from nemo.utils import logging + +PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle'] + + +def find_first_bin_that_fits(bins: List[List[int]], s: int, bin_size: int) -> int: + """ + Finds the first bin in a list of bins that has enough space to fit a sequence of size 's'. + + Args: + bins: A list of lists, where each inner list represents a bin and contains the current elements in that bin. + s: The size of the sequence to be placed in a bin. + bin_size: The maximum capacity of each bin. + + Returns: + The index of the first bin that can fit the sequence 's', or -1 if no such bin exists. + """ + for i, abin in enumerate(bins): + if sum(abin) + s <= bin_size: + return i + return -1 + + +def first_fit(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit algorithm. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. + pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, where each inner list represents a bin and contains the indices of the sequences assigned to that bin. + """ + res = [] + for s in seqlens: + first_bin = find_first_bin_that_fits(res, s, pack_size) + if first_bin == -1: # open a new bin + res.append([s]) + else: + res[first_bin].append(s) + return res + + +def first_fit_decreasing(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit Decreasing algorithm. + + This is a variation of the First-Fit algorithm where the sequences are sorted by decreasing length before packing. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. + pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, similar to the output of the 'first_fit' function. + """ + sorted_seqlens = sorted(seqlens, reverse=True) + return first_fit(sorted_seqlens, pack_size) + + +def first_fit_shuffle(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit with Shuffling algorithm. + + This variation shuffles the order of the sequences before applying the First-Fit algorithm. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. 
+ pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, similar to the output of the 'first_fit' function. + """ + shuffled_seqlens = seqlens[:] + np.random.shuffle(shuffled_seqlens) + return first_fit(shuffled_seqlens, pack_size) + + +def create_hist(dataset: np.array, truncate_seq_len: int): + """ + Creates a histogram of sequence lengths from a tokenized dataset. + + This function analyzes the tokenized dataset and creates a histogram showing the distribution of sequence lengths. + + Args: + dataset: A NumPy array containing the tokenized sequences. Each element is a dictionary that contains at minimum + the key `input_ids`. + truncate_seq_len: The maximum sequence length to consider in the histogram. + + Returns: + sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences from the dataset. + histogram: A list representing the histogram data (number of sequences for each length). + """ + logging.info("Creating histogram from tokenized dataset...") + + sequences = collections.defaultdict(list) + counts = [0] * truncate_seq_len + + for item_dict in dataset: + seq_len = len(item_dict['input_ids']) - 1 + sequences[seq_len].append(item_dict) + counts[seq_len] += 1 + + logging.debug("Histogram of sequence lengths") + logging.debug(counts) + + histogram = [] + for seq_len in range(truncate_seq_len): + histogram.append(len(sequences[seq_len])) + + return sequences, histogram + + +def create_packing_strategy( + histogram: List[int], pack_size: int, packing_algorithm: str = 'first_fit' +) -> List[List[int]]: + """ + Packs sequences into bins using the specified packing algorithm. + + This function takes the histogram of sequence lengths, desired pack size, and a string representing the packing + algorithm to use. It then calls the corresponding function (e.g., 'first_fit_decreasing') and performs the + packing process using only sequence lengths as input (without the actual sequences). + + Args: + histogram: A list representing the histogram data (number of sequences for each length). + pack_size: The maximum capacity of each bin. + packing_algorithm: One of the supported packing algorithms from ['first_fit_decreasing', 'first_fit_shuffle'] + + Returns: + assignments: A list of lists, where each inner list represents a bin and contains the indices of the + sequence lengths assigned to that bin. + """ + + logging.info(f"Packing sequences to length {pack_size}...") + + all_seq_lens = [] + for i, count in enumerate(histogram): + all_seq_lens.extend([i] * count) + + packing_fn = globals()[packing_algorithm] + assignments = packing_fn(all_seq_lens, pack_size) + packed_seq_lens = [sum(x) for x in assignments] + packing_factor = len(all_seq_lens) / len(packed_seq_lens) + + logging.debug("Packed sequence lengths:") + logging.debug(packed_seq_lens) + logging.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient") + logging.info( + f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<" + ) + return assignments + + +def fill_packing_strategy( + assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int +) -> List[Dict]: + """ + Fills the packing strategy with actual sequence data based on assignments and sequence information. + + This function takes the assignments generated by the packing algorithm (containing sequence length indices), + the original sequences data, and the pack size. 
It iterates through the assignments, retrieves the corresponding + sequences from the sequences dictionary, and constructs the final output data structure with input IDs, loss masks + (if available), and starting indices for each sequence in a packed sequence. + + Args: + assignments: A list of lists, where each inner list represents a bin and contains the indices of the + sequence lengths assigned to that bin (output of 'create_packing_strategy'). + sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences + from the dataset (output of 'create_hist'). + pack_size: The maximum capacity of each bin. + + Returns: + output_data: A list of dictionaries, where each dictionary represents a packed sequence with its input IDs, + loss mask (if available), and starting indices. + """ + ifile_handles = dict() + for seq_len in tqdm(range(pack_size + 1)): + per_seq_data = sequences[seq_len] + if len(per_seq_data) > 0: + perm = np.random.permutation(len(per_seq_data)) + input_ids = np.array([x['input_ids'] for x in per_seq_data])[perm].tolist() + try: + loss_mask = np.array( + [[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data] + )[perm].tolist() + except KeyError: + loss_mask = None + ifile_handles[seq_len] = (input_ids, loss_mask) + + input_ids, loss_mask, seq_start_id = {}, {}, {} + + for oindex, assignment in tqdm(enumerate(assignments), total=len(assignments)): + _input_ids, _loss_mask, _seq_start_id = [], [], [0] + + for seq_length in assignment: + _input_ids.extend(ifile_handles[seq_length][0].pop()) + _loss_mask.extend(ifile_handles[seq_length][1].pop()) + _seq_start_id.append(len(_input_ids)) + + input_ids[oindex] = _input_ids + loss_mask[oindex] = _loss_mask + seq_start_id[oindex] = _seq_start_id[:-1] + + output_data = [] + for i in range(len(input_ids)): + item_dict = {'input_ids': input_ids[i], 'loss_mask': loss_mask[i], 'seq_start_id': seq_start_id[i]} + output_data.append(item_dict) + + assert all(not seq[0] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" + assert all(not seq[1] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" + return output_data diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py index f01aa54fc265..b3251e75c84e 100644 --- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py +++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py @@ -12,19 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import collections import os from dataclasses import dataclass -from typing import Tuple +from typing import TYPE_CHECKING, Tuple import numpy as np -from tqdm import tqdm -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.core.config import hydra_runner from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager +from nemo.utils.sequence_packing_utils import create_hist, create_packing_strategy, fill_packing_strategy + +if TYPE_CHECKING: + from omegaconf import DictConfig """ Script to prepare packed dataset from a SFT/PEFT dataset in the jsonl format. @@ -45,146 +46,71 @@ python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model + +output_dir=/path/to/output_folder +pack_sizes=[2048,4096,8192] Note: -- pack_sizes can take in a list -- model.data.train_ds.max_seq_length is the length to truncate long sequences before packing, and is different from the packing sizes -- currenlty, we require a full nemo model file for simplicity and readability of code, but in theory only a tokenizer file is needed. - This part can be improved in a future iteration of the script. + - If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will + need to pass in the same configs to ``model.data.train_ds`` as you would for training with unpacked dataset. + + - ``model.data.train_ds.max_seq_length`` is the length to truncate each sequence before packing multiple sequences + to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, + and can be determined by examining the distribution of sequence lengths in the dataset. + + - ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for + each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. + This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length + can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in + the unpacked case. """ -PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle'] - - -def find_first_bin_that_fits(bins, s, bin_size): - for i, abin in enumerate(bins): - if sum(abin) + s <= bin_size: - return i - return -1 - - -def first_fit(seqlens, pack_size): - res = [] - for s in seqlens: - first_bin = find_first_bin_that_fits(res, s, pack_size) - if first_bin == -1: # open a new bin - res.append([s]) - else: - res[first_bin].append(s) - return res - - -def first_fit_decreasing(seqlens, pack_size): - sorted_seqlens = sorted(seqlens, reverse=True) - return first_fit(sorted_seqlens, pack_size) - -def first_fit_shuffle(seqlens, pack_size): - shuffled_seqlens = seqlens[:] - np.random.shuffle(shuffled_seqlens) - return first_fit(shuffled_seqlens, pack_size) +def tokenize_dataset(cfg: 'DictConfig'): + """ + Tokenizes a dataset using the same configuration file as finetuninng with GPTSFTDataset. 
+ This function reads a dataset and tokenizes it using SentencePiece tokenizer based on the provided configuration. -def create_assignment(output_path, assignments, ifile_handles): - n_samples_in_this_shard = len(assignments) - input_ids, loss_mask, seq_start_id = {}, {}, {} + Args: + cfg: A Hydra configuration object containing parameters for tokenization. - for oindex, assignment in tqdm(enumerate(assignments), total=n_samples_in_this_shard): - _input_ids, _loss_mask, _seq_start_id = [], [], [0] + Returns: + A NumPy array containing the tokenized sequences from the dataset. + """ - for seq_length in assignment: - _input_ids.extend(ifile_handles[seq_length][0].pop()) - _loss_mask.extend(ifile_handles[seq_length][1].pop()) - _seq_start_id.append(len(_input_ids)) - - input_ids[oindex] = _input_ids - loss_mask[oindex] = _loss_mask - seq_start_id[oindex] = _seq_start_id[:-1] - - output_data = [] - for i in range(len(input_ids)): - item_dict = {'input_ids': input_ids[i], 'loss_mask': loss_mask[i], 'seq_start_id': seq_start_id[i]} - output_data.append(item_dict) - - assert all(not seq[0] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" - assert all(not seq[1] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" - np.save(output_path, output_data) - logging.info(f"Done, output written to {output_path}") - - -def tokenize_dataset(cfg): logging.info("Tokenizing dataset...") # using the same template as SFT/PEFT script. This may be overkill but guarantees the preprocess settings # are identical to normal SFT training - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - # we set is_train=False to turn off samples mapping and get the actual length of train dataset - train_ds = model._build_dataset(cfg.model.data.train_ds, is_train=False)[0] - return np.array([train_ds[i] for i in range(len(train_ds))]) - - -def create_hist(dataset, truncate_seq_len): - logging.info("Creating histogram from tokenized dataset...") - - sequences = collections.defaultdict(list) - counts = [0] * truncate_seq_len - - for item_dict in dataset: - seq_len = len(item_dict['input_ids']) - 1 - sequences[seq_len].append(item_dict) - counts[seq_len] += 1 - - logging.info("Histogram of sequence lengths") - logging.info(counts) - - histogram = [] - for seq_len in range(truncate_seq_len): - histogram.append(len(sequences[seq_len])) - - return sequences, histogram - - -def run_packing(sequences, histogram, output_dir, pack_size, packing_algorithm, seed=0): - logging.info(f"Packing sequences to length {pack_size}...") - - all_seq_lens = [] - for i, count in enumerate(histogram): - all_seq_lens.extend([i] * count) - - packing_fn = globals()[packing_algorithm] - assignments = packing_fn(all_seq_lens, pack_size) - packed_seq_lens = [sum(x) for x in assignments] - packing_factor = len(all_seq_lens) / len(packed_seq_lens) - - logging.info("Packed sequence lengths:") - logging.info(packed_seq_lens) - logging.info( - f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor} <<<<<" + data_cfg = cfg.model.data.train_ds + dataset = GPTSFTDataset( + file_path=data_cfg.file_names[0], + tokenizer=get_nmt_tokenizer(library="sentencepiece", tokenizer_model=cfg.tokenizer_path), + 
max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + pad_seq_length_to_mult=16, # adds padding in collate_fn so this value is irrelevant here + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + add_sep=data_cfg.get('add_sep', False), + sep_id=cfg.get('sep_id', 49704), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + label_key=data_cfg.get('label_key', 'answer'), + answer_only_loss=cfg.get('answer_only_loss', True), + truncation_field=data_cfg.get('truncation_field', 'text'), + pad_to_max_length=data_cfg.get('pad_to_max_length', False), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + prompt_template=data_cfg.get('prompt_template', None), + virtual_tokens=0, + tokens_to_generate=data_cfg.get('tokens_to_generate', 0), + memmap_workers=data_cfg.get('memmap_workers', None), + hf_dataset=data_cfg.get('hf_dataset', False), + truncation_method=data_cfg.get('truncation_method', 'right'), + special_tokens=data_cfg.get('chat_prompt_tokens', None), + is_test=True, ) - ifile_handles = {} - for seq_len in tqdm(range(pack_size + 1)): - per_seq_data = sequences[seq_len] - if len(per_seq_data) > 0: - input_ids = np.array([x['input_ids'] for x in per_seq_data]) - loss_mask = np.array( - [[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data] - ) - perm = np.random.permutation(len(input_ids)) - ifile_handles[seq_len] = (input_ids[perm].tolist(), loss_mask[perm].tolist()) - else: - ifile_handles[seq_len] = [], [] - - os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, f'packed_{pack_size}_seed{seed}.npy') - create_assignment(output_path, assignments, ifile_handles) + return np.array([dataset[i] for i in range(len(dataset))]) @dataclass @@ -194,7 +120,7 @@ class PackingArgs: packing_algorithm: str = "first_fit_shuffle" seed: int = 0 - def from_config(self, cfg): + def from_config(self, cfg: 'DictConfig'): for required_arg in ('output_dir', 'pack_sizes'): assert cfg.get(required_arg, None), f"Please specify +{required_arg}=..." self.output_dir = cfg.output_dir @@ -207,12 +133,20 @@ def from_config(self, cfg): @hydra_runner( config_path="../../examples/nlp/language_modeling/tuning/conf", config_name="megatron_gpt_finetuning_config" ) -def main(cfg) -> None: +def main(cfg: 'DictConfig') -> None: args = PackingArgs().from_config(cfg) dataset = tokenize_dataset(cfg) sequences, histogram = create_hist(dataset, cfg.model.data.train_ds.max_seq_length) for pack_size in args.pack_sizes: - run_packing(sequences, histogram, args.output_dir, pack_size, args.packing_algorithm, args.seed) + assignments = create_packing_strategy(histogram, pack_size, args.packing_algorithm) + output_data = fill_packing_strategy(assignments, sequences, pack_size) + + # save output data + os.makedirs(args.output_dir, exist_ok=True) + output_path = os.path.join(args.output_dir, f'packed_{pack_size}_seed{args.seed}.npy') + np.save(output_path, output_data) + logging.info(f"Done, output written to {output_path}") + logging.info( f""" ✅ Packed datasets with pack sizes {args.pack_sizes} are prepared successfully. @@ -221,7 +155,9 @@ def main(cfg) -> None: > +model.data.train_ds.packed_sequence=True 2. Use the new dataset file instead of the original jsonl file > model.data.train_ds.file_names=/path/to/packed_dataset.npy -3. Adjust the batch sizes. +3. Specify the packed sequence length. This should be one of the ``pack_sizes`` you specified during data preparation. 
+ > model.data.train_ds.max_seq_length= +4. Adjust the batch sizes. Micro batch size has to be set to 1 as a nominal constraint. This is because batches are now concatenated in the preprocessing step. You can increase the pack_size to achieve the same purpose of increasing micro batch size. Global batch size has to be reduced by the average number of sequences per pack `n`, From dddc125227413ce9f84f83515d5b99c82b2279fa Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Fri, 24 May 2024 02:16:23 +0200 Subject: [PATCH 16/47] [Nemo-UX] Move code to collections + fix some small bugs (#9277) * Move io & llm * Run linting * Fix 2 bugs in megatron-strategy * Use teardown inside mistral hf-importer * Fix bug inside HF import * Apply isort and black reformatting Signed-off-by: marcromeyn * Port LLM api * Apply isort and black reformatting Signed-off-by: marcromeyn * fix imports Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: marcromeyn Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: marcromeyn Co-authored-by: Chen Cui Co-authored-by: cuichenx --- nemo/collections/llm/__init__.py | 43 +++++ nemo/collections/llm/api.py | 161 ++++++++++++++++++ nemo/{ => collections}/llm/gpt/__init__.py | 0 nemo/collections/llm/gpt/data/__init__.py | 7 + nemo/{ => collections}/llm/gpt/data/core.py | 4 +- nemo/{ => collections}/llm/gpt/data/dolly.py | 4 +- .../llm/gpt/data/fine_tuning.py | 12 +- nemo/{ => collections}/llm/gpt/data/mock.py | 9 +- .../llm/gpt/data/pre_training.py | 6 +- nemo/{ => collections}/llm/gpt/data/squad.py | 4 +- .../llm/gpt/model}/__init__.py | 15 +- nemo/{ => collections}/llm/gpt/model/base.py | 3 +- .../llm/gpt/model/mistral_7b.py | 12 +- nemo/collections/llm/utils.py | 16 ++ nemo/io/__init__.py | 25 --- nemo/lightning/__init__.py | 6 + nemo/lightning/base.py | 30 +--- nemo/lightning/data.py | 9 +- nemo/lightning/io/__init__.py | 25 +++ nemo/{ => lightning}/io/api.py | 20 +-- nemo/{ => lightning}/io/capture.py | 6 +- nemo/{ => lightning}/io/connector.py | 30 +++- nemo/{ => lightning}/io/mixin.py | 16 +- nemo/{ => lightning}/io/pl.py | 10 +- nemo/{ => lightning}/io/state.py | 20 ++- nemo/lightning/megatron_parallel.py | 55 +++--- .../pytorch/plugins/mixed_precision.py | 15 +- nemo/lightning/pytorch/strategies.py | 21 ++- nemo/lightning/pytorch/trainer.py | 2 +- nemo/llm/gpt/data/__init__.py | 7 - nemo/llm/gpt/model/__init__.py | 12 -- tests/{ => lightning}/io/__init__.py | 0 tests/{ => lightning}/io/test_api.py | 11 +- tests/{ => lightning}/io/test_mixin.py | 2 +- tests/{ => lightning}/io/test_state.py | 3 +- tests/lightning/test_data.py | 28 ++- tests/lightning/test_megatron_parallel.py | 2 +- 37 files changed, 454 insertions(+), 197 deletions(-) create mode 100644 nemo/collections/llm/__init__.py create mode 100644 nemo/collections/llm/api.py rename nemo/{ => collections}/llm/gpt/__init__.py (100%) create mode 100644 nemo/collections/llm/gpt/data/__init__.py rename nemo/{ => collections}/llm/gpt/data/core.py (98%) rename nemo/{ => collections}/llm/gpt/data/dolly.py (97%) rename nemo/{ => collections}/llm/gpt/data/fine_tuning.py (93%) rename nemo/{ => collections}/llm/gpt/data/mock.py (97%) rename nemo/{ => collections}/llm/gpt/data/pre_training.py (97%) rename nemo/{ => collections}/llm/gpt/data/squad.py (97%) rename nemo/{llm => collections/llm/gpt/model}/__init__.py (65%) rename nemo/{ => collections}/llm/gpt/model/base.py (99%) rename nemo/{ => collections}/llm/gpt/model/mistral_7b.py (96%) create mode 100644 
nemo/collections/llm/utils.py delete mode 100644 nemo/io/__init__.py create mode 100644 nemo/lightning/io/__init__.py rename nemo/{ => lightning}/io/api.py (96%) rename nemo/{ => lightning}/io/capture.py (96%) rename nemo/{ => lightning}/io/connector.py (92%) rename nemo/{ => lightning}/io/mixin.py (98%) rename nemo/{ => lightning}/io/pl.py (98%) rename nemo/{ => lightning}/io/state.py (97%) delete mode 100644 nemo/llm/gpt/data/__init__.py delete mode 100644 nemo/llm/gpt/model/__init__.py rename tests/{ => lightning}/io/__init__.py (100%) rename tests/{ => lightning}/io/test_api.py (65%) rename tests/{ => lightning}/io/test_mixin.py (91%) rename tests/{ => lightning}/io/test_state.py (99%) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py new file mode 100644 index 000000000000..0f60fd7438b9 --- /dev/null +++ b/nemo/collections/llm/__init__.py @@ -0,0 +1,43 @@ +# This is here to import it once, which improves the speed of launch when in debug-mode +try: + import transformer_engine # noqa +except ImportError: + pass + +from nemo.collections.llm.api import export_ckpt, import_ckpt, pretrain, train, validate +from nemo.collections.llm.gpt.data import ( + DollyDataModule, + FineTuningDataModule, + MockDataModule, + PreTrainingDataModule, + SquadDataModule, +) +from nemo.collections.llm.gpt.model import ( + GPTConfig, + GPTModel, + MaskedTokenLossReduction, + Mistral7BConfig, + Mistral7BModel, + gpt_data_step, + gpt_forward_step, +) + +__all__ = [ + "MockDataModule", + "GPTModel", + "GPTConfig", + "gpt_data_step", + "gpt_forward_step", + "MaskedTokenLossReduction", + "Mistral7BConfig", + "Mistral7BModel", + "PreTrainingDataModule", + "FineTuningDataModule", + "SquadDataModule", + "DollyDataModule", + "train", + "import_ckpt", + "export_ckpt", + "pretrain", + "validate", +] diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py new file mode 100644 index 000000000000..824d84ffb461 --- /dev/null +++ b/nemo/collections/llm/api.py @@ -0,0 +1,161 @@ +from pathlib import Path +from typing import Callable, Optional + +import pytorch_lightning as pl + +from nemo.collections.llm.utils import task +from nemo.lightning import MegatronStrategy, Trainer, io, teardown + + +@task(namespace="llm") +def train( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + tokenizer: Optional[str] = None, + source: Optional[str] = None, + export: Optional[str] = None, +) -> Path: + """ + Trains a model using the specified data and trainer, with optional tokenizer, source, and export. + + Args: + model (pl.LightningModule): The model to be trained. + data (pl.LightningDataModule): The data module containing training data. + trainer (Trainer): The trainer instance configured with a MegatronStrategy. + tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'. + source (Optional[str]): Path to a checkpoint from which to continue training. + export (Optional[str]): Filename to save the exported checkpoint after training. + + Returns + ------- + Path: The directory path where training artifacts are saved. + + Raises + ------ + ValueError: If the trainer's strategy is not MegatronStrategy. 
+ + Examples + -------- + >>> model = MyModel() + >>> data = MyDataModule() + >>> trainer = Trainer(strategy=MegatronStrategy()) + >>> train(model, data, trainer, tokenizer='data', source='path/to/ckpt.ckpt', export='final.ckpt') + PosixPath('/path/to/log_dir') + """ + if not isinstance(trainer.strategy, MegatronStrategy): + raise ValueError("Only MegatronStrategy is supported") + + fit_kwargs = {} + run_dir = Path(trainer.logger.log_dir) + export_dir = run_dir / "export" + + if hasattr(train, "__io__"): + _save_config_img(run_dir, train.__io__) + + if tokenizer: # TODO: Improve this + _use_tokenizer(model, data, tokenizer) + if source: + _add_ckpt_path(source, model, fit_kwargs) + + trainer.fit(model, data, **fit_kwargs) + + print(f"Saving checkpoint to: {export_dir}") + trainer.save_checkpoint(export_dir) + + if export and trainer.strategy.is_global_zero: + teardown(trainer, model=model) + print(f"Exporting checkpoint to: {export_dir / export}") + export_ckpt(export_dir, export) + + return run_dir + + +@task(namespace="llm") +def pretrain( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + source: Optional[str] = None, + # export: Optional[str] = None +) -> Path: + return train(model=model, data=data, trainer=trainer, tokenizer="data", source=source) + + +@task(namespace="llm") +def validate( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + tokenizer: Optional[str] = None, + source: Optional[str] = None, + export: Optional[str] = None, +) -> Path: + if not isinstance(trainer.strategy, MegatronStrategy): + raise ValueError("Only MegatronStrategy is supported") + + validate_kwargs = {} + run_dir = Path(trainer.logger.log_dir) + export_dir = run_dir / "export" + + if tokenizer: # TODO: Improve this + _use_tokenizer(model, data, tokenizer) + if source: + _add_ckpt_path(source, model, validate_kwargs) + + trainer.validate(model, data, **validate_kwargs) + trainer.save_checkpoint(export_dir) + if export: + teardown(trainer) + del trainer, model, data + export_ckpt(export_dir, export) + + return run_dir + + +@task(name="import", namespace="llm") +def import_ckpt( + model: pl.LightningModule, + source: str, + output_path: Optional[Path] = None, + overwrite: bool = False, +) -> Path: + return io.import_ckpt(model=model, source=source, output_path=output_path, overwrite=overwrite) + + +def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnector: + return io.load_ckpt(path).model.exporter(target, path) + + +@task(name="export", namespace="llm") +def export_ckpt( + path: Path, + target: str, + output_path: Optional[Path] = None, + overwrite: bool = False, + load_connector: Callable[[Path, str], io.ModelConnector] = load_connector_from_trainer_ckpt, +) -> Path: + return io.export_ckpt(path, target, output_path, overwrite, load_connector) + + +def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: str) -> None: + if tokenizer == "data": + model.tokenizer = data.tokenizer + elif tokenizer == "model": + data.tokenizer = model.tokenizer + + +def _add_ckpt_path(source, model, kwargs) -> None: + if io.is_distributed_ckpt(source): + kwargs["ckpt_path"] = source + else: + kwargs["ckpt_path"] = model.import_ckpt(source) + + +def _save_config_img(*args, **kwargs): + try: + from nemo_sdk.utils import save_config_img + + save_config_img(*args, **kwargs) + except ImportError: + pass diff --git a/nemo/llm/gpt/__init__.py b/nemo/collections/llm/gpt/__init__.py similarity index 100% rename from 
nemo/llm/gpt/__init__.py rename to nemo/collections/llm/gpt/__init__.py diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py new file mode 100644 index 000000000000..f83da73c987b --- /dev/null +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -0,0 +1,7 @@ +from nemo.collections.llm.gpt.data.dolly import DollyDataModule +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule + +__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] diff --git a/nemo/llm/gpt/data/core.py b/nemo/collections/llm/gpt/data/core.py similarity index 98% rename from nemo/llm/gpt/data/core.py rename to nemo/collections/llm/gpt/data/core.py index c8ce328c1e0b..8d99583016a4 100644 --- a/nemo/llm/gpt/data/core.py +++ b/nemo/collections/llm/gpt/data/core.py @@ -32,7 +32,7 @@ def create_sft_dataset( truncation_method: str = 'right', memmap_workers: int = 2, hf_dataset: bool = False, - **kwargs + **kwargs, ) -> "GPTSFTDataset": from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset @@ -53,5 +53,5 @@ def create_sft_dataset( index_mapping_dir=index_mapping_dir, prompt_template=prompt_template, truncation_method=truncation_method, - **kwargs + **kwargs, ) diff --git a/nemo/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py similarity index 97% rename from nemo/llm/gpt/data/dolly.py rename to nemo/collections/llm/gpt/data/dolly.py index 2e3dcaffbf0a..9632a142eb35 100644 --- a/nemo/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -5,8 +5,8 @@ import numpy as np from datasets import load_dataset -from nemo.llm.gpt.data.core import get_dataset_root -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.core import get_dataset_root +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule from nemo.utils import logging if TYPE_CHECKING: diff --git a/nemo/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py similarity index 93% rename from nemo/llm/gpt/data/fine_tuning.py rename to nemo/collections/llm/gpt/data/fine_tuning.py index 1e4ab0432847..1be5c41e4919 100644 --- a/nemo/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -5,8 +5,8 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler -from nemo.llm.gpt.data.core import create_sft_dataset if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec @@ -74,7 +74,13 @@ def val_dataloader(self) -> DataLoader: return self._create_dataloader(self._create_dataset(str(self.validation_path))) def test_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.test_path), tokens_to_generate=32, is_test=True,)) + return self._create_dataloader( + self._create_dataset( + str(self.test_path), + tokens_to_generate=32, + is_test=True, + ) + ) @lru_cache def _create_dataset(self, path, **kwargs): @@ -89,7 +95,7 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: pin_memory=self.pin_memory, persistent_workers=self.persistent_workers, collate_fn=dataset.collate_fn, - 
**kwargs + **kwargs, ) @property diff --git a/nemo/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py similarity index 97% rename from nemo/llm/gpt/data/mock.py rename to nemo/collections/llm/gpt/data/mock.py index ff035a78453d..ccc1acfd6a2a 100644 --- a/nemo/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -74,7 +74,12 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: class _MockGPTDataset(Dataset): def __init__( - self, tokenizer: "TokenizerSpec", name: str, num_samples: int, seq_length: int, seed: int = 42, + self, + tokenizer: "TokenizerSpec", + name: str, + num_samples: int, + seq_length: int, + seed: int = 42, ) -> None: super().__init__() self.name = name @@ -118,7 +123,7 @@ def _collate_fn(self, batch): def collate_fn(self, batch): """Method that user pass as functor to DataLoader. - + The method optionally performs neural type checking and add types to the outputs. Please note, subclasses of Dataset should not implement `input_types`. diff --git a/nemo/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py similarity index 97% rename from nemo/llm/gpt/data/pre_training.py rename to nemo/collections/llm/gpt/data/pre_training.py index d5d05955078b..80e099290b1d 100644 --- a/nemo/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from megatron.core.datasets.gpt_dataset import GPTDatasetConfig + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -82,7 +83,10 @@ def setup(self, stage: str = "") -> None: train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples] self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTDataset, train_valid_test_num_samples, is_built_on_rank=lambda: True, config=self.gpt_dataset_config, + GPTDataset, + train_valid_test_num_samples, + is_built_on_rank=lambda: True, + config=self.gpt_dataset_config, ).build() # uncomment once fabric API is merged diff --git a/nemo/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py similarity index 97% rename from nemo/llm/gpt/data/squad.py rename to nemo/collections/llm/gpt/data/squad.py index c5235905b4ed..77d48da98a0e 100644 --- a/nemo/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -4,8 +4,8 @@ from datasets import DatasetDict, load_dataset -from nemo.llm.gpt.data.core import get_dataset_root -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.core import get_dataset_root +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule from nemo.utils import logging if TYPE_CHECKING: diff --git a/nemo/llm/__init__.py b/nemo/collections/llm/gpt/model/__init__.py similarity index 65% rename from nemo/llm/__init__.py rename to nemo/collections/llm/gpt/model/__init__.py index a05c96f60944..fcb78d6cd397 100644 --- a/nemo/llm/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -1,21 +1,18 @@ -from nemo.llm.gpt.data import MockDataModule -from nemo.llm.gpt.model import ( +from nemo.collections.llm.gpt.model.base import ( GPTConfig, GPTModel, MaskedTokenLossReduction, - Mistral7BConfig, - Mistral7BModel, gpt_data_step, gpt_forward_step, ) +from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel __all__ = [ - "MockDataModule", - "GPTModel", "GPTConfig", - "gpt_data_step", - "gpt_forward_step", - "MaskedTokenLossReduction", + "GPTModel", "Mistral7BConfig", "Mistral7BModel", + 
"MaskedTokenLossReduction", + "gpt_data_step", + "gpt_forward_step", ] diff --git a/nemo/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py similarity index 99% rename from nemo/llm/gpt/model/base.py rename to nemo/collections/llm/gpt/model/base.py index 7aaac96fdc4f..c6db9b8cbd80 100644 --- a/nemo/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -7,8 +7,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from torch.optim import Optimizer -from nemo import io -from nemo.lightning import get_vocab_size +from nemo.lightning import get_vocab_size, io from nemo.lightning.base import ModelConfig from nemo.lightning.megatron_parallel import MaskedTokenLossReduction diff --git a/nemo/llm/gpt/model/mistral_7b.py b/nemo/collections/llm/gpt/model/mistral_7b.py similarity index 96% rename from nemo/llm/gpt/model/mistral_7b.py rename to nemo/collections/llm/gpt/model/mistral_7b.py index 83d3b3412a39..e0035a086fbe 100644 --- a/nemo/llm/gpt/model/mistral_7b.py +++ b/nemo/collections/llm/gpt/model/mistral_7b.py @@ -5,8 +5,8 @@ import torch import torch.nn.functional as F -from nemo import io -from nemo.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.lightning import io, teardown if TYPE_CHECKING: from transformers import MistralConfig, MistralForCausalLM @@ -21,7 +21,7 @@ class Mistral7BConfig(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = True + apply_query_key_layer_scaling: bool = False # TODO: Should this be True? num_layers: int = 32 hidden_size: int = 4096 @@ -56,6 +56,9 @@ def apply(self, output_path: Path) -> Path: self.convert_state(source, target) self.nemo_save(output_path, trainer) + teardown(trainer, target) + del trainer, target + return output_path def convert_state(self, source, target): @@ -90,11 +93,12 @@ def make_vocab_size_divisible_by(mistral_vocab_size): return base output = Mistral7BConfig( - seq_length=source.max_position_embeddings, + seq_length=source.sliding_window, num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.max_position_embeddings, init_method_std=source.initializer_range, layernorm_epsilon=source.rms_norm_eps, num_query_groups=source.num_key_value_heads, diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py new file mode 100644 index 000000000000..848a83f5dc08 --- /dev/null +++ b/nemo/collections/llm/utils.py @@ -0,0 +1,16 @@ +from typing import Any, Callable, TypeVar + +T = TypeVar('T', bound=Callable[..., Any]) + + +def task(*args: Any, **kwargs: Any) -> Callable[[T], T]: + try: + import nemo_sdk as sdk + + return sdk.task(*args, **kwargs) + except ImportError: + # Return a no-op function + def noop_decorator(func: T) -> T: + return func + + return noop_decorator diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py deleted file mode 100644 index 1b541ff7ba34..000000000000 --- a/nemo/io/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from nemo.io.api import export_ckpt, import_ckpt, load, load_ckpt, model_exporter, model_importer -from nemo.io.capture import reinit -from nemo.io.connector import Connector, ModelConnector -from nemo.io.mixin import ConnectorMixin, IOMixin -from nemo.io.pl import TrainerCheckpoint, is_distributed_ckpt -from nemo.io.state import 
TransformCTX, apply_transforms, state_transform - -__all__ = [ - "apply_transforms", - "Connector", - "ConnectorMixin", - "IOMixin", - "import_ckpt", - "is_distributed_ckpt", - "export_ckpt", - "load", - "load_ckpt", - "ModelConnector", - "model_importer", - "model_exporter", - 'reinit', - "state_transform", - "TrainerCheckpoint", - "TransformCTX", -] diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index afbdb39f42d4..e54f223f91cc 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -3,6 +3,12 @@ from lightning_fabric.plugins.environments import slurm from pytorch_lightning import plugins as _pl_plugins +# This is here to import it once, which improves the speed of launch when in debug-mode +try: + import transformer_engine # noqa +except ImportError: + pass + from nemo.lightning.base import get_vocab_size, teardown from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py index 65bc1310f426..9cf2d9a44f35 100644 --- a/nemo/lightning/base.py +++ b/nemo/lightning/base.py @@ -9,7 +9,7 @@ from pytorch_lightning import LightningModule, Trainer from torch import nn -from nemo import io +from nemo.lightning import io DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo" NEMO_CACHE_HOME = Path(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME)) @@ -18,27 +18,7 @@ DEFAULT_NEMO_MODELS_CACHE = NEMO_CACHE_HOME / "models" NEMO_MODELS_CACHE = Path(os.getenv("NEMO_MODELS_CACHE", DEFAULT_NEMO_MODELS_CACHE)) -# -# @dataclass -# class DataConfig: -# seq_length: int -# micro_batch_size: int = 4 -# global_batch_size: int = 8 -# rampup_batch_size: Optional[List[int]] = None -# train_drop_last: bool = True -# val_drop_last: bool = True -# test_drop_last: bool = True -# num_workers: int = 8 -# pin_memory: bool = True -# persistent_workers: bool = False -# -# @property -# def num_microbatches(self) -> int: -# from apex.transformer.pipeline_parallel.utils import get_num_microbatches -# -# return get_num_microbatches() -# -# + ModelT = TypeVar("ModelT", bound=LightningModule) @@ -66,7 +46,11 @@ def init(self, *args, data=None, cpu: bool = False, **kwargs) -> ModelT: return model -def get_vocab_size(config, vocab_size: int, make_vocab_size_divisible_by: int = 128,) -> int: +def get_vocab_size( + config, + vocab_size: int, + make_vocab_size_divisible_by: int = 128, +) -> int: from nemo.utils import logging after = vocab_size diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 794300db72f0..88e2f3436699 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -20,7 +20,10 @@ def create_dataloader( def setup_microbatch_calculator( - global_rank: int, micro_batch_size: int, global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, + global_rank: int, + micro_batch_size: int, + global_batch_size: int, + rampup_batch_size: Optional[List[int]] = None, ) -> None: """ Initializes the data for distributed training by setting up the microbatch calculator @@ -41,7 +44,6 @@ def setup_microbatch_calculator( """ from nemo.lightning._strategy_lib import NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE - from nemo.utils import AppState app_state = AppState() @@ -189,8 +191,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... 
class MegatronPretrainingSampler(BaseMegatronSampler): diff --git a/nemo/lightning/io/__init__.py b/nemo/lightning/io/__init__.py new file mode 100644 index 000000000000..d1a193c5e728 --- /dev/null +++ b/nemo/lightning/io/__init__.py @@ -0,0 +1,25 @@ +from nemo.lightning.io.api import export_ckpt, import_ckpt, load, load_ckpt, model_exporter, model_importer +from nemo.lightning.io.capture import reinit +from nemo.lightning.io.connector import Connector, ModelConnector +from nemo.lightning.io.mixin import ConnectorMixin, IOMixin +from nemo.lightning.io.pl import TrainerCheckpoint, is_distributed_ckpt +from nemo.lightning.io.state import TransformCTX, apply_transforms, state_transform + +__all__ = [ + "apply_transforms", + "Connector", + "ConnectorMixin", + "IOMixin", + "import_ckpt", + "is_distributed_ckpt", + "export_ckpt", + "load", + "load_ckpt", + "ModelConnector", + "model_importer", + "model_exporter", + 'reinit', + "state_transform", + "TrainerCheckpoint", + "TransformCTX", +] diff --git a/nemo/io/api.py b/nemo/lightning/io/api.py similarity index 96% rename from nemo/io/api.py rename to nemo/lightning/io/api.py index c8fe3c04a811..9af1d3d2a9d6 100644 --- a/nemo/io/api.py +++ b/nemo/lightning/io/api.py @@ -5,8 +5,8 @@ import fiddle as fdl import pytorch_lightning as pl -from nemo.io.mixin import ConnectorMixin, ConnT, ModelConnector -from nemo.io.pl import TrainerCheckpoint +from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector +from nemo.lightning.io.pl import TrainerCheckpoint CkptType = TypeVar("CkptType") @@ -128,14 +128,14 @@ def import_ckpt( path for the imported checkpoint; if not provided, the importer's default path will be used. The 'overwrite' parameter enables the replacement of existing data at the output path, which is useful when updating models with new data and discarding old checkpoint files. - - For instance, using `import_ckpt(Mistral7BModel(), "hf")` initiates the import process - by searching for a registered model importer tagged with "hf". In NeMo, `HFMistral7BImporter` + + For instance, using `import_ckpt(Mistral7BModel(), "hf")` initiates the import process + by searching for a registered model importer tagged with "hf". In NeMo, `HFMistral7BImporter` is registered under this tag via: - `@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1")`. - This links `Mistral7BModel` to `HFMistral7BImporter`, designed for HuggingFace checkpoints. - The importer then processes and integrates these checkpoints into `Mistral7BModel` for further - fine-tuning. + `@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1")`. + This links `Mistral7BModel` to `HFMistral7BImporter`, designed for HuggingFace checkpoints. + The importer then processes and integrates these checkpoints into `Mistral7BModel` for further + fine-tuning. Args: model (pl.LightningModule): The model into which the checkpoint will be imported. @@ -188,7 +188,7 @@ def export_ckpt( ) -> Path: """ Exports a checkpoint from a model using the model's associated exporter, typically for - the purpose of sharing a model that has been fine-tuned or customized within NeMo. + the purpose of sharing a model that has been fine-tuned or customized within NeMo. This function leverages the ConnectorMixin interface to seamlessly integrate the model's state into an external checkpoint format. 
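Pieced together from the ``api.py`` docstrings above, the intended import flow looks roughly like the sketch below. This is illustrative only: it assumes the registered "hf" importer and its default path (``mistralai/Mistral-7B-v0.1``) are reachable, and the location of the resulting checkpoint depends on the importer's cache settings.

.. code-block:: python

    # Sketch based on the docstrings in this diff, not a verbatim example from the patch.
    from nemo.collections import llm

    # Task-level API: convert a Hugging Face checkpoint into NeMo format and
    # return the path of the imported checkpoint.
    ckpt_path = llm.import_ckpt(model=llm.Mistral7BModel(), source="hf")

    # ConnectorMixin route from the mixin docstring: returns a model initialized
    # through the same registered "hf" importer.
    model = llm.Mistral7BModel.import_from("hf")

``export_ckpt`` is the mirror operation: it takes a NeMo checkpoint path plus a target tag and hands off to the exporter registered for that tag.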
diff --git a/nemo/io/capture.py b/nemo/lightning/io/capture.py similarity index 96% rename from nemo/io/capture.py rename to nemo/lightning/io/capture.py index 2a65d18c15e3..910506f13147 100644 --- a/nemo/io/capture.py +++ b/nemo/lightning/io/capture.py @@ -42,14 +42,12 @@ def wrapper(*args, **kwargs): @runtime_checkable class IOProtocol(Protocol, Generic[SelfT]): @property - def __io__(self) -> fdl.Config[SelfT]: - ... + def __io__(self) -> fdl.Config[SelfT]: ... @runtime_checkable class ReInitProtocol(Protocol, Generic[SelfT]): - def reinit(self) -> SelfT: - ... + def reinit(self) -> SelfT: ... def reinit(configurable: IOProtocol[SelfT]) -> SelfT: diff --git a/nemo/io/connector.py b/nemo/lightning/io/connector.py similarity index 92% rename from nemo/io/connector.py rename to nemo/lightning/io/connector.py index bf5f88f95992..cd77abf9dc1c 100644 --- a/nemo/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -29,19 +29,19 @@ class Connector(BasePath, Generic[SourceT, TargetT]): ------- init() -> TargetT: Should be implemented to initialize the target type from the source type. - + apply(output_path: Path) -> Path: Should be implemented to apply the transformation and save the result at the output path. - + __new__(cls, *args, **kwargs) -> 'Connector': Creates a new instance of the connector, using default_path if no path is provided. - + __call__(output_path: Optional[Path] = None, overwrite: bool = False) -> Path: Processes the transformation and handles file operations like overwriting. - + local_path(base_path: Optional[Path] = None) -> Path: Computes the local path for storage based on a base path or a default cache home. - + is_in_cache(base_path: Optional[Path] = None) -> bool: Checks if the transformed data is already cached at the specified base path. """ @@ -96,10 +96,10 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): ------- nemo_setup(model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: Sets up the model and trainer using a specified strategy, preparing it for training or inference. - + nemo_save(output_path: Path, trainer: pl.Trainer): Saves the model's state to the specified path using the trainer's current strategy. - + nemo_load(path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True) -> Tuple[Any, pl.Trainer]: Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. """ @@ -118,7 +118,9 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = """ from nemo.lightning import MegatronStrategy, Trainer - _trainer = trainer or Trainer(devices=1, accelerator="cpu", strategy=MegatronStrategy()) + _trainer = trainer or Trainer( + devices=1, accelerator="cpu", strategy=MegatronStrategy(store_optimizer_states=False) + ) _trainer.strategy.connect(model) _trainer.strategy.setup_environment() @@ -156,8 +158,8 @@ def nemo_load( ------- Tuple[pl.LightningModule, pl.Trainer]: The loaded model and the trainer configured with the model. 
""" - from nemo.io.api import load_ckpt from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib + from nemo.lightning.io.api import load_ckpt model = load_ckpt(path).model _trainer = trainer or Trainer(devices=1, accelerator="cpu" if cpu else "gpu", strategy=MegatronStrategy()) @@ -177,3 +179,13 @@ def nemo_load( _trainer.strategy.load_checkpoint(path) return model, _trainer + + def local_path(self, base_path: Optional[Path] = None) -> Path: + if base_path: + _base = base_path + else: + from nemo.lightning.base import NEMO_MODELS_CACHE + + _base = Path(NEMO_MODELS_CACHE) + + return _base / str(self).replace("://", "/") diff --git a/nemo/io/mixin.py b/nemo/lightning/io/mixin.py similarity index 98% rename from nemo/io/mixin.py rename to nemo/lightning/io/mixin.py index bba6677b452b..b5ee76a2fe03 100644 --- a/nemo/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -8,8 +8,8 @@ from cloudpickle import dump from typing_extensions import Self -from nemo.io.capture import IOProtocol -from nemo.io.connector import ModelConnector +from nemo.lightning.io.capture import IOProtocol +from nemo.lightning.io.connector import ModelConnector ConnT = TypeVar('ConnT', bound=ModelConnector) @@ -35,8 +35,8 @@ class IOMixin: Examples -------- - from nemo import io - + from nemo.lightning import io + class ExampleClass(io.IOMixin): def __init__(self, param1, param2): super().__init__() @@ -46,7 +46,7 @@ def __init__(self, param1, param2): # Creating an instance of ExampleClass example = ExampleClass('value1', 'value2') example_copy = io.reinit(example) - + Note: For more information on `fdl.Config`, refer to the Fiddle library documentation at @@ -168,9 +168,9 @@ def import_from(cls, path: str) -> Self: Args: path (str): The path to the model file to be imported. - + Example: - from nemo import llm + from nemo.collections import llm model = llm.Mistral7BModel.import_from("hf") Returns @@ -285,7 +285,7 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa @classmethod def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: """ - Retrieves the appropriate model connector based on the file extension and path, + Retrieves the appropriate model connector based on the file extension and path, distinguishing between importers and exporters. Args: diff --git a/nemo/io/pl.py b/nemo/lightning/io/pl.py similarity index 98% rename from nemo/io/pl.py rename to nemo/lightning/io/pl.py index ba9b5be72cab..fba94f5e3a55 100644 --- a/nemo/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -11,8 +11,8 @@ from torch import nn from typing_extensions import Self, override -from nemo.io.capture import IOProtocol -from nemo.io.mixin import IOMixin +from nemo.lightning.io.capture import IOProtocol +from nemo.lightning.io.mixin import IOMixin if TYPE_CHECKING: from nemo.lightning.pytorch.strategies import MegatronStrategy @@ -53,11 +53,9 @@ def construct_extra(cls, strategy: "MegatronStrategy") -> Dict[str, Any]: class TrainerCkptProtocol(Protocol): @classmethod - def from_strategy(cls, strategy: "MegatronStrategy") -> Self: - ... + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: ... - def io_dump(self, output: Path): - ... + def io_dump(self, output: Path): ... 
class MegatronCheckpointIO(CheckpointIO): diff --git a/nemo/io/state.py b/nemo/lightning/io/state.py similarity index 97% rename from nemo/io/state.py rename to nemo/lightning/io/state.py index d978cd0ade8e..ed481cfcfe08 100644 --- a/nemo/io/state.py +++ b/nemo/lightning/io/state.py @@ -26,11 +26,11 @@ def apply_transforms( transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = None, ) -> TargetModuleT: """ - Applies a series of transformations to adapt the state dictionary of a source module to + Applies a series of transformations to adapt the state dictionary of a source module to match the structure of a target module's state dictionary. This function renames keys according to a provided mapping and modifies values using a list - of transformation functions. Each transformation function typically is decorated + of transformation functions. Each transformation function typically is decorated with `io.state_transform`. Args: @@ -91,7 +91,12 @@ def scale_weights(ctx): _target = target.module target_state = _target.state_dict() - ctx = TransformCTX(source=_source, source_state=_source.state_dict(), target=_target, target_state=target_state,) + ctx = TransformCTX( + source=_source, + source_state=_source.state_dict(), + target=_target, + target_state=target_state, + ) for key, val in mapping.items(): ctx = StateDictTransform(key, val)(ctx) @@ -349,16 +354,15 @@ def _match_keys(keys: List[str], pattern: str) -> np.ndarray: @overload def state_transform( - source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], -) -> Callable[[F], StateDictTransform[F]]: - ... + source_key: Union[str, Tuple[str, ...], Dict[str, str]], + target_key: Union[str, Tuple[str, ...]], +) -> Callable[[F], StateDictTransform[F]]: ... @overload def state_transform( source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], fn: F -) -> StateDictTransform[F]: - ... +) -> StateDictTransform[F]: ... def state_transform( diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 899f2fb2c06c..8106b83a41d1 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -31,11 +31,9 @@ @runtime_checkable class PrecisionPluginProtocol(Protocol[DataT]): - def convert_input(self, data: DataT) -> DataT: - ... + def convert_input(self, data: DataT) -> DataT: ... - def convert_output(self, output: torch.Tensor) -> torch.Tensor: - ... + def convert_output(self, output: torch.Tensor) -> torch.Tensor: ... 
def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT: @@ -122,7 +120,7 @@ def __init__( if vp_size is not None: if len(_pipeline) == 1 and parallel_state.get_pipeline_model_parallel_world_size() > 1: - from nemo import io + from nemo.lightning import io parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size) for i in range(1, vp_size): @@ -212,7 +210,10 @@ def forward( if wrap_forward_step: _data_step = data_step or self.data_step forward_step_func = self.wrapped_forward_step( - _forward_step, data_step=_data_step, loss_reduction=loss_reduction, context=context, + _forward_step, + data_step=_data_step, + loss_reduction=_loss_reduction, + context=context, ) else: forward_step_func = _forward_step @@ -259,7 +260,11 @@ def forward( return loss_mean def wrapped_forward_step( - self, forward_step, loss_reduction, context, data_step, + self, + forward_step, + loss_reduction, + context, + data_step, ) -> Callable[[nn.Module, DataT], Tuple[torch.Tensor, "MegatronCallbackProtocol"]]: """The method wraps the forward step function and returns a callable. @@ -309,7 +314,11 @@ def wrapped_forward_step_func(dataloader_iter, model): # callback self._setup_module( - forward_callback, batch=batch, model=self, forward_module=model, tensor=output_tensor, + forward_callback, + batch=batch, + model=self, + forward_module=model, + tensor=output_tensor, ) if self.precision_plugin and parallel_state.is_pipeline_last_stage(): @@ -728,29 +737,21 @@ def __contains__(self, callback_object) -> bool: class CallbackMethods: - def on_megatron_step_start(self, *args, **kwargs) -> None: - ... + def on_megatron_step_start(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_start(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_start(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_callback(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_callback(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_end(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_end(self, *args, **kwargs) -> None: ... - def on_megatron_reduce_microbatches_start(self, *args, **kwargs) -> None: - ... + def on_megatron_reduce_microbatches_start(self, *args, **kwargs) -> None: ... - def on_megatron_reduce_microbatches_end(self, *args, **kwargs) -> None: - ... + def on_megatron_reduce_microbatches_end(self, *args, **kwargs) -> None: ... - def on_megatron_log_step_end(self, *args, **kwargs) -> None: - ... + def on_megatron_log_step_end(self, *args, **kwargs) -> None: ... - def on_megatron_step_end(self, *args, **kwargs) -> None: - ... + def on_megatron_step_end(self, *args, **kwargs) -> None: ... ReductionT = TypeVar("ReductionT") @@ -778,8 +779,7 @@ def reduce(self, losses_reduced_per_micro_batch: Sequence[ReductionT]) -> torch. @runtime_checkable class MegatronCallbackProtocol(Protocol): - def __call__(self, tensor: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - ... + def __call__(self, tensor: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: ... @runtime_checkable @@ -796,8 +796,7 @@ def __call__( decoder_seq_length: Optional[int] = None, forward_only: bool = False, collect_non_loss_data: bool = False, - ) -> list: - ... + ) -> list: ... 
def _calc_number_of_params(model: List[nn.Module]) -> int: diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index af7054526957..6c3d556816d2 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -27,11 +27,16 @@ class MegatronMixedPrecision(MixedPrecision): - def __init__(self, precision: Literal["16-mixed", "bf16-mixed"], amp_O2: bool = True, device="cuda",) -> None: + def __init__( + self, + precision: Literal["16-mixed", "bf16-mixed"], + amp_O2: bool = False, + device="cuda", + ) -> None: if precision == "bf16-mixed": scaler = None else: - scaler = GradScaler(init_scale=2 ** 32, growth_interval=1000, hysteresis=2) + scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2) super().__init__(precision, device, scaler) @@ -94,7 +99,11 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2: return optimizer - return MainParamsOptimizerWrapper(optimizer, fp32_grad_accum=True, contiguous_grad_bucket=True,) + return MainParamsOptimizerWrapper( + optimizer, + fp32_grad_accum=True, + contiguous_grad_bucket=True, + ) def convert_input(self, data: AnyT) -> AnyT: """Convert model inputs (forward) to the floating point precision type of this plugin. diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 65986b2a4855..c002ecf7fd68 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -27,9 +27,8 @@ from torch.utils.data import DataLoader from typing_extensions import override -from nemo import io -from nemo.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol -from nemo.lightning import _strategy_lib +from nemo.lightning import _strategy_lib, io +from nemo.lightning.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import MegatronProgressBar @@ -63,6 +62,7 @@ def __init__( find_unused_parameters: bool = False, enable_nemo_ckpt_io: bool = True, ckpt_type: TrainerCkptProtocol = TrainerCheckpoint, + ckpt_include_optimizer: bool = False, lazy_init: bool = False, **kwargs, ) -> None: @@ -83,6 +83,7 @@ def __init__( self.enable_nemo_ckpt_io = enable_nemo_ckpt_io self.ckpt_type = ckpt_type self.lazy_init = lazy_init + self.ckpt_include_optimizer = ckpt_include_optimizer # used in NVIDIA NGC PyTorch containers _strategy_lib.enable_nvidia_optimizations() @@ -174,6 +175,7 @@ def setup_distributed(self) -> None: super().setup_distributed() from megatron.core import parallel_state + from nemo.utils import AppState # init model parallel if needed @@ -227,6 +229,7 @@ def configure_ddp(self) -> None: def _setup_model(self, model: nn.Module) -> DistributedDataParallel: """Only called when we need to wrap the model for pytorch's ddp.""" from megatron.core import parallel_state + from nemo.utils import AppState app_state = AppState() @@ -345,10 +348,10 @@ def optimizer_sharded_state_dict(self): def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: - checkpoint['state_dict'] = OrderedDict([]) # remove device state_dict - checkpoint['sharded_state_dict'] = self.megatron_parallel.sharded_state_dict() + checkpoint["state_dict"] = OrderedDict([]) # remove 
device state_dict + checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() if self.trainer.state.fn == TrainerFn.FITTING: - checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] + checkpoint["optimizer_states"] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) if self.enable_nemo_ckpt_io and self.is_global_zero and self.ckpt_type: @@ -367,9 +370,9 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: sharded_state_dict = {} sharded_state_dict["state_dict"] = self.megatron_parallel.sharded_state_dict() - # if self.trainer.state.fn == TrainerFn.FITTING: - # if self.lightning_module.optimizers(use_pl_optimizer=False): - # sharded_state_dict["optimizer_states"] = [self.optimizer_sharded_state_dict()] + if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + if self.lightning_module.optimizers(use_pl_optimizer=False): + sharded_state_dict["optimizer_states"] = [self.optimizer_sharded_state_dict()] checkpoint = self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=sharded_state_dict) diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index da04a93eef05..b4483d4af4b9 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -4,7 +4,7 @@ import pytorch_lightning as pl from typing_extensions import Self -from nemo.io.mixin import IOMixin +from nemo.lightning.io.mixin import IOMixin class Trainer(pl.Trainer, IOMixin): diff --git a/nemo/llm/gpt/data/__init__.py b/nemo/llm/gpt/data/__init__.py deleted file mode 100644 index 1c1c9ce5d525..000000000000 --- a/nemo/llm/gpt/data/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from nemo.llm.gpt.data.dolly import DollyDataModule -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule -from nemo.llm.gpt.data.mock import MockDataModule -from nemo.llm.gpt.data.pre_training import PreTrainingDataModule -from nemo.llm.gpt.data.squad import SquadDataModule - -__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] diff --git a/nemo/llm/gpt/model/__init__.py b/nemo/llm/gpt/model/__init__.py deleted file mode 100644 index 05c3e9928fab..000000000000 --- a/nemo/llm/gpt/model/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from nemo.llm.gpt.model.base import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step -from nemo.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel - -__all__ = [ - "GPTConfig", - "GPTModel", - "Mistral7BConfig", - "Mistral7BModel", - "MaskedTokenLossReduction", - "gpt_data_step", - "gpt_forward_step", -] diff --git a/tests/io/__init__.py b/tests/lightning/io/__init__.py similarity index 100% rename from tests/io/__init__.py rename to tests/lightning/io/__init__.py diff --git a/tests/io/test_api.py b/tests/lightning/io/test_api.py similarity index 65% rename from tests/io/test_api.py rename to tests/lightning/io/test_api.py index d4c317bf2e9f..9872d0860193 100644 --- a/tests/io/test_api.py +++ b/tests/lightning/io/test_api.py @@ -1,6 +1,6 @@ -from nemo import io from nemo import lightning as nl -from nemo import llm +from nemo.collections import llm +from nemo.lightning import io class TestLoad: @@ -8,7 +8,12 @@ def test_reload_ckpt(self, tmpdir): trainer = nl.Trainer(devices=1, accelerator="cpu", strategy=nl.MegatronStrategy()) # model = llm.Mistral7BModel() model = llm.GPTModel( - 
llm.GPTConfig(num_layers=2, hidden_size=1024, ffn_hidden_size=4096, num_attention_heads=8,) + llm.GPTConfig( + num_layers=2, + hidden_size=1024, + ffn_hidden_size=4096, + num_attention_heads=8, + ) ) ckpt = io.TrainerCheckpoint(model, trainer) diff --git a/tests/io/test_mixin.py b/tests/lightning/io/test_mixin.py similarity index 91% rename from tests/io/test_mixin.py rename to tests/lightning/io/test_mixin.py index ed898d435609..824608db6bf0 100644 --- a/tests/io/test_mixin.py +++ b/tests/lightning/io/test_mixin.py @@ -1,4 +1,4 @@ -from nemo import io +from nemo.lightning import io class DummyClass(io.IOMixin): diff --git a/tests/io/test_state.py b/tests/lightning/io/test_state.py similarity index 99% rename from tests/io/test_state.py rename to tests/lightning/io/test_state.py index bb5dc4a9af3d..f368f3ce02ce 100644 --- a/tests/io/test_state.py +++ b/tests/lightning/io/test_state.py @@ -1,7 +1,7 @@ import pytest from torch import nn -from nemo.io.state import StateDictTransform, TransformCTX, state_transform +from nemo.lightning.io.state import StateDictTransform, TransformCTX, state_transform class TestStateDictTransform: @@ -141,6 +141,7 @@ def test_transform_with_tuple_target_key_and_multiple_outputs(self, mock_multi_t Test transformation where the target_key is a tuple and the transform function returns multiple values that are then unrolled to these target keys. """ + # Define a transformation that splits the input into two parts def split_transform(ctx, x): return x - 1, x + 1 diff --git a/tests/lightning/test_data.py b/tests/lightning/test_data.py index e3143b6da03c..7acdcc91b486 100644 --- a/tests/lightning/test_data.py +++ b/tests/lightning/test_data.py @@ -6,11 +6,15 @@ 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_finetuning_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import FineTuningDataModule + from nemo.collections.llm.gpt.data import FineTuningDataModule dataset_root = 'random_root' datamodule = FineTuningDataModule( - dataset_root, seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234, + dataset_root, + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, ) datamodule.train_dataloader() @@ -21,9 +25,14 @@ def test_finetuning_module(mock_gpt_sft_dataset) -> None: 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_dolly_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import DollyDataModule + from nemo.collections.llm.gpt.data import DollyDataModule - datamodule = DollyDataModule(seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234,) + datamodule = DollyDataModule( + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, + ) datamodule.train_dataloader() mock_gpt_sft_dataset.assert_called_once() @@ -33,9 +42,14 @@ def test_dolly_module(mock_gpt_sft_dataset) -> None: 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_squad_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import SquadDataModule + from nemo.collections.llm.gpt.data import SquadDataModule - datamodule = SquadDataModule(seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234,) + datamodule = SquadDataModule( + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, + ) datamodule.train_dataloader() mock_gpt_sft_dataset.assert_called_once() @@ 
-45,7 +59,7 @@ def test_squad_module(mock_gpt_sft_dataset) -> None: # @patch('megatron.core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder') # @patch('nemo.lightning.pytorch.trainer.Trainer') # def test_pretraining_module(mock_pretraining_dataset_builder, mock_trainer) -> None: -# from nemo.llm.gpt.data import PreTrainingDataModule +# from nemo.collections.llm.gpt.data import PreTrainingDataModule # # datamodule = PreTrainingDataModule( # path=Path('random_path'), diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 877e6a39a976..31d20170c0b6 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -103,7 +103,7 @@ def test_init_with_virtual_pipeline(self, mocker, mock_pipeline): mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=True) mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size') mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank') - mocker.patch('nemo.io.reinit', return_value=mock_pipeline) + mocker.patch('nemo.lightning.io.reinit', return_value=mock_pipeline) megatron_parallel = mp.MegatronParallel(mock_pipeline, vp_size=2, cpu=True) From a2a75c5da06b21a24e83328cb55e7cb017d9faa4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 19:06:24 -0700 Subject: [PATCH 17/47] Fix typo in HF tutorial (#9302) (#9304) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index 1771d65c5e50..73a8ebc29ee3 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -916,7 +916,7 @@ { "cell_type": "code", "source": [ - "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"v2\")" + "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"_v2\")" ], "metadata": { "id": "WDgwrr2aQyUS" From cde0b2b226fb519798008f96b0b70d271e503d49 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Fri, 24 May 2024 13:28:16 -0700 Subject: [PATCH 18/47] Expand documentation for data parallelism and distributed optimizer (#9227) * Add distributed optimizer to docs Signed-off-by: Tim Moon * Debug RST table Signed-off-by: Tim Moon * Review suggestions from @jgerh Signed-off-by: Tim Moon * Copyedits and formatting changes Signed-off-by: Tim Moon Co-authored-by: Tim Moon Co-authored-by: Jennifer Gerhold --------- Signed-off-by: Tim Moon Co-authored-by: Jennifer Gerhold --- docs/source/features/parallelisms.rst | 112 ++++++++++++++++++++------ 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index d5e86e46a49d..4cc493f40024 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,22 +3,87 @@ Parallelisms ------------ -NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitrarily): +NeMo Megatron supports five types of parallelism (which can be mixed together arbitrarily). 
+ +Data Parallelism +^^^^^^^^^^^^^^^^ + +Data Parallelism (DP) creates identical copies of the model across +multiple GPUs. Data batches are distributed between GPUs so that the +GPUs can process them independently. While compute is efficiently +distributed between GPUs, communication is required in order to keep +the model copies consistent with each other. Distributed Data Parallelism -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Distributed Data Parallelism (DDP) creates idential copies of the model across multiple GPUs. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Distributed Data Parallelism (DDP) keeps model copies consistent by +synchronizing parameter gradients before each optimization step. More +specifically, it sums gradients over all model copies using an +all-reduce communication collective. .. image:: ../nlp/nemo_megatron/images/ddp.gif :align: center :width: 800px :alt: Distributed Data Parallel +Distributed Optimizer (ZeRO-1) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ZeRO-1 algorithm keeps model copies consistent by sharding the +optimizer state between GPUs. During each optimization step, the +parameter gradients are first summed and sharded (with a +reduce-scatter collective), each GPU applies an optimization to its +local shard of the parameters, and the updated parameter shards are +broadcast to update all of the model copies (with an all-gather +collective). This approach is attractive for large models since +sharding the optimizer state can significantly reduce its memory +footprint on individual GPUs. It also has, in theory, the same +communication volume as DDP and its communication pattern has more +opportunities for overlapping with compute. + +Enable Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~ + +DDP is the default parallelism scheme when NeMo is run on multiple +GPUs. Enabling other parallelism schemes in the model configuration +will decrease the size of the DP group, that is the number of +identical model copies. + +To enable the distributed optimizer, set +``model.optim.name=distributed_fused_adam`` in the model +configuration. It can be configured with the following options: + +=========================== ========= ================================================================================================================================== +Option Default Description +=========================== ========= ================================================================================================================================== +``dtype`` fp32 Optimizer state datatype +``grad_sync_dtype`` ``dtype`` Gradient reduce-scatter datatype +``overlap_grad_sync`` True Overlap gradient reduce-scatter with compute +``overlap_param_sync`` False Overlap parameter all-gather with compute +``bucket_cap_mb`` 100 Buffer size (in MiB) for internal state and workspaces. Larger buckets have lower runtime overheads but may increase memory usage. +``contiguous_param_buffer`` False Allocate parameters as views into a large buffer. Helps avoid some data copies. +``contiguous_grad_buffer`` True Allocate parameter gradients as views into a large buffer. Helps avoid some data copies. +=========================== ========= ================================================================================================================================== + +See the keyword arguments in `Apex DistributedFusedAdam `_ and `NeMo MegatronDistributedFusedAdam `_ for a full list of distributed optimizer options. 
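For orientation, a minimal sketch of what these overrides could look like when applied programmatically with OmegaConf is shown below; the same keys live under ``model.optim`` in the training YAML. The option names come from the table above, the values are illustrative, and the usual optimizer fields (learning rate, weight decay, and so on) are omitted.

.. code-block:: python

    from omegaconf import OmegaConf

    # Illustrative override of the optimizer section only; merge it into a full
    # NeMo training config before launching training.
    optim_overrides = OmegaConf.create(
        {
            "model": {
                "optim": {
                    "name": "distributed_fused_adam",
                    "overlap_grad_sync": True,   # overlap gradient reduce-scatter with compute
                    "overlap_param_sync": True,  # overlap parameter all-gather with compute
                    "bucket_cap_mb": 100,        # buffer size in MiB
                }
            }
        }
    )
    # cfg = OmegaConf.merge(cfg, optim_overrides)  # cfg: the loaded training config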
+ +Implement Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DDP in NeMo either uses PyTorch +`DistributedDataParallel `_ +(default) or a custom implementation (if custom multi-precision +training is enabled with ``megatron_amp_O2``). + +The distributed optimizer in NeMo is built on top of +`DistributedFusedAdam `_ +from Apex. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ -**Tensor Parallelism (TP)** is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. +Tensor Parallelism (TP) is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. .. image:: ../nlp/nemo_megatron/images/tp.gif :align: center @@ -31,7 +96,8 @@ Enable Tensor Parallelism To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. **For Tensor Parallelism**: - - Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. + +Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. .. code-block:: yaml @@ -49,7 +115,7 @@ For detailed API usage and additional configurations, consult the `Megatron Core Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ -**Pipeline Parallelism (PP)** is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. +Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. .. image:: ../nlp/nemo_megatron/images/pp.gif :align: center @@ -63,7 +129,8 @@ Enable Pipeline Parallelism To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. **For Pipeline Parallelism**: - - Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. + +Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. .. code-block:: yaml @@ -74,7 +141,7 @@ Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. +The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ -**Sequence Parallelism** extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. 
+Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. .. image:: ../nlp/nemo_megatron/images/sp.gif :align: center @@ -113,12 +180,12 @@ For further information on configuration, refer to the following documentation: Implement Sequence Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo's implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. +The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. Context Parallelism ^^^^^^^^^^^^^^^^^^^ -**Context Parallelism (CP)** is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. +Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. Enable Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -126,7 +193,8 @@ Enable Context Parallelism To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. **For Context Parallelism**: - - Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. + +Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. .. code-block:: yaml @@ -137,18 +205,16 @@ The configuration can be found and modified here: `NeMo Megatron Core Context Co Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo leverages functionalities from both Megatron Core and transformer-engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. - -Additionally, NeMo's CP supports integration with various forms of model parallelism such as TP (Tensor Parallelism), PP (Pipeline Parallelism), and DP (Data Parallelism), ensuring broad usability and flexibility in large-scale model training environments. +NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. 
During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. Visit our source code for more insights into the implementation: -- Megatron Core transformer engine: `Megatron Core `_ -- Transformer Engine repository: `Transformer Engine Code `_ +- `Megatron Core wrappers for Transformer Engine `_ +- `Transformer Engine attention modules `_ Expert Parallelism ^^^^^^^^^^^^^^^^^^ -**Expert Parallelism (EP)** is a type of model parallelism that distributes experts of an MoE across GPUs. +Expert Parallelism (EP) is a type of model parallelism that distributes experts of an MoE across GPUs. .. image:: ../nlp/nemo_megatron/images/ep.png :align: center @@ -158,9 +224,7 @@ Expert Parallelism Enable Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable it users can pass ``model.expert_model_parallel_size=k``, where k is an integer with the desired -expert parallelism level, for example if the model has three experts (i.e. ``model.num_moe_experts=3``), we can specify -k=3 (i.e. via CLI using ``model.expert_model_parallel_size=3``). The number of experts should be exactly divisible by the ``expert_model_parallel_size``. +To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. .. code-block:: yaml @@ -172,13 +236,13 @@ For further information on configuration, refer to the following documentation: Implement Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo's expert parallelism functionality is provided by Megatron-LM repository, please consult the corresponding `Moe-layer `_ for more moe implementation details. +The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. Parallelism nomenclature ^^^^^^^^^^^^^^^^^^^^^^^^ -When reading and modifying NeMo Megatron code you will encounter the following terms. +The following figure illustrates some terms that you may encounter in the NeMo Megatron codebase. .. 
image:: ../nlp/nemo_megatron/images/pnom.gif
    :align: center
    :width: 800px

From c3f19e928bb040351b58f66b5642030a5aea14df Mon Sep 17 00:00:00 2001
From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Date: Fri, 24 May 2024 13:28:31 -0700
Subject: [PATCH 19/47] Update flash attention section in memory_optimizations.rst (#9188)

* Update flash attention section in memory_optimizations.rst

Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com>

* update changes based on comments

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

---------

Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com>
Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
---
 docs/source/features/memory_optimizations.rst | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst
index d87cb1e191ca..4d363670fedf 100644
--- a/docs/source/features/memory_optimizations.rst
+++ b/docs/source/features/memory_optimizations.rst
@@ -11,14 +11,26 @@ Flash Attention
 Overview
 ^^^^^^^^
 
-Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. Flash Attention is an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high-bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms.
+Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences.
+
+Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency.
+
+The tiling technique decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step.
+
+The recomputation technique stores the softmax normalization factors (linear to sequence length), instead of the softmax results (quadratic to sequence length), and uses these normalization factors to recompute the attention scores. This reduces the amount of data written to global memory, lowering both the memory requirement and the I/O traffic between global memory and shared memory.
+
+Flash attention lowers the memory footprint and computational complexity from quadratic to linear, and greatly extends the range of sequence lengths allowed in large language models.
+
+The flash attention algorithm was first proposed `here `_. Two of its implementations are `flash-attention `_ by Tri Dao *et al.*, and `fused flash attention `_ by NVIDIA cuDNN.
 
 Turn Flash Attention On and Off
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-In the NeMo Framework, Flash Attention is supported through the Transformer Engine with the inclusion of Flash Attention 2.
By default, Flash Attention is enabled, but the Transformer Engine may switch to a different kernel if the tensor dimensions are not optimal for Flash Attention. Users can completely disable Flash Attention by setting the environment variable ``NVTE_FLASH_ATTN=0``. +In the NeMo framework, flash attention is supported through `Transformer Engine `_, including both of the implementations mentioned above. Transformer Engine selects the appropriate implementation based on input information such as sequence length, number of heads and head dimension. When both implementations are applicable, Transformer Engine prefers cuDNN flash attention on Hopper+ architectures and Tri Dao flash attention on Ampere architectures. + +To disable Tri Dao flash attention, set the environment variable ``NVTE_FLASH_ATTN=0``. To disable cuDNN flash attention, set ``NVTE_FUSED_ATTN=0``. -For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. +For more details on the Dot Product Attention backends supported in Transformer Engine, please refer to the source code at `Transformer Engine's Attention Mechanism `_. Activation Recomputation ------------------------ @@ -28,15 +40,15 @@ Overview Full Activation Recomputation """"""""""""""""""""""""""""" -This method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. +The full activation recomputation method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. Partial Activation Recomputation """""""""""""""""""""""""""""""" -This method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. +The partial activation recomputation method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. Selective Activation Recomputation """""""""""""""""""""""""""""""""" -This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. +The selective activation recomputation method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198. 
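As a rough illustration of how these recomputation modes are usually selected in a NeMo megatron model configuration, the sketch below uses the ``activations_checkpoint_*`` keys found in the example GPT configs. Treat the exact key names and values as assumptions to check against your NeMo version rather than settings taken from this document.

.. code-block:: yaml

    model:
      # Selective recomputation: checkpoint only the memory-heavy attention
      # activations and recompute them in the backward pass.
      activations_checkpoint_granularity: selective

      # Alternative: full recomputation of a number of transformer layers.
      # activations_checkpoint_granularity: full
      # activations_checkpoint_method: block    # or 'uniform'
      # activations_checkpoint_num_layers: 4    # layers recomputed per pipeline stage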
From 251cf66910a24a76ed39ac8e66c9387e5ebfa7fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 25 May 2024 00:31:25 +0200 Subject: [PATCH 20/47] Install alerting (#9311) * ci: Send Slack alerts on CI failure Signed-off-by: Oliver Koenig * chore: Set live Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/scripts/slackHelper.sh | 23 +++++++++++++++++++++++ .github/workflows/cicd-main.yml | 21 +++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 .github/scripts/slackHelper.sh diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh new file mode 100644 index 000000000000..4696cebcf13b --- /dev/null +++ b/.github/scripts/slackHelper.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +function sendSlackMessage() { + + WEBHOOK_URL="$1" + PIPELINE_URL="$2" + + curl -X POST -H "Content-type: application/json" --data "{ + \"blocks\": [ + { + \"type\": \"section\", + \"text\": { + \"type\": \"mrkdwn\", + \"text\": \"\ +🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: + +\" + } + } + ] + }" $WEBHOOK_URL + +} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index dbc7d907580a..53e92e976240 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -6482,9 +6482,8 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - Nemo_CICD_Test: - needs: + needs: - L0_Unit_Tests_GPU - L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Llama @@ -6598,8 +6597,22 @@ jobs: - L2_TTS_Fast_dev_runs_1_Mixer-TTS - L2_TTS_Fast_dev_runs_1_Hifigan - Speech_Checkpoints_tests - + if: always() runs-on: ubuntu-latest steps: # This should depend on all the tests so we block/unblock based on all tests passing - - run: exit 0 + - if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - if: ${{ contains(needs.*.result, 'failure') }} + name: Checkout repository + uses: actions/checkout@v4 + + - if: ${{ contains(needs.*.result, 'failure') }} + run: | + source .github/scripts/slackHelper.sh + + WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }} + PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + + sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" From 1fa961ba03ab5f8c91b278640e29807079373372 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 25 May 2024 00:18:43 -0400 Subject: [PATCH 21/47] typos (#9314) (#9315) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao --- tutorials/00_NeMo_Primer.ipynb | 2 +- tutorials/asr/ASR_Confidence_Estimation.ipynb | 4 ++-- tutorials/asr/ASR_Context_Biasing.ipynb | 2 +- tutorials/asr/Speech_Commands.ipynb | 4 ++-- tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 50aa60260b35..07d7f6b46539 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -588,7 +588,7 @@ "id": "U7Eezf_sAVS0" }, "source": [ - "You might wonder why we didnt explicitly set `citrinet.cfg.optim = cfg.optim`. \n", + "You might wonder why we didn't explicitly set `citrinet.cfg.optim = cfg.optim`. \n", "\n", "This is because the `setup_optimization()` method does it for you! You can still update the config manually." 
] diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb index eb8cd7b11688..9b925adbd777 100644 --- a/tutorials/asr/ASR_Confidence_Estimation.ipynb +++ b/tutorials/asr/ASR_Confidence_Estimation.ipynb @@ -284,7 +284,7 @@ " eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n", " ):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", @@ -307,7 +307,7 @@ " current_word_line = \"\"\n", " for word, score in zip(transcript_list, confidence_scores):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index dd2e8176ad33..7171510f4e0d 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -361,7 +361,7 @@ "source": [ "## Create a context-biasing list\n", "\n", - "Now, we need to select the words, recognition of wich we want to improve by CTC-WS context-biasing.\n", + "Now, we need to select the words, recognition of which we want to improve by CTC-WS context-biasing.\n", "Usually, we select only nontrivial words with the lowest recognition accuracy.\n", "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n", "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index 58b719a867fa..438533f0f03a 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -1431,10 +1431,10 @@ "# Lets change the scheduler\n", "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n", "\n", - "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n", + "# \"power\" isn't applicable to CosineAnnealing so let's remove it\n", "optim_sched_cfg.sched.pop('power')\n", "\n", - "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n", + "# \"hold_ratio\" isn't applicable to CosineAnnealing, so let's remove it\n", "optim_sched_cfg.sched.pop('hold_ratio')\n", "\n", "# Set \"min_lr\" to lower value\n", diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index 675fdfd5351c..608685254a0d 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -749,7 +749,7 @@ "source": [ "### Optimizing Threshold\n", "\n", - "As mentioned above, when classifiying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. 
\n", + "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n", "\n", "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." ] From c39204d67c5b28f63bc5b9eed30a4c93002c1584 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 28 May 2024 01:28:03 -0700 Subject: [PATCH 22/47] call set_expert_model_parallel_world_size instead of set_cpu_expert_model_parallel_world_size (#9275) Signed-off-by: Alexandros Koumparoulis --- scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index dbcbb80a7fda..ca9e44f82922 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -233,7 +233,7 @@ def convert(in_file, precision=None) -> None: if __name__ == '__main__': args = get_args() - parallel_state.set_cpu_expert_model_parallel_world_size(1) + parallel_state.set_expert_model_parallel_world_size(1) hf_state_dict, nemo_config = convert(args.input_name_or_path, args.precision) config = load_config(args.hf_model_name, nemo_config) From 7a8da171ed072433db9d615cc0eca132bc8351ca Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 28 May 2024 12:57:08 -0500 Subject: [PATCH 23/47] conv1d stable version (#9330) --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 9fd75ad8a95a..494a9ab6d672 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -1,6 +1,6 @@ accelerated-scan boto3 -causal-conv1d>=1.2.0 +causal-conv1d==1.2.0.post2 einops faiss-cpu fasttext From 5f7b0304f23cbcb64d92d9f511a22bffb7a5cb28 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 28 May 2024 14:05:20 -0700 Subject: [PATCH 24/47] FP8 feature documentation (#9265) * Create fp8.rst Signed-off-by: Kirthi Shankar Sivamani * Update fp8.rst Signed-off-by: Kirthi Shankar Sivamani * add fp8_params Signed-off-by: Kirthi Shankar Sivamani * Update mixed_precision.rst Signed-off-by: Kirthi Shankar Sivamani * review comments Signed-off-by: Kirthi Shankar Sivamani * rm file Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/source/features/mixed_precision.rst | 42 ++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/docs/source/features/mixed_precision.rst 
b/docs/source/features/mixed_precision.rst
index d193752e5475..ba0dfb4e945b 100644
--- a/docs/source/features/mixed_precision.rst
+++ b/docs/source/features/mixed_precision.rst
@@ -4,3 +4,45 @@ Mixed Precision Training
 ------------------------
 
 Mixed precision training significantly enhances computational efficiency by conducting operations in half-precision and fp8 formats, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly.
+
+
+FP8 usage
+=========
+
+Overview
+^^^^^^^^
+
+The NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8-related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_.
+
+.. list-table:: FP8 arguments
+   :widths: 25 75
+   :header-rows: 1
+
+   * - Argument
+     - Description
+   * - transformer_engine
+     - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored.
+   * - fp8
+     - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support.
+   * - fp8_e4m3
+     - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format.
+   * - fp8_hybrid
+     - Training recipe format for FP8. Activations and weight tensors use the E4M3 format, whereas gradients use the E5M2 format to satisfy the additional dynamic range requirement for backward tensors. This is the default setting.
+   * - fp8_margin
+     - The scaling factor for FP8 tensors can be shifted by a factor of ``2^margin`` using this argument.
+   * - fp8_amax_history_len
+     - Window size for amax history. The window size determines how many instances of the most recent absolute max values (amaxes) are stored per tensor.
+   * - fp8_amax_compute_algo
+     - The choice between “max” and “most_recent” specifies how to select an amax value from the given history.
+   * - reduce_amax
+     - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations.
+   * - fp8_params
+     - Indicates whether or not to store module-level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE.
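To make the table above concrete, a minimal sketch of how these arguments might appear in a NeMo model configuration is shown below. The argument names come from the table; their placement under ``model`` and the specific values are illustrative assumptions rather than recommended settings.

.. code-block:: yaml

    model:
      transformer_engine: True      # use Transformer Engine modules
      fp8: True                     # run supported matrix multiplications in FP8
      fp8_e4m3: False               # pure E4M3 recipe disabled in favor of the hybrid recipe
      fp8_hybrid: True              # E4M3 for activations/weights, E5M2 for gradients
      fp8_margin: 0                 # shift scaling factors by 2^margin
      fp8_amax_history_len: 1024    # window size for amax history (illustrative)
      fp8_amax_compute_algo: max    # or most_recent
      reduce_amax: True             # keep scaling factors synchronized across GPUs
      fp8_params: True              # store module-level parameters in FP8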
+ +Resources +^^^^^^^^^ + +- `TE documentation `_ +- `Intro to FP8, floating point formats, and mixed precision training `_ +- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE +- `TE installation `_ From 8a8c45319ef9e2a0e803918c6bb09745341d2647 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 28 May 2024 15:17:43 -0700 Subject: [PATCH 25/47] comment out flaky tests (#9333) --- .github/workflows/cicd-main.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 53e92e976240..1e977a7e717d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -133,7 +133,7 @@ jobs: # chmod -R 777 . - L0_Unit_Tests_GPU: + OPTIONAL_L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] runs-on: self-hosted-azure container: @@ -325,7 +325,7 @@ jobs: # this test is using a 7B model which is too large for GitHub CI # replace the model in this test with a toy model or move the test # to the nightly CI - # L2_Community_LLM_Checkpoints_tests_Baichuan2: + # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: # needs: [cicd-test-container-setup] # runs-on: self-hosted-azure # container: @@ -6484,12 +6484,12 @@ jobs: Nemo_CICD_Test: needs: - - L0_Unit_Tests_GPU + #- OPTIONAL_L0_Unit_Tests_GPU - L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon - #- L2_Community_LLM_Checkpoints_tests_Baichuan2 + #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2 - ASR_dev_run_Speech_to_Text - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet - ASR_dev_run_Speech_Pre-training_-_CitriNet From 136aeee276568122745f29a1d58de30c207df0a1 Mon Sep 17 00:00:00 2001 From: Eduardo Vellasques Date: Wed, 29 May 2024 04:10:31 +0200 Subject: [PATCH 26/47] fix typos in convert_mixtral_nemo_to_hf.py and convert_starcoder2_nemo_to_hf.py (#9325) Signed-off-by: evellasques --- scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py | 3 ++- scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index ca9e44f82922..58311d0324c2 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -61,7 +61,7 @@ def load_config(hf_model_name, nemo_config): hf_config.num_key_value_heads = nemo_config.num_query_groups hf_config.num_local_experts = nemo_config.num_moe_experts assert hf_config.num_local_experts > 0, "num_experts must be greater than zero." - hf_config.num_experts_per_tok = nemo_config.num_experts_per_token + hf_config.num_experts_per_tok = nemo_config.moe_router_topk assert hf_config.num_experts_per_tok > 0, "num_experts_per_token must be greater than zero." 
if nemo_config.activation == 'fast-swiglu': hf_config.activation = 'silu' @@ -122,6 +122,7 @@ def convert(in_file, precision=None) -> None: embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' state_dict[hf_embed_weight_name] = param_to_weights(ckpt[embed_weights_base_name]) + head_num = model.cfg.num_attention_heads if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: num_query_groups = head_num else: diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py index b7b85ee826a8..043d1fd35261 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py @@ -266,7 +266,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: config = load_config(args.hf_model_name, nemo_config) model = AutoModelForCausalLM.from_config(config) model.load_state_dict(hf_state_dict, strict=True) - model.save_pretrained(args.out_file) + model.save_pretrained(args.output_path) hf_tokenizer = AutoTokenizer.from_pretrained('bigcode/starcoder2-tokenizer') hf_tokenizer.save_pretrained(args.output_path) logging.info(f'HF checkpoint saved to: {args.output_path}') From a1173eb1884969812a20d58d5be4ccf73b09b036 Mon Sep 17 00:00:00 2001 From: Deva Kumar Gajulamandyam <37027138+gdevakumar@users.noreply.github.com> Date: Wed, 29 May 2024 01:25:48 -0700 Subject: [PATCH 27/47] typos fixed in READMe.rst (#9322) Signed-off-by: Deva Kumar Gajulamandyam --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 0b05bd0390f8..a2c595d62137 100644 --- a/README.rst +++ b/README.rst @@ -108,7 +108,7 @@ Latest News Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and pytorch developers +NVIDIA NeMo Framework is a generative AI framework built for researchers and PyTorch developers working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), and text-to-speech synthesis (TTS). The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia @@ -219,8 +219,8 @@ The NeMo Framework can be installed in a variety of ways, depending on your need * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for isntallation instructions. - * It's higly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for installation instructions. + * It's highly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` Conda ~~~~~ @@ -452,9 +452,9 @@ Megatron Core ~~~~~~~~~~~~~ The NeMo LLM Multimodal Domains require that NVIDIA Megatron Core to be installed. -Megatron core is a library for scaling large transfromer base models. +Megatron core is a library for scaling large transformer base models. NeMo LLM and Multimodal models leverage Megatron Core for model parallelism, -transformer architectures, and optimized pytorch datasets. +transformer architectures, and optimized PyTorch datasets. 
NeMo LLM and Multimodal may need Megatron Core to be updated to a recent version. From cff6b95e74f9048409584092f9891e8de2f455d5 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 29 May 2024 09:36:00 -0700 Subject: [PATCH 28/47] Fix trainer builder when exp_manager is not in config (#9293) * fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * rollback changes Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../modules/stable_diffusion/attention.py | 8 +- .../diffusionmodules/model.py | 11 +- .../diffusionmodules/openaimodel.py | 124 ++++++++++++------ .../stable_diffusion/diffusionmodules/util.py | 19 ++- .../nlp/parts/megatron_trainer_builder.py | 4 +- 5 files changed, 117 insertions(+), 49 deletions(-) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index c70b59d39481..2eeed97db781 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -122,7 +122,11 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, use_te=Fal if use_te: activation = 'gelu' if not glu else 'geglu' # TODO: more parameters to be confirmed, dropout, seq_length - self.net = LayerNormMLP(hidden_size=dim, ffn_hidden_size=inner_dim, activation=activation,) + self.net = LayerNormMLP( + hidden_size=dim, + ffn_hidden_size=inner_dim, + activation=activation, + ) else: norm = nn.LayerNorm(dim) project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) @@ -264,7 +268,7 @@ def __init__( self.query_dim = query_dim self.dim_head = dim_head - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 self.heads = heads self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py index 644efafaf06a..5b874f5f10ad 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py @@ -233,7 +233,10 @@ def __init__( # timestep embedding self.temb = nn.Module() self.temb.dense = nn.ModuleList( - [torch.nn.Linear(self.ch, self.temb_ch), torch.nn.Linear(self.temb_ch, self.temb_ch),] + [ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ] ) # downsampling @@ -669,7 +672,11 @@ def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): ] ) - self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1,) + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) def forward(self, x): x = self.conv_in(x) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 3e301f0b8fc1..30ff0e1a9ff3 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -115,10 +115,14 @@ class AttentionPool2d(nn.Module): """ def __init__( - self, spacial_dim: int, embed_dim: int, 
num_heads_channels: int, output_dim: int = None, + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, ): super().__init__() - self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) + self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) self.num_heads = embed_dim // num_heads_channels @@ -332,7 +336,10 @@ def __init__( self.emb_layers = None self.exchange_temb_dims = False else: - self.emb_layers = nn.Sequential(nn.SiLU(), linear(emb_channels, self.emb_out_channels),) + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(emb_channels, self.emb_out_channels), + ) self.out_layers = nn.Sequential( normalization(self.out_channels, act="silu", gn_groups=resblock_gn_groups), nn.Dropout(p=dropout), @@ -400,7 +407,12 @@ class AttentionBlock(nn.Module): """ def __init__( - self, channels, num_heads=1, num_head_channels=-1, use_checkpoint=False, use_new_attention_order=False, + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, ): super().__init__() self.channels = channels @@ -451,7 +463,7 @@ def count_flops_attn(model, _x, y): # We perform two matmuls with the same number of ops. # The first computes the weight matrix, the second computes # the combination of the value vectors. - matmul_ops = 2 * b * (num_spatial ** 2) * c + matmul_ops = 2 * b * (num_spatial**2) * c model.total_ops += th.DoubleTensor([matmul_ops]) @@ -653,7 +665,10 @@ def __init__( if num_attention_blocks is not None: assert len(num_attention_blocks) == len(self.num_res_blocks) assert all( - map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)),) + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], + range(len(num_attention_blocks)), + ) ) logging.info( f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
" @@ -674,7 +689,9 @@ def __init__( self.predict_codebook_ids = n_embed is not None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.time_embeddings = torch.Tensor(build_timestep_embedding(model_channels, timesteps)) @@ -691,7 +708,9 @@ def __init__( self.label_emb = nn.Sequential( Timestep(model_channels), nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ), ) elif self.num_classes == "sequential": @@ -699,7 +718,9 @@ def __init__( self.adm_in_channels = adm_in_channels self.label_emb = nn.Sequential( nn.Sequential( - linear(adm_in_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(adm_in_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) ) else: @@ -810,26 +831,28 @@ def __init__( use_scale_shift_norm=use_scale_shift_norm, resblock_gn_groups=resblock_gn_groups, ), - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) - if not use_spatial_transformer - else SpatialTransformer( - ch, - num_heads, - dim_head, - depth=transformer_depth_middle, - context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, - use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint, - use_flash_attention=use_flash_attention, - use_te=self.use_te_fp8, - lora_network_alpha=lora_network_alpha, + ( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth_middle, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint, + use_flash_attention=use_flash_attention, + use_te=self.use_te_fp8, + lora_network_alpha=lora_network_alpha, + ) ), ResBlock( ch, @@ -1123,9 +1146,15 @@ def te_fp8_key_mapping(self, unet_dict): # norm_to_q.layer_norm_{weight|bias} -> norm.{weight|bias} # norm_to_q.weight -> to_q.weight new_key = key.replace('attn1.norm.', 'attn1.norm_to_q.layer_norm_') - new_key = new_key.replace('attn1.to_q.weight', 'attn1.norm_to_q.weight',) + new_key = new_key.replace( + 'attn1.to_q.weight', + 'attn1.norm_to_q.weight', + ) new_key = new_key.replace('attn2.norm.', 'attn2.norm_to_q.layer_norm_') - new_key = new_key.replace('attn2.to_q.weight', 'attn2.norm_to_q.weight',) + new_key = new_key.replace( + 'attn2.to_q.weight', + 'attn2.norm_to_q.weight', + ) ### LayerNormMLP # ff.net.layer_norm_{weight|bias} -> ff.net.0.{weight|bias} @@ -1214,7 +1243,10 @@ def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from unexpected_keys = list(set(loaded_keys) - set(expected_keys)) def _find_mismatched_keys( - state_dict, model_state_dict, loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, ): mismatched_keys = [] if ignore_mismatched_sizes: @@ -1234,7 +1266,10 @@ def _find_mismatched_keys( if state_dict is not None: # Whole checkpoint mismatched_keys = 
_find_mismatched_keys( - state_dict, model_state_dict, original_loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, ) error_msgs = self._load_state_dict_into_model(state_dict) return missing_keys, unexpected_keys, mismatched_keys, error_msgs @@ -1329,9 +1364,14 @@ def _forward(self, x, timesteps=None, context=None, y=None, **kwargs): return self.out(h) def forward(self, x, timesteps=None, context=None, y=None, **kwargs): - with transformer_engine.pytorch.fp8_autocast( - enabled=self.use_te_fp8, fp8_recipe=self.fp8_recipe, - ) if self.use_te_fp8 else nullcontext(): + with ( + transformer_engine.pytorch.fp8_autocast( + enabled=self.use_te_fp8, + fp8_recipe=self.fp8_recipe, + ) + if self.use_te_fp8 + else nullcontext() + ): out = self._forward(x, timesteps, context, y, **kwargs) return out @@ -1387,7 +1427,9 @@ def __init__( time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.input_blocks = nn.ModuleList( @@ -1489,11 +1531,15 @@ def __init__( elif pool == "attention": assert num_head_channels != -1 self.out = nn.Sequential( - normalization(ch), nn.SiLU(), AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), ) elif pool == "spatial": self.out = nn.Sequential( - nn.Linear(self._feature_size, 2048), nn.ReLU(), nn.Linear(2048, self.out_channels), + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), ) elif pool == "spatial_v2": self.out = nn.Sequential( diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py index 53f9669a0b8f..69700a43614e 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py @@ -44,7 +44,7 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): if schedule == "linear": - betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 + betas = torch.linspace(linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64) ** 2 elif schedule == "cosine": timesteps = torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s @@ -169,7 +169,10 @@ def backward(ctx, *output_grads): shallow_copies = [x.view_as(x) for x in ctx.input_tensors] output_tensors = ctx.run_function(*shallow_copies) input_grads = torch.autograd.grad( - output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True, + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, ) del ctx.input_tensors del ctx.input_params @@ -319,7 +322,11 @@ def interpolate_fn(x, xp, yp): start_idx = torch.where( torch.eq(x_idx, 0), torch.tensor(1, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) start_x = torch.gather(sorted_all_x, dim=2, 
index=start_idx.unsqueeze(2)).squeeze(2) @@ -327,7 +334,11 @@ def interpolate_fn(x, xp, yp): start_idx2 = torch.where( torch.eq(x_idx, 0), torch.tensor(0, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index f6336f6bcc71..194168008dc4 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -146,7 +146,7 @@ def _plugins(self) -> list: use_dist_ckpt = not self.cfg.model.get('fsdp', False) and ( self.cfg.model.get('mcore_gpt', False) or self.cfg.model.get('mcore_bert', False) ) - async_save = self.cfg.exp_manager.get('checkpoint_callback_params', {}).get('async_save', False) + async_save = self.cfg.get('exp_manager', {}).get('checkpoint_callback_params', {}).get('async_save', False) if use_dist_ckpt: checkpoint_io = DistributedCheckpointIO.from_config(self.cfg.model, async_save) if async_save: @@ -171,7 +171,7 @@ def _callbacks(self, callbacks: Optional[list]) -> list: if 'enable_progress_bar' not in self.cfg.trainer or self.cfg.trainer.enable_progress_bar: callbacks.append(CustomProgressBar()) - if self.cfg.exp_manager.get('checkpoint_callback_params', {}).get('async_save', False): + if self.cfg.get('exp_manager', {}).get('checkpoint_callback_params', {}).get('async_save', False): callbacks.append(AsyncFinalizerCallback()) return callbacks From 962b846be205562b047c3c4842cbb3db3757677e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Wei=C3=9F?= Date: Wed, 29 May 2024 18:56:47 +0200 Subject: [PATCH 29/47] Update README.rst to clarify installation via Conda (#9323) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Simon Weiß --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index a2c595d62137..121c82b8590f 100644 --- a/README.rst +++ b/README.rst @@ -240,6 +240,8 @@ Install PyTorch using their `configurator Date: Wed, 29 May 2024 11:00:00 -0700 Subject: [PATCH 30/47] [Nemo CICD] update flaky test (#9339) * comment out flaky tests * optional test should not cancel workflow --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1e977a7e717d..b924cf975b18 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -152,8 +152,8 @@ jobs: - name: "L0: Unit Tests GPU" run: | NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" L0_Unit_Tests_CPU: From da720ae38ba2b47d10f365c6760357d504fd9039 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 29 May 2024 14:08:50 -0700 Subject: [PATCH 31/47] Fix peft weights loading (#9341) Signed-off-by: yaoyu-33 --- .../collections/nlp/parts/mixins/multimodal_adapter_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 1a5321065fa9..00552cb7f96e 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -133,10 +133,10 @@ def load_adapters( state_dict = torch.load(filepath, map_location)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") - if self.cfg.megatron_amp_O2: - state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} if not self.ptuning_only_and_non_first_stage: assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} missing_keys, unexpected_keys = NLPModel.load_state_dict(self, state_dict, strict=False) From 4aba557bd23c27b6eeca7cf0da91845a4532178c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 15:11:55 -0700 Subject: [PATCH 32/47] fix lora and ptuning and isort/black (#9290) (#9295) * fix lora and ptuning and isort/black * remove raise error when multiple config files * Apply isort and black reformatting * fix script issues --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia --- nemo/deploy/deploy_pytriton.py | 4 -- nemo/deploy/nlp/query_llm.py | 3 +- nemo/export/tensorrt_llm.py | 18 +++++- nemo/export/trt_llm/decoder/decoder.py | 6 +- nemo/export/trt_llm/decoder/falcon.py | 29 ++++++++-- nemo/export/trt_llm/decoder/gemma.py | 29 ++++++++-- nemo/export/trt_llm/decoder/gpt.py | 28 ++++++++-- nemo/export/trt_llm/decoder/gptj.py | 18 +++++- nemo/export/trt_llm/decoder/llama.py | 29 ++++++++-- nemo/export/trt_llm/model_config.py | 55 +++++++++++++----- nemo/export/trt_llm/nemo/nemo.py | 10 ++-- nemo/export/trt_llm/tensorrt_llm_model.py | 7 ++- nemo/export/trt_llm/tensorrt_llm_run.py | 68 +++++++++++++++++------ scripts/deploy/nlp/deploy_triton.py | 40 +++++-------- scripts/export/export_to_trt_llm.py | 10 ++-- 15 files changed, 252 insertions(+), 102 deletions(-) diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py index 22dea8ac47cd..25e09cf3eacc 100644 --- a/nemo/deploy/deploy_pytriton.py +++ b/nemo/deploy/deploy_pytriton.py @@ -24,7 +24,6 @@ class DeployPyTriton(DeployBase): - """ Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy. @@ -102,7 +101,6 @@ def __init__( ) def deploy(self): - """ Deploys any models to Triton Inference Server. """ @@ -148,7 +146,6 @@ def deploy(self): print(e) def serve(self): - """ Starts serving the model and waits for the requests """ @@ -163,7 +160,6 @@ def serve(self): print(e) def run(self): - """ Starts serving the model asynchronously. 
""" diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 6a4337024eeb..c8387914c2e9 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -71,7 +71,8 @@ class NemoQueryLLM(NemoQueryLLMBase): def __init__(self, url, model_name): super().__init__( - url=url, model_name=model_name, + url=url, + model_name=model_name, ) def query_llm( diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index cad7b821b3b4..b030165a3d45 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -82,15 +82,24 @@ class TensorRTLLM(ITritonDeployable): """ - def __init__(self, model_dir: str, lora_ckpt_list: List[str] = None, load_model: bool = True): + def __init__( + self, + model_dir: str, + lora_ckpt_list: List[str] = None, + load_model: bool = True, + use_python_runtime: bool = True, + ): """ Args: model_dir (str): path for storing the TensorRT-LLM model files. + lora_ckpt_list (List[str]): lora checkpoint paths. load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir. + use_python_runtime (bool): whether to use python or c++ runtime. """ self.model_dir = model_dir self.lora_ckpt_list = lora_ckpt_list + self.use_python_runtime = use_python_runtime self.model = None self.tokenizer = None self.n_gpus = None @@ -623,7 +632,7 @@ def _prep_ptuning_table(self): if len(vtokens_embeddings) > 0: self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size) - max_prompt_embedding_table_size = self.config['builder_config']['max_prompt_embedding_table_size'] + max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size'] actual_prompt_table_size = self.p_table.shape[0] if actual_prompt_table_size > max_prompt_embedding_table_size: @@ -754,7 +763,10 @@ def _load(self): self._load_config_file() self.tokenizer = get_tokenzier(Path(os.path.join(self.model_dir))) self.model = load( - tokenizer=self.tokenizer, engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list + tokenizer=self.tokenizer, + engine_dir=self.model_dir, + lora_ckpt_list=self.lora_ckpt_list, + use_python_runtime=self.use_python_runtime, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/decoder/decoder.py b/nemo/export/trt_llm/decoder/decoder.py index b3c0e2257e9f..2d1993fd74c0 100644 --- a/nemo/export/trt_llm/decoder/decoder.py +++ b/nemo/export/trt_llm/decoder/decoder.py @@ -90,7 +90,11 @@ def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: pass def __init__( - self, decoder_type: str, dtype: trt.DataType = trt.float16, rank: int = 0, tensor_parallel: int = 1, + self, + decoder_type: str, + dtype: trt.DataType = trt.float16, + rank: int = 0, + tensor_parallel: int = 1, ): """Initializes the DecoderLayerConfigBuilder.""" self.decoder_type = decoder_type diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py index 91edc7794607..e05979fa75a0 100644 --- a/nemo/export/trt_llm/decoder/falcon.py +++ b/nemo/export/trt_llm/decoder/falcon.py @@ -69,7 +69,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -78,13 +82,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> 
MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -130,4 +146,7 @@ def build_decoder(self, layer): config.set_if_not_exist('bias', False) config.set_if_not_exist('moe_num_experts', 0) - return FalconDecoderLayer(config=config, layer_idx=self.layer_id,) + return FalconDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py index 10301c7a47d7..37f843dcf0ca 100644 --- a/nemo/export/trt_llm/decoder/gemma.py +++ b/nemo/export/trt_llm/decoder/gemma.py @@ -64,7 +64,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -73,13 +77,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -128,4 +144,7 @@ def build_decoder(self, layer): config.set_if_not_exist('dense_context_fmha', False) config.set_if_not_exist('moe_num_experts', 0) - return GemmaDecoderLayer(config=config, layer_idx=self.layer_id,) + return GemmaDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py index 8af4e4ef01e4..a405aabbbd48 100644 --- a/nemo/export/trt_llm/decoder/gpt.py +++ b/nemo/export/trt_llm/decoder/gpt.py @@ -54,11 +54,18 @@ def build_input_layernorm(self, layer) -> LayernormConfig: def build_attention(self, layer) -> AttentionConfig: config = AttentionConfig() config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.c_attn], rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + [layer.attn.c_attn], + rank=self.rank, + tensor_parallel=self.tensor_parallel, + 
dtype=self.dtype, ) config.dense = LinearConfig.from_nn_module( - layer.attn.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -67,10 +74,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.c_fc, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_fc, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -126,4 +141,7 @@ def build_decoder(self, layer): config.set_if_not_exist('rotary_pct', rotary_pct) config.set_if_not_exist('moe_num_experts', 0) - return GPTDecoderLayer(config=config, layer_idx=self.layer_id,) + return GPTDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gptj.py b/nemo/export/trt_llm/decoder/gptj.py index aa65ca385a47..327a31fdd35c 100644 --- a/nemo/export/trt_llm/decoder/gptj.py +++ b/nemo/export/trt_llm/decoder/gptj.py @@ -60,7 +60,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.attn.out_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.out_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.rotary_dim = layer.attn.rotary_dim @@ -71,10 +75,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.fc_in, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_in, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.fc_out, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_out, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config diff --git a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py index 873c0306375b..b37d62e214de 100644 --- a/nemo/export/trt_llm/decoder/llama.py +++ b/nemo/export/trt_llm/decoder/llama.py @@ -66,7 +66,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -75,13 +79,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - 
layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -147,4 +163,7 @@ def build_decoder(self, layer): config.moe_tp_mode = layer.moe_tp_mode config.moe_normalization_mode = layer.moe_renorm_mode - return LLaMADecoderLayer(config=config, layer_idx=self.layer_id,) + return LLaMADecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/model_config.py b/nemo/export/trt_llm/model_config.py index dd360afd6b8a..0f120dc56153 100644 --- a/nemo/export/trt_llm/model_config.py +++ b/nemo/export/trt_llm/model_config.py @@ -122,7 +122,11 @@ def from_nn_module(module: nn.Module, linear_type: str, rank=0, tensor_parallel= if hasattr(module, "bias") and module.bias is not None: if linear_type == LINEAR_COLUMN: config.bias = np.ascontiguousarray( - split(torch_to_numpy_with_dtype(module.bias, dtype), tensor_parallel, rank,) + split( + torch_to_numpy_with_dtype(module.bias, dtype), + tensor_parallel, + rank, + ) ) else: config.bias = torch_to_numpy_with_dtype(module.bias, dtype) @@ -234,7 +238,9 @@ class AttentionConfig: @staticmethod def from_nemo( - weights_dict: Dict[str, np.ndarray], layer_id: int, rank: int = 0, + weights_dict: Dict[str, np.ndarray], + layer_id: int, + rank: int = 0, ): """Converts the nemo weights and config to `AttentionConfig`.""" attention = AttentionConfig() @@ -243,12 +249,16 @@ def from_nemo( weights_dict, f"layers.{layer_id}.attention.query_key_value.weight.{rank}" ) attention.qkv.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.attention.query_key_value.bias.{rank}", + weights_dict, + f"layers.{layer_id}.attention.query_key_value.bias.{rank}", ) attention.dense = LinearConfig(linear_type=LINEAR_ROW) attention.dense.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.weight.{rank}") - attention.dense.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.bias",) + attention.dense.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.attention.dense.bias", + ) return attention @@ -276,7 +286,10 @@ def from_nemo( # print("********** mlp.fc.weight : ", mlp.fc.weight ) - mlp.fc.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}",) + mlp.fc.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}", + ) gated = is_gated_activation(mlp.hidden_act) is_fast_glu = mlp.hidden_act in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] @@ -287,9 +300,13 @@ def from_nemo( if isinstance(llm_config, LlamaConfig) and not is_mcore and not is_fast_glu else f"layers.{layer_id}.mlp.dense_h_to_4h.gate.weight.{rank}" ) - mlp.gate.weight = get_tensor_from_dict(weights_dict, layer_name,) + mlp.gate.weight = get_tensor_from_dict( + weights_dict, + layer_name, + ) mlp.gate.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", ) mlp.proj = LinearConfig(linear_type=LINEAR_ROW) @@ -382,19 +399,23 @@ def from_nemo( LAYERNORM_RMS if isinstance(llm_config, 
LlamaConfig) else LAYERNORM_DEFAULT ) layer_config.input_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.weight", + weights_dict, + f"layers.{layer_id}.input_layernorm.weight", ) layer_config.input_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.bias", + weights_dict, + f"layers.{layer_id}.input_layernorm.bias", ) layer_config.mlp_layernorm = LayernormConfig() layer_config.mlp_layernorm.layernorm_type = LAYERNORM_DEFAULT # Falcon uses default layernorm layer_config.mlp_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.weight", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.weight", ) layer_config.mlp_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.bias", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.bias", ) layer_config.post_layernorm = LayernormConfig() @@ -403,10 +424,12 @@ def from_nemo( ) layer_config.post_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.post_attention_layernorm.weight", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.weight", ) layer_config.post_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.post_attention_layernorm.bias", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.bias", ) if layer_config.post_layernorm.weight is None: # Falcon doesn't have post layernorm @@ -415,7 +438,11 @@ def from_nemo( if layer_config.mlp_layernorm.weight is None: layer_config.mlp_layernorm = None - layer_config.attention = AttentionConfig.from_nemo(weights_dict, layer_id, rank,) + layer_config.attention = AttentionConfig.from_nemo( + weights_dict, + layer_id, + rank, + ) moe = False if llm_config.moe_num_experts is not None: diff --git a/nemo/export/trt_llm/nemo/nemo.py b/nemo/export/trt_llm/nemo/nemo.py index 9026cd9cfba9..c3564f1c4e8e 100644 --- a/nemo/export/trt_llm/nemo/nemo.py +++ b/nemo/export/trt_llm/nemo/nemo.py @@ -106,7 +106,9 @@ def extract_layers_with_prefix(model_, prefix): class UnpackedNemoCheckpointDir: def __init__( - self, checkpoints_dir: typing.Union[pathlib.Path, TarPath], load_checkpoints_to_cpu: bool = False, + self, + checkpoints_dir: typing.Union[pathlib.Path, TarPath], + load_checkpoints_to_cpu: bool = False, ): assert isinstance(checkpoints_dir, (pathlib.Path, TarPath)) self._checkpoints_dir = checkpoints_dir @@ -121,11 +123,7 @@ def model_config(self): model_configs_paths = list(self._checkpoints_dir.rglob(model_config_filename)) if model_configs_paths: if len(model_configs_paths) > 1: - raise RuntimeError( - f"There are more than single {model_config_filename} in" - f" {self._checkpoints_dir}:" - f" {', '.join(map(lambda p: p.as_posix(), model_configs_paths))}" - ) + LOGGER.debug(f"There are more than single {model_config_filename} in" f" {self._checkpoints_dir}") model_config_path = model_configs_paths[0] LOGGER.debug("Loading model config from %s", model_config_path) with model_config_path.open("r") as model_config_file: diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index 736d6180807e..f4b44552af63 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -144,7 +144,12 @@ def forward( if attention_mask is not None: attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - for layer_idx, (layer, past) in enumerate(zip(self.layers, kv_cache_params.past_key_value,)): + for 
layer_idx, (layer, past) in enumerate( + zip( + self.layers, + kv_cache_params.past_key_value, + ) + ): decoder_params = { "hidden_states": hidden_states, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 92fc36272f7c..fe0189b10628 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -16,17 +16,19 @@ import json import logging import os +import tempfile from dataclasses import dataclass from pathlib import Path from typing import List, Optional +import numpy as np import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import ModelConfig, ModelRunnerCpp, SamplingConfig +from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -55,7 +57,7 @@ class TensorrtLLMHostContext: class TensorrtLLMWorkerContext: """The MPI worker side context for TRT LLM inference.""" - decoder: ModelRunnerCpp = None + decoder: ModelRunner = None sampling_config: SamplingConfig = None lora_manager: LoraManager = None max_batch_size: int = 0 @@ -128,7 +130,13 @@ def _read_config(config_path: Path): return model_config, world_size, tensor_parallel_size, pipeline_parallel_size, dtype, max_input_len, max_batch_size -def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_beams=1): +def _load( + tokenizer: PreTrainedTokenizer, + engine_dir, + lora_ckpt_list=None, + num_beams=1, + use_python_runtime: bool = True, +): """The impl of `load` API for on a single GPU worker.""" try: tensorrt_llm.logger.set_level("info") @@ -147,17 +155,26 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b runtime_rank = tensorrt_llm.mpi_rank() - decoder = ModelRunnerCpp.from_dir( - engine_dir=engine_dir, - lora_dir=lora_ckpt_list, - lora_ckpt_source="nemo", - rank=runtime_rank, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_beam_width=max_beam_width, - debug_mode=False, - ) + if use_python_runtime: + decoder = ModelRunner.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + debug_mode=False, + ) + else: + decoder = ModelRunnerCpp.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_beam_width=max_beam_width, + debug_mode=False, + ) sampling_config = SamplingConfig( end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams @@ -218,6 +235,13 @@ def _forward( with torch.no_grad(): prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids) + if prompt_table is not None: + prompt_table = prompt_table.reshape(1, *prompt_table.shape) + tmp_dir = tempfile.TemporaryDirectory() + prompt_table_path = os.path.join(tmp_dir.name, 'prompt_table.npy') + np.save(prompt_table_path, prompt_table.cpu().float().numpy()) + prompt_table = prompt_table_path + outputs = decoder.generate( input_tensors, max_new_tokens=max_output_len, @@ -230,6 +254,7 @@ def _forward( stop_words_list=stop_words_list, bad_words_list=bad_words_list, lora_uids=lora_uids, + 
prompt_table_path=prompt_table, prompt_table=prompt_table, prompt_tasks=prompt_tasks, streaming=streaming, @@ -239,6 +264,9 @@ def _forward( torch.cuda.synchronize() + if prompt_table is not None: + tmp_dir.cleanup() + runtime_rank = tensorrt_llm.mpi_rank() if runtime_rank == 0 or multiprocessed_env: return outputs @@ -251,7 +279,11 @@ def _forward( def load( - tokenizer: PreTrainedTokenizer, engine_dir: str, lora_ckpt_list: List[str] = None, num_beams: int = 1 + tokenizer: PreTrainedTokenizer, + engine_dir: str, + lora_ckpt_list: List[str] = None, + num_beams: int = 1, + use_python_runtime: bool = True, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. @@ -263,17 +295,17 @@ def load( config = json.load(f) world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None elif tensorrt_llm.mpi_world_size() > 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] for _ in range(world_size): - future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams) + future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) futures.append(future) for future in futures: future.result() diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 7370731ec996..0a9604a73cdc 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -80,7 +80,7 @@ def get_args(argv): "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." 
) parser.add_argument( "-drip", @@ -133,6 +133,13 @@ def get_args(argv): parser.add_argument( "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights" ) + parser.add_argument( + "-ucr", + '--use_cpp_runtime', + default=False, + action='store_true', + help='Use TensorRT LLM C++ runtime', + ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) @@ -206,32 +213,13 @@ def nemo_deploy(argv): ) return - trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + lora_ckpt_list=args.lora_ckpt, + use_python_runtime=(not args.use_cpp_runtime), + ) if args.nemo_checkpoint is not None: - - trt_llm_exporter.export( - nemo_checkpoint_path=args.nemo_checkpoint, - model_type=args.model_type, - n_gpus=args.num_gpus, - tensor_parallel_size=args.num_gpus, - pipeline_parallel_size=1, - max_input_token=args.max_input_len, - max_output_token=args.max_output_len, - max_batch_size=args.max_batch_size, - max_num_tokens=args.max_num_tokens, - opt_num_tokens=args.opt_num_tokens, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, - remove_input_padding=(not args.disable_remove_input_padding), - dtype=args.dtype, - enable_multi_block_mode=args.multi_block_mode, - use_lora_plugin=args.use_lora_plugin, - lora_target_modules=args.lora_target_modules, - max_lora_rank=args.max_lora_rank, - save_nemo_model_config=True, - ) - try: LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") trt_llm_exporter.export( @@ -246,7 +234,7 @@ def nemo_deploy(argv): max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, + paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index e9741516cf00..ce9ef6a1e132 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -45,8 +45,8 @@ def get_args(argv): parser.add_argument( "-dt", "--dtype", - choices=["bf16", "fp16", "fp8", "int8"], - default="bf16", + choices=["bfloat16", "float16", "fp8", "int8"], + default="bfloat16", type=str, help="dtype of the model on TensorRT-LLM", ) @@ -59,7 +59,7 @@ def get_args(argv): "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( "-drip", @@ -123,7 +123,7 @@ def nemo_export_trt_llm(argv): LOGGER.info("Logging level set to {}".format(loglevel)) LOGGER.info(args) - if args.dtype != "bf16": + if args.dtype != "bfloat16": LOGGER.error( "Only bf16 is currently supported for the optimized deployment with TensorRT-LLM. " "Support for the other precisions will be added in the coming releases." 
@@ -146,7 +146,7 @@ def nemo_export_trt_llm(argv): max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, + paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, From deb613adc7b7ad0a540f5cc1f0bc5032ddb345ff Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 29 May 2024 20:03:32 -0700 Subject: [PATCH 33/47] Skip sequence_parallel allreduce when using Mcore DistOpt (#9344) Signed-off-by: Alexandros Koumparoulis --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3660a5145b10..b3e3c231de52 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -843,9 +843,11 @@ def training_step(self, dataloader_iter): # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): - self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) - self.allreduce_sequence_parallel_gradients() - self.megatron_timer_stop('allreduce_sequence_parallel_gradients') + # Mcore DistOpt handles this, so we don't have to + if not self.use_mcore_dist_optim: + self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) + self.allreduce_sequence_parallel_gradients() + self.megatron_timer_stop('allreduce_sequence_parallel_gradients') self.megatron_timer_start('gradient_allreduce', log_level=1) if self.use_fsdp: From 2e396060f6f95b6a848f4f260b5bbafa8ed52107 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 30 May 2024 07:45:56 +0200 Subject: [PATCH 34/47] Fix FSDP gradient reduction with orig params (#9335) The `param.grad is not None` check also fixes gradient reduction in the case of parameters not having acquired gradients (as parameters could become empty tensors in FSDP). Thanks to @ofivite for suggesting that `use_orig_params=True` could be the cause of the issue, which greatly helped with analysis. 
Signed-off-by: janEbert --- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b3e3c231de52..a5b4450c7b44 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1001,8 +1001,8 @@ def allreduce_fsdp_sharding_omitted_gradients(self): """All-reduce gradients of FSDP-sharding-omitted parameters in sharding domain (data-parallel domain).""" assert isinstance(self.model, torch.nn.Module) grads = [] - for param in self.model.parameters(): - if not isinstance(param, torch.distributed.fsdp.FlatParameter) and param.requires_grad: + for param in self.model._ignored_params: + if param.requires_grad and param.grad is not None: grad = param.grad grads.append(grad.data) if len(grads) > 0: From b6595cbae2226ff553b44ff2b66527738ea4bdf2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 07:25:12 -0700 Subject: [PATCH 35/47] Fix P-tuning for Llama based models (#9300) * Fix P-tuning for Llama based models (#9297) * Added the BOS token for Llama, Mistral and Mixtral. Signed-off-by: Alexey Panteleev * Don't load an existing TRT-LLM model before export to speed up the export process and avoid possible contamination from previous runs. Signed-off-by: Alexey Panteleev * Apply isort and black reformatting Signed-off-by: apanteleev --------- Signed-off-by: Alexey Panteleev Signed-off-by: apanteleev Co-authored-by: apanteleev Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> * Fix the export test --------- Signed-off-by: Alexey Panteleev Signed-off-by: apanteleev Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: Alexey Panteleev Co-authored-by: apanteleev Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> --- nemo/export/trt_llm/tensorrt_llm_run.py | 8 +++++++- scripts/deploy/nlp/deploy_triton.py | 1 + scripts/export/export_to_trt_llm.py | 2 +- tests/export/test_nemo_export.py | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index fe0189b10628..1bdfd5237caf 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -312,7 +312,13 @@ def load( max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False + architectures_that_need_bos_token = [ + "GemmaForCausalLM", + "LLaMAForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + ] + add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token return TensorrtLLMHostContext( executor=executor, diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 0a9604a73cdc..5a2440b0fa2f 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -216,6 +216,7 @@ def nemo_deploy(argv): trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt, + load_model=(args.nemo_checkpoint is None), use_python_runtime=(not args.use_cpp_runtime), ) diff --git 
a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index ce9ef6a1e132..a9c16bf8cff6 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv): return try: - trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository) + trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False) LOGGER.info("Export to TensorRT-LLM function is called.") trt_llm_exporter.export( diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index b3e186433561..97a06a1f6887 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -200,7 +200,7 @@ def run_trt_llm_inference( print("---- LoRA could not be enabled and skipping the test.") return None, None, None, None, None - trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list) + trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False) trt_llm_exporter.export( nemo_checkpoint_path=checkpoint_path, From aed9d071c700080b3eb024e8a5d7f091f20f0183 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Thu, 30 May 2024 10:08:24 -0700 Subject: [PATCH 36/47] Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) * Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. decoder_lengths is allowed to be on CPU even when decoder_output is on GPU. This matches the behavior of GreedyCTCInfer. Even though that behavior is unintentional, there is code depending on that behavior, including our jupyter notebooks. Signed-off-by: Daniel Galvez * Apply isort and black reformatting Signed-off-by: titu1994 --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Co-authored-by: Somshubra Majumdar Co-authored-by: titu1994 Co-authored-by: Nithin Rao --- .../parts/submodules/ctc_greedy_decoding.py | 12 +++- .../asr/decoding/test_ctc_decoding.py | 71 +++++++++++++++++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index a7f57c82279a..74204cf73d8e 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -394,7 +394,17 @@ def forward( if decoder_lengths is None: logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) - decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) + decoder_lengths = torch.tensor( + [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device + ).expand(decoder_output.shape[0]) + + # GreedyCTCInfer::forward(), by accident, works with + # decoder_lengths on either CPU or GPU when decoder_output is + # on GPU. For the sake of backwards compatibility, we also + # allow decoder_lengths to be on the CPU device. In this case, + # we simply copy the decoder_lengths from CPU to GPU. If both + # tensors are already on the same device, this is a no-op. 
+ decoder_lengths = decoder_lengths.to(decoder_output.device) if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index a42d61f051ad..580344fed395 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -200,8 +200,41 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) + @pytest.mark.parametrize( + "logprobs_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) def test_batched_decoding_logprobs( - self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none + self, + tmp_tokenizer, + alignments, + timestamps, + preserve_frame_confidence, + length_is_none, + logprobs_device, + length_device, ): cfg = CTCBPEDecodingConfig( strategy='greedy', @@ -217,7 +250,7 @@ def test_batched_decoding_logprobs( torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_signal = torch.randn(size=(B, T, V)) + input_signal = torch.randn(size=(B, T, V), device=logprobs_device) # Set the blank index to a very high probability to make sure # that we always handle at least a few blanks. 
input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 @@ -225,7 +258,7 @@ def test_batched_decoding_logprobs( if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -249,7 +282,33 @@ def test_batched_decoding_logprobs( @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): + @pytest.mark.parametrize( + "labels_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none, labels_device, length_device): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -258,7 +317,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_labels = torch.randint(V, size=(B, T)) + input_labels = torch.randint(V, size=(B, T), device=labels_device) # Set some indices to blank to make sure that we always handle # at least a few blanks. input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size @@ -266,7 +325,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From f397086f4a580e42633a89db99885eb07b511c3d Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 30 May 2024 10:25:07 -0700 Subject: [PATCH 37/47] Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347)" (#9351) This reverts commit aed9d071c700080b3eb024e8a5d7f091f20f0183. --- .../parts/submodules/ctc_greedy_decoding.py | 12 +--- .../asr/decoding/test_ctc_decoding.py | 71 ++----------------- 2 files changed, 7 insertions(+), 76 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 74204cf73d8e..a7f57c82279a 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -394,17 +394,7 @@ def forward( if decoder_lengths is None: logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) - decoder_lengths = torch.tensor( - [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device - ).expand(decoder_output.shape[0]) - - # GreedyCTCInfer::forward(), by accident, works with - # decoder_lengths on either CPU or GPU when decoder_output is - # on GPU. For the sake of backwards compatibility, we also - # allow decoder_lengths to be on the CPU device. 
In this case, - # we simply copy the decoder_lengths from CPU to GPU. If both - # tensors are already on the same device, this is a no-op. - decoder_lengths = decoder_lengths.to(decoder_output.device) + decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 580344fed395..a42d61f051ad 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -200,41 +200,8 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - @pytest.mark.parametrize( - "logprobs_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - @pytest.mark.parametrize( - "length_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) def test_batched_decoding_logprobs( - self, - tmp_tokenizer, - alignments, - timestamps, - preserve_frame_confidence, - length_is_none, - logprobs_device, - length_device, + self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none ): cfg = CTCBPEDecodingConfig( strategy='greedy', @@ -250,7 +217,7 @@ def test_batched_decoding_logprobs( torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_signal = torch.randn(size=(B, T, V), device=logprobs_device) + input_signal = torch.randn(size=(B, T, V)) # Set the blank index to a very high probability to make sure # that we always handle at least a few blanks. 
input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 @@ -258,7 +225,7 @@ def test_batched_decoding_logprobs( if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B], device=length_device) + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -282,33 +249,7 @@ def test_batched_decoding_logprobs( @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - @pytest.mark.parametrize( - "labels_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - @pytest.mark.parametrize( - "length_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none, labels_device, length_device): + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -317,7 +258,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_labels = torch.randint(V, size=(B, T), device=labels_device) + input_labels = torch.randint(V, size=(B, T)) # Set some indices to blank to make sure that we always handle # at least a few blanks. 
input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size @@ -325,7 +266,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B], device=length_device) + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From bf53aaa9e154f7068c49637e71b23a8d0bac513e Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 30 May 2024 15:26:51 -0400 Subject: [PATCH 38/47] TRT-LLM Export Code Cleanup (#9270) * Init code cleanup for the trt-llm export Signed-off-by: Onur Yilmaz * Removed model config Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * clearn futher Signed-off-by: Onur Yilmaz * Done more cleaning and addressing the reviews Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- nemo/export/tensorrt_llm.py | 71 +-- nemo/export/trt_llm/decoder/__init__.py | 82 --- nemo/export/trt_llm/decoder/decoder.py | 264 --------- nemo/export/trt_llm/decoder/falcon.py | 152 ----- nemo/export/trt_llm/decoder/gemma.py | 150 ----- nemo/export/trt_llm/decoder/gpt.py | 147 ----- nemo/export/trt_llm/decoder/gptj.py | 117 ---- nemo/export/trt_llm/decoder/llama.py | 169 ------ nemo/export/trt_llm/model_config.py | 555 ------------------ nemo/export/trt_llm/model_config_trt.py | 82 --- nemo/export/trt_llm/nemo/convert.py | 149 ----- nemo/export/trt_llm/nemo/nemo.py | 16 +- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 2 +- nemo/export/trt_llm/nemo_utils.py | 239 +++----- nemo/export/trt_llm/quantization_utils.py | 128 ---- nemo/export/trt_llm/tensor_utils.py | 59 -- nemo/export/trt_llm/tensorrt_llm_build.py | 320 ---------- nemo/export/trt_llm/tensorrt_llm_model.py | 406 ------------- nemo/export/trt_llm/tensorrt_llm_run.py | 109 ---- nemo/export/trt_llm/tensorrt_llm_utils.py | 85 --- nemo/export/trt_llm/utils.py | 78 --- 21 files changed, 86 insertions(+), 3294 deletions(-) delete mode 100644 nemo/export/trt_llm/decoder/__init__.py delete mode 100644 nemo/export/trt_llm/decoder/decoder.py delete mode 100644 nemo/export/trt_llm/decoder/falcon.py delete mode 100644 nemo/export/trt_llm/decoder/gemma.py delete mode 100644 nemo/export/trt_llm/decoder/gpt.py delete mode 100644 nemo/export/trt_llm/decoder/gptj.py delete mode 100644 nemo/export/trt_llm/decoder/llama.py delete mode 100644 nemo/export/trt_llm/model_config.py delete mode 100644 nemo/export/trt_llm/model_config_trt.py delete mode 100644 nemo/export/trt_llm/quantization_utils.py delete mode 100644 nemo/export/trt_llm/tensor_utils.py delete mode 100644 nemo/export/trt_llm/tensorrt_llm_model.py delete mode 100644 nemo/export/trt_llm/tensorrt_llm_utils.py delete mode 100644 nemo/export/trt_llm/utils.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index b030165a3d45..401ac2e930a6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -28,14 +28,11 @@ from nemo.deploy import ITritonDeployable from nemo.export.tarutils import TarPath, unpack_tarball -from nemo.export.trt_llm.model_config_trt import model_config_to_tensorrt_llm -from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer -from nemo.export.trt_llm.nemo_utils import get_tokenzier, 
nemo_llm_model_to_model_config, nemo_to_trtllm_config +from nemo.export.trt_llm.nemo_utils import get_tokenzier, is_nemo_file, nemo_to_trtllm_config from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine -from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_refit -from nemo.export.trt_llm.utils import is_nemo_file +from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load use_deploy = True try: @@ -278,70 +275,6 @@ def export( if load_model: self._load() - def build( - self, - nemo_model, - nemo_model_config, - tokenizer=None, - max_input_token: int = 256, - max_output_token: int = 256, - max_batch_size: int = 8, - use_refit: bool = False, - model_type: str = "gptnext", - ): - from megatron.core import parallel_state - - self.use_refit = use_refit - self.stream = torch.cuda.Stream() - self.model_type = model_type - self.tokenizer = build_tokenizer(tokenizer) - - # Each model shard has its own directory - if parallel_state.get_data_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"dp{parallel_state.get_data_parallel_rank()}") - if parallel_state.get_tensor_model_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"tp{parallel_state.get_tensor_model_parallel_rank()}") - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"pp{parallel_state.get_pipeline_model_parallel_rank()}") - - # Build or refit TRT-LLM engine from a nemo model. - model_configs = nemo_llm_model_to_model_config( - nemo_model=nemo_model, - decoder_type=model_type, - nemo_model_config=nemo_model_config, - ) - - model_config_to_tensorrt_llm( - model_configs, - self.model_dir, - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - max_beam_width=1, - max_prompt_embedding_table_size=0, - use_refit=self.use_refit, - ) - # Use load_refit to handle multiprocessed environment - self.model = load_refit( - tokenizer=self.tokenizer, engine_dir=self.model_dir, model_configs=model_configs, stream=self.stream - ) - - def refit( - self, - nemo_model, - nemo_model_config, - ): - assert self.use_refit, "TRT-LLM model must be built() with refit=True" - - # Build or refit TRT-LLM engine from a nemo model. - model_configs = nemo_llm_model_to_model_config( - nemo_model=nemo_model, decoder_type=self.model_type, nemo_model_config=nemo_model_config - ) - - self.model = load_refit( - tokenizer=self.tokenizer, engine_dir=self.model_dir, model_configs=model_configs, stream=self.stream - ) - def forward( self, input_texts: List[str], diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py deleted file mode 100644 index b5e22b5e513e..000000000000 --- a/nemo/export/trt_llm/decoder/__init__.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Type - -import tensorrt as trt - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.falcon import FALCONDecoderLayerBuilder, FALCONDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gemma import GemmaDecoderLayerBuilder, GemmaDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gpt import GPTDecoderLayerBuilder, GPTDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gptj import GPTJDecoderLayerBuilder, GPTJDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.llama import LLAMADecoderLayerBuilder, LLAMADecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - DECODER_FALCON, - DECODER_GEMMA, - DECODER_GPT2, - DECODER_GPTJ, - DECODER_GPTNEXT, - DECODER_LLAMA, - QUANTIZATION_NONE, -) - -DECODER_CONFIG_REGISTRY: Dict[str, Type[DecoderLayerConfigBuilder]] = { - DECODER_GPT2: GPTDecoderLayerConfigBuilder, - DECODER_GPTJ: GPTJDecoderLayerConfigBuilder, - DECODER_LLAMA: LLAMADecoderLayerConfigBuilder, - DECODER_FALCON: FALCONDecoderLayerConfigBuilder, - DECODER_GEMMA: GemmaDecoderLayerConfigBuilder, -} - -DECODER_MODEL_TYPE = { - DECODER_GPT2: 'GPTForCausalLM', - DECODER_GPTNEXT: 'GPTForCausalLM', - DECODER_LLAMA: 'LLaMAForCausalLM', - DECODER_GEMMA: 'GemmaForCausalLM', - DECODER_FALCON: 'FalconForCausalLM', -} - - -def build_decoder_layer_config(layer, decoder: str, dtype=trt.float16, rank=0, tensor_parallel=1): - """Builds the decoder layer config with the input torch module.""" - assert decoder in DECODER_CONFIG_REGISTRY, f"{decoder} not supported" - return DECODER_CONFIG_REGISTRY[decoder](decoder, dtype, rank, tensor_parallel).build_layer(layer) - - -DECODER_REGISTRY: Dict[str, Type[DecoderLayerBuilder]] = { - DECODER_GPT2: GPTDecoderLayerBuilder, - DECODER_GPTJ: GPTJDecoderLayerBuilder, - DECODER_LLAMA: LLAMADecoderLayerBuilder, - DECODER_GPTNEXT: GPTDecoderLayerBuilder, - DECODER_FALCON: FALCONDecoderLayerBuilder, - DECODER_GEMMA: GemmaDecoderLayerBuilder, -} - - -def build_decoder_layer( - layer, - layer_id: int, - num_layers: int, - dtype=trt.float16, - quantization=QUANTIZATION_NONE, - rank=0, - tensor_parallel=1, - tp_group=None, -): - """Builds the tensorrt llm decoder layer module with the layer config as the input.""" - assert layer.decoder_type in DECODER_REGISTRY, f"{layer.decoder_type} not supported" - builder = DECODER_REGISTRY[layer.decoder_type] - decoder_builder = builder(layer, layer_id, num_layers, dtype, quantization, rank, tensor_parallel, tp_group) - return decoder_builder.decoder diff --git a/nemo/export/trt_llm/decoder/decoder.py b/nemo/export/trt_llm/decoder/decoder.py deleted file mode 100644 index 2d1993fd74c0..000000000000 --- a/nemo/export/trt_llm/decoder/decoder.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from abc import ABC, abstractmethod -from typing import Optional - -import tensorrt as trt -from transformers.activations import ACT2FN - -from nemo.export.trt_llm.model_config import ( - QUANTIZATION_NONE, - AttentionConfig, - DecoderLayerConfig, - LayernormConfig, - MLPConfig, -) -from nemo.export.trt_llm.quantization_utils import quantize_linear -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group - - -def _get_hidden_act(act_func): - """Returns the name of the hidden activation functon based on ACT2FN.""" - if isinstance(act_func, str): - return act_func - - for name, func in ACT2FN.items(): - if isinstance(func, tuple): - if isinstance(act_func, func[0]): - return name - elif isinstance(act_func, func): - return name - assert False, f"Cannot find name for {act_func}" - - -class DecoderLayerConfigBuilder(ABC): - """A config builder that translate the LLM decoder layer to the DecoderLayerConfig.""" - - @abstractmethod - def hidden_act_fn(self, layer): - """Returns the hidden act fn in the MLP layer, e.g. SiLUActivation or NewGELUActivation.""" - pass - - @abstractmethod - def infer_num_attention_heads(self, layer): - """Returns the num of attention heads of the layer.""" - pass - - @abstractmethod - def infer_max_position_embeddings(self, layer): - """Returns the max positional embeddings of the layer.""" - pass - - @abstractmethod - def build_input_layernorm(self, layer) -> LayernormConfig: - """Returns the built input layernorm layer.""" - pass - - @abstractmethod - def build_mlp_layernorm( - self, layer - ) -> LayernormConfig: # Force all other models to implement. But seems this builder is not used. - """Returns the built mlp layernorm layer.""" - pass - - @abstractmethod - def build_attention(self, layer) -> AttentionConfig: - """Returns the built attention layer.""" - pass - - @abstractmethod - def build_mlp(self, layer) -> MLPConfig: - """Returns the built mlp layer.""" - pass - - @abstractmethod - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - """Returns the built post layernorm.""" - pass - - def __init__( - self, - decoder_type: str, - dtype: trt.DataType = trt.float16, - rank: int = 0, - tensor_parallel: int = 1, - ): - """Initializes the DecoderLayerConfigBuilder.""" - self.decoder_type = decoder_type - self.dtype = dtype - self.rank = rank - self.tensor_parallel = tensor_parallel - - def build_layer(self, layer) -> DecoderLayerConfig: - """Builds the decoder layer and returns the DecoderLayer.""" - decoder = DecoderLayerConfig() - - decoder.decoder_type = self.decoder_type - decoder.num_attention_heads = self.infer_num_attention_heads(layer) - decoder.num_kv_heads = self.infer_num_kv_heads(layer) - decoder.max_position_embeddings = self.infer_max_position_embeddings(layer) - - decoder.input_layernorm = self.build_input_layernorm(layer) - decoder.mlp_layernorm = self.build_mlp_layernorm(layer) - decoder.attention = self.build_attention(layer) - decoder.post_layernorm = self.build_post_layernorm(layer) - decoder.mlp = self.build_mlp(layer) - decoder.mlp.hidden_act = _get_hidden_act(self.hidden_act_fn(layer)).split("_")[0] - - return decoder - - def infer_num_kv_heads(self, layer): - """Returns the num of key value heads of the layer.""" - return self.infer_num_attention_heads(layer) - - -class DecoderLayerBuilder(ABC): - """An abstracted transformer decoder layer with tensorrt_llm implementation taking DecoderLayerConfig as the input. 
- - Individual decoder layers are supposed to extend this class and implement the customized - abstracted method. - """ - - @abstractmethod - def build_decoder(self, layer): - """Returns the built decoder layer.""" - pass - - def __init__( - self, - layer: DecoderLayerConfig, - layer_id: int, - num_layers: int, - dtype: trt.DataType = trt.float16, - quantization: str = QUANTIZATION_NONE, - rank: int = 0, - tensor_parallel: int = 1, - tp_group=None, - ): - """Initializes the DecoderLayer.""" - super().__init__() - assert isinstance(dtype, trt.DataType) - self.layer_id = layer_id - self.num_layers = num_layers - self.dtype = dtype - self.quantization = quantization - self.rank = rank - self.tensor_parallel = tensor_parallel - - if tp_group is None: - self.tp_group = get_tensor_parallel_group(tensor_parallel) - else: - self.tp_group = tp_group - - self.hidden_size = layer.hidden_size - self.num_attention_heads = layer.num_attention_heads - self.num_kv_heads = layer.num_kv_heads if layer.num_kv_heads > 0 else layer.num_attention_heads - - assert ( - self.num_attention_heads % self.num_kv_heads - ) == 0, "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." - assert (self.num_kv_heads % self.tensor_parallel) == 0 or (self.tensor_parallel % self.num_kv_heads) == 0, ( - "MQA/GQA requires either the number of K/V heads to be divisible by the number of GPUs" - " OR the number of GPUs to be divisible by the number of K/V heads." - ) - - self.max_position_embeddings = layer.max_position_embeddings - self.hidden_act = layer.mlp.hidden_act - - self.decoder = self.build_decoder(layer) - self.assign_weights(layer) - - is_moe = ( - hasattr(self.decoder, "config") - and self.decoder.config.moe_num_experts is not None - and self.decoder.config.moe_num_experts > 1 - ) - if not is_moe: - self.quantize(layer) - - def assign_weights(self, layer: DecoderLayerConfig): - """Assign the weights to the attention tensorrt_llm layer.""" - is_moe = ( - hasattr(self.decoder, "config") - and self.decoder.config.moe_num_experts is not None - and self.decoder.config.moe_num_experts > 1 - ) - - self.decoder.input_layernorm.weight.value = layer.input_layernorm.weight - if layer.input_layernorm.bias is not None: - self.decoder.input_layernorm.bias.value = layer.input_layernorm.bias - - if layer.mlp_layernorm is not None: # Falcon has mlp layer norm - if is_moe: - assert layer.post_layernorm is None - self.decoder.post_layernorm.weight.value = layer.mlp_layernorm.weight - if layer.mlp_layernorm.bias is not None: - self.decoder.post_layernorm.bias.value = layer.mlp_layernorm.bias - else: - self.decoder.mlp_layernorm.weight.value = layer.mlp_layernorm.weight - if layer.mlp_layernorm.bias is not None: - self.decoder.mlp_layernorm.bias.value = layer.mlp_layernorm.bias - - self.decoder.attention.qkv.weight.value = layer.attention.qkv.weight - if layer.attention.qkv.bias is not None: - self.decoder.attention.qkv.bias.value = layer.attention.qkv.bias - - self.decoder.attention.dense.weight.value = layer.attention.dense.weight - if self.decoder.attention.dense.bias is not None: - self.decoder.attention.dense.bias.value = layer.attention.dense.bias - - if layer.post_layernorm is not None: - self.decoder.post_layernorm.weight.value = layer.post_layernorm.weight - if layer.post_layernorm.bias is not None: - self.decoder.post_layernorm.bias.value = layer.post_layernorm.bias - - if is_moe: - self.decoder.mlp.router.weight.value = layer.mlp.router.weight - self.decoder.mlp.experts_weight_1.value = 
layer.mlp.fc1.weight - self.decoder.mlp.experts_weight_2.value = layer.mlp.fc2.weight - - if layer.mlp.fc1.bias is not None: - self.decoder.mlp.experts_bias_1.value = layer.mlp.fc1.bias - - if layer.mlp.fc2.bias is not None: - self.decoder.mlp.experts_bias_2.value = layer.mlp.fc2.bias - - else: - self.decoder.mlp.fc.weight.value = layer.mlp.fc.weight - self.decoder.mlp.proj.weight.value = layer.mlp.proj.weight - bias = layer.mlp.fc.bias is not None - if bias: - self.decoder.mlp.fc.bias.value = layer.mlp.fc.bias - self.decoder.mlp.proj.bias.value = layer.mlp.proj.bias - - if layer.mlp.gate: - self.decoder.mlp.gate.weight.value = layer.mlp.gate.weight - if bias: - self.decoder.mlp.gate.bias.value = layer.mlp.gate.bias - - def quantize(self, layer: DecoderLayerConfig): - """Quantizes the decoder layer based on the layer config.""" - self.decoder.attention.qkv = quantize_linear( - self.decoder.attention.qkv, self.quantization, layer.attention.qkv - ) - self.decoder.attention.dense = quantize_linear( - self.decoder.attention.dense, self.quantization, layer.attention.dense - ) - self.decoder.mlp.fc = quantize_linear(self.decoder.mlp.fc, self.quantization, layer.mlp.fc) - self.decoder.mlp.proj = quantize_linear(self.decoder.mlp.proj, self.quantization, layer.mlp.proj) - - if hasattr(self.decoder.mlp, "gate"): - self.decoder.mlp.gate = quantize_linear(self.decoder.mlp.gate, self.quantization, layer.mlp.gate) diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py deleted file mode 100644 index e05979fa75a0..000000000000 --- a/nemo/export/trt_llm/decoder/falcon.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.models.falcon.model import FalconDecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class FALCONDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The FALCON implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_mlp_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.mlp_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class FALCONDecoderLayerBuilder(DecoderLayerBuilder): - """The FALCON implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - # Falcon 7B: parallel_attention=True, new_decoder_architecture=False - # Falcon 40B/180B: parallel_attention=True, new_decoder_architecture=True - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - 
quantization=QuantConfig(), - max_lora_rank=layer.max_lora_rank, - use_parallel_embedding=False, - ) - - # No other way to pass in model variant config, determine model variant by num_layers (7B: 32 layers) - config.set_if_not_exist('new_decoder_architecture', False if self.num_layers == 32 else True) - config.set_if_not_exist('parallel_attention', True) - config.set_if_not_exist('layernorm_epsilon', 1e-5) - config.set_if_not_exist('bias', False) - config.set_if_not_exist('moe_num_experts', 0) - - return FalconDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py deleted file mode 100644 index 37f843dcf0ca..000000000000 --- a/nemo/export/trt_llm/decoder/gemma.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.models.gemma.model import GemmaDecoderLayer, QuantConfig -from tensorrt_llm.models.modeling_utils import PretrainedConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GemmaDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The LLAMA implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( 
- layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class GemmaDecoderLayerBuilder(DecoderLayerBuilder): - """The Gemma implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - head_size=layer.kv_channels, - hidden_act=self.hidden_act.split("-")[-1] if layer.moe_num_experts else non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - quantization=QuantConfig(), - max_lora_rank=layer.max_lora_rank, - ) - - config.set_if_not_exist('mlp_bias', False) - config.set_if_not_exist('attn_bias', False) - config.set_if_not_exist('rotary_base', layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('enable_pos_shift', False) - config.set_if_not_exist('dense_context_fmha', False) - config.set_if_not_exist('moe_num_experts', 0) - - return GemmaDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py deleted file mode 100644 index a405aabbbd48..000000000000 --- a/nemo/export/trt_llm/decoder/gpt.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Optional - -from tensorrt_llm.layers import AttentionMaskType, PositionEmbeddingType -from tensorrt_llm.models.gpt.model import GPTDecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GPTDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The GPT2 implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act - - @override - def infer_num_attention_heads(self, layer): - return layer.attn.num_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.attn.bias.shape[2] - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.ln_1, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.c_attn], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.attn.c_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.c_fc, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.c_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.ln_2, dtype=self.dtype) - - -class GPTDecoderLayerBuilder(DecoderLayerBuilder): - """The GPT implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_pct = layer.rotary_pct - - position_embedding_type = "rope_gpt_neox" if layer.position_embedding_type == "rope" else "learned_absolute" - - assert not (position_embedding_type == "rope_gpt_neox" and rotary_pct == 0.0) - - bias_qkv = layer.attention.qkv.bias is not None - - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=self.hidden_act, - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type=position_embedding_type, - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - max_lora_rank=layer.max_lora_rank, - quantization=QuantConfig(), - ) - - config.set_if_not_exist('hidden_act', self.hidden_act) - config.set_if_not_exist('apply_query_key_layer_scaling', False) - config.set_if_not_exist('bias', bias_qkv) - config.set_if_not_exist('rotary_base', 
layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('rotary_pct', rotary_pct) - config.set_if_not_exist('moe_num_experts', 0) - - return GPTDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gptj.py b/nemo/export/trt_llm/decoder/gptj.py deleted file mode 100644 index 327a31fdd35c..000000000000 --- a/nemo/export/trt_llm/decoder/gptj.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from tensorrt_llm.models.gptj.model import GPTJDecoderLayer -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GPTJDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The GPTJ implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - """Returns the hidden act fn in the MLP layer, e.g. SiLUActivation or NewGELUActivation.""" - return layer.mlp.act - - @override - def infer_num_attention_heads(self, layer): - return layer.attn.num_attention_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.attn.bias.shape[2] - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.ln_1, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.q_proj, layer.attn.k_proj, layer.attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.attn.out_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.rotary_dim = layer.attn.rotary_dim - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.fc_in, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.fc_out, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - # GPTJ do not have post layer_norm - return None - - -class GPTJDecoderLayerBuilder(DecoderLayerBuilder): - """The GPTJ implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - assert self.tensor_parallel == 1 and self.rank == 0, "Only single GPU is supported for GPTJ" - - return GPTJDecoderLayer( - hidden_size=self.hidden_size, - 
num_attention_heads=self.num_attention_heads, - max_position_embeddings=self.max_position_embeddings, - rotary_dim=layer.attention.rotary_dim, - dtype=self.dtype, - hidden_act=self.hidden_act, - tp_group=self.tp_group, - tp_size=self.tensor_parallel, - max_lora_rank=layer.max_lora_rank, - ) diff --git a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py deleted file mode 100644 index b37d62e214de..000000000000 --- a/nemo/export/trt_llm/decoder/llama.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.models.llama.model import LLaMADecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class LLAMADecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The LLAMA implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return 
LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class LLAMADecoderLayerBuilder(DecoderLayerBuilder): - """The LLAMA implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=self.hidden_act.split("-")[-1] if layer.moe_num_experts else non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - max_lora_rank=layer.max_lora_rank, - quantization=QuantConfig(), - ) - - config.set_if_not_exist('mlp_bias', False) - config.set_if_not_exist('attn_bias', False) - config.set_if_not_exist('rotary_base', layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('enable_pos_shift', False) - config.set_if_not_exist('dense_context_fmha', False) - config.set_if_not_exist('moe_num_experts', 0) - - if layer.moe_num_experts: - if layer.moe_num_experts is not None: - if layer.moe_top_k is None: - layer.moe_top_k = 1 - - layer.moe_tp_mode = MoeConfig.ParallelismMode.TENSOR_PARALLEL if layer.moe_tp_mode is None else None - layer.moe_renorm_mode = ( - MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE if layer.moe_renorm_mode is None else None - ) - moe_config = MoeConfig( - layer.moe_num_experts, layer.moe_top_k, layer.moe_tp_mode, layer.moe_renorm_mode - ) - moe_config.validate() - config.moe_num_experts = layer.moe_num_experts - config.moe_top_k = layer.moe_top_k - config.moe_tp_mode = layer.moe_tp_mode - config.moe_normalization_mode = layer.moe_renorm_mode - - return LLaMADecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/model_config.py b/nemo/export/trt_llm/model_config.py deleted file mode 100644 index 0f120dc56153..000000000000 --- a/nemo/export/trt_llm/model_config.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dataclasses -from dataclasses import dataclass, field -from typing import Dict, List, get_args, get_origin - -import numpy as np -import tensorrt as trt -import torch.nn as nn -from tensorrt_llm._utils import pad_vocab_size -from tensorrt_llm.functional import is_gated_activation -from transformers import LlamaConfig, PretrainedConfig -from transformers.models.llama.modeling_llama import LlamaRMSNorm - -from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, split, torch_to_numpy_with_dtype - - -DECODER_GPT2 = "gpt2" -DECODER_GPTJ = "gptj" -DECODER_LLAMA = "llama" -DECODER_GPTNEXT = "gptnext" -DECODER_FALCON = "falcon" -DECODER_GEMMA = "gemma" - -QUANTIZATION_NONE = "" -QUANTIZATION_FP8 = "fp8" -QUANTIZATION_INT8_SQ = "int8_sq" - -LINEAR_COLUMN = "column" -LINEAR_ROW = "row" - -LAYERNORM_DEFAULT = "" -LAYERNORM_RMS = "rms" - -LAYER_DEFAULT = "" -LAYER_QKV = "qkv" - - -@dataclass -class EmbeddingConfig: - """The embedding layer config.""" - - weight: np.array = None - # Whether the embedding weights are local - is_local: bool = False - - @staticmethod - def from_nn_module(module: nn.Module, dtype=trt.float16): - """Converts an nn.Module to an EmbeddingConfig.""" - return EmbeddingConfig(weight=torch_to_numpy_with_dtype(module.weight, dtype)) - - @property - def local_vocab_size(self): - """Infers the vocab_size from the embedding layer weights shape.""" - return self.weight.shape[0] - - @property - def hidden_size(self): - """Infers the hidden_size from the embedding layer weights shape.""" - return self.weight.shape[1] - - -@dataclass -class LayernormConfig: - """The layernorm layer config.""" - - weight: np.array = None - bias: np.array = None - layernorm_type: str = LAYERNORM_DEFAULT - - @staticmethod - def from_nn_module(module: nn.Module, dtype=trt.float16): - """Converts an nn.Module to an LayernormConfig.""" - layernorm_type = LAYERNORM_RMS if type(module) is LlamaRMSNorm else LAYERNORM_DEFAULT - - config = LayernormConfig(weight=torch_to_numpy_with_dtype(module.weight, dtype), layernorm_type=layernorm_type) - if layernorm_type == LAYERNORM_DEFAULT: - config.bias = torch_to_numpy_with_dtype(module.bias, dtype) - - return config - - -@dataclass -class LinearConfig: - """The linear layer config.""" - - linear_type: str = "" - weight: np.array = None - bias: np.array = None - activation_scaling_factor: np.array = None - weights_scaling_factor: np.array = None - prequant_scaling_factor: np.array = None - layer_type: str = LAYER_DEFAULT - - @staticmethod - def from_nn_module(module: nn.Module, linear_type: str, rank=0, tensor_parallel=1, dtype=trt.float16): - """Converts an nn.Module to an LinearConfig.""" - weight = torch_to_numpy_with_dtype(module.weight, dtype) - if "Conv1D" in type(module).__name__: - weight = weight.transpose() - else: - assert type(module) is nn.Linear - - config = LinearConfig() - config.linear_type = linear_type - config.weight = np.ascontiguousarray( - split(weight, tensor_parallel, rank, dim=0 if linear_type == LINEAR_COLUMN else 1) - ) - - if hasattr(module, "bias") and module.bias is not None: - if linear_type == LINEAR_COLUMN: - config.bias = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(module.bias, dtype), - tensor_parallel, - rank, - ) - ) - else: - config.bias = torch_to_numpy_with_dtype(module.bias, dtype) - - return config - - @staticmethod - def from_qkv_nn_modules(qkv_modules: List[nn.Module], rank=0, tensor_parallel=1, dtype=trt.float16): - """Converts the qkv modules to an LinearConfig.""" - config = LinearConfig() 
- config.linear_type = LINEAR_COLUMN - config.layer_type = LAYER_QKV - if len(qkv_modules) == 1: - # QKV layers combined as a single module, e.g. GPT2 - qkv_module = qkv_modules[0] - assert "Conv1D" in type(qkv_module).__name__ - - qkv_shape = qkv_module.weight.shape - # Decode the concat QKV weights and split them to different GPU rank. - config.weight = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(qkv_module.weight, dtype=dtype).reshape( - qkv_shape[0], 3, qkv_shape[-1] // 3 - ), - tensor_parallel, - rank, - dim=-1, - ) - .reshape(qkv_shape[0], -1) - .transpose() - ) - config.bias = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(qkv_module.bias, dtype=dtype).reshape(3, qkv_shape[-1] // 3), - tensor_parallel, - rank, - dim=-1, - ).reshape(-1) - ) - - elif len(qkv_modules) == 3: - # Separate QKV layers - for m in qkv_modules: - assert type(m) is nn.Linear - assert not (hasattr(m, "bias") and m.bias is not None) - - q_weight = split(torch_to_numpy_with_dtype(qkv_modules[0].weight), tensor_parallel, rank) - k_weight = split(torch_to_numpy_with_dtype(qkv_modules[1].weight), tensor_parallel, rank) - v_weight = split(torch_to_numpy_with_dtype(qkv_modules[2].weight), tensor_parallel, rank) - split_v = np.concatenate((q_weight, k_weight, v_weight)) - config.weight = np.ascontiguousarray(split_v) - - else: - assert False, f"QKV modules format {qkv_modules} not supported" - - return config - - -@dataclass -class MoEMLPConfig: - """The MLP layer config.""" - - fc1: LinearConfig = None - fc2: LinearConfig = None - router: LinearConfig = None - hidden_act: str = "" - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `MLPConfig`.""" - mlp = MoEMLPConfig(hidden_act=llm_config.activation_function) - mlp.fc1 = LinearConfig(linear_type=LINEAR_COLUMN) - - mlp.fc1.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc1.weight.{rank}" - ) - - mlp.fc1.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc1.bias.{rank}" - ) - - mlp.fc2 = LinearConfig(linear_type=LINEAR_ROW) - mlp.fc2.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc2.weight.{rank}" - ) - mlp.fc2.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc2.bias.{rank}" - ) - - mlp.router = LinearConfig(linear_type=LINEAR_ROW) - mlp.router.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.router.weight.{rank}") - return mlp - - -@dataclass -class AttentionConfig: - """The attention layer config.""" - - qkv: LinearConfig = None - dense: LinearConfig = None - - rotary_dim: int = -np.inf - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - layer_id: int, - rank: int = 0, - ): - """Converts the nemo weights and config to `AttentionConfig`.""" - attention = AttentionConfig() - attention.qkv = LinearConfig(linear_type=LINEAR_COLUMN, layer_type=LAYER_QKV) - attention.qkv.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.attention.query_key_value.weight.{rank}" - ) - attention.qkv.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.attention.query_key_value.bias.{rank}", - ) - - attention.dense = LinearConfig(linear_type=LINEAR_ROW) - attention.dense.weight = get_tensor_from_dict(weights_dict, 
f"layers.{layer_id}.attention.dense.weight.{rank}") - attention.dense.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.attention.dense.bias", - ) - return attention - - -@dataclass -class MLPConfig: - """The MLP layer config.""" - - fc: LinearConfig = None - gate: LinearConfig = None - proj: LinearConfig = None - hidden_act: str = "" - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `MLPConfig`.""" - mlp = MLPConfig(hidden_act=llm_config.activation_function) - mlp.fc = LinearConfig(linear_type=LINEAR_COLUMN) - mlp.fc.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.weight.{rank}") - - # print("********** mlp.fc.weight : ", mlp.fc.weight ) - - mlp.fc.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}", - ) - - gated = is_gated_activation(mlp.hidden_act) - is_fast_glu = mlp.hidden_act in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] - if gated: - mlp.gate = LinearConfig(linear_type=LINEAR_COLUMN) - layer_name = ( - f"layers.{layer_id}.mlp.dense_h_to_4h_2.weight.{rank}" - if isinstance(llm_config, LlamaConfig) and not is_mcore and not is_fast_glu - else f"layers.{layer_id}.mlp.dense_h_to_4h.gate.weight.{rank}" - ) - mlp.gate.weight = get_tensor_from_dict( - weights_dict, - layer_name, - ) - mlp.gate.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", - ) - - mlp.proj = LinearConfig(linear_type=LINEAR_ROW) - mlp.proj.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_4h_to_h.weight.{rank}") - mlp.proj.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_4h_to_h.bias") - return mlp - - -@dataclass -class DecoderLayerConfig: - """The decoder layer config.""" - - decoder_type: str = "" - input_layernorm: LayernormConfig = None - mlp_layernorm: LayernormConfig = None # Falcon 40B/180B has mlp_layernorm - attention: AttentionConfig = None - post_layernorm: LayernormConfig = None - mlp: MLPConfig = None - - num_attention_heads: int = 0 - - num_kv_heads: int = 0 - kv_channels: int = None - max_position_embeddings: int = 0 - rotary_pct: float = 0 - rotary_base: int = 10000 - rotary_scaling: float = None - position_embedding_type: str = None - - moe_num_experts: int = None - moe_top_k: int = None - moe_tp_mode: int = None - moe_renorm_mode: int = None - - vocab_size: int = 0 - norm_epsilon: float = 0.0 - max_lora_rank: int = 64 - - @property - def is_moe(self): - return self.moe_num_experts is not None and self.moe_num_experts > 1 - - @property - def hidden_size(self): - """Returns the hidden size of the transformer model.""" - if self.is_moe: - return self.mlp.fc2.weight.shape[1] - else: - return self.mlp.fc.weight.shape[1] - - @property - def ffn_hidden_size_local(self): - """Returns the ffn hidden size of the transformer model.""" - if self.is_moe: - return self.mlp.fc2.weight.shape[-1] - else: - return self.mlp.fc.weight.shape[0] - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - decoder_type: str, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `DecoderLayerConfig`.""" - layer_config = DecoderLayerConfig( - decoder_type=decoder_type, - num_attention_heads=llm_config.n_head, - max_position_embeddings=llm_config.n_positions, - 
rotary_pct=llm_config.rotary_pct if hasattr(llm_config, "rotary_pct") else 1.0, - rotary_base=(llm_config.rotary_base if hasattr(llm_config, "rotary_base") else 10000), - rotary_scaling=(llm_config.rotary_scaling if hasattr(llm_config, "rotary_scaling") else None), - position_embedding_type=( - llm_config.position_embedding_type if hasattr(llm_config, "position_embedding_type") else None - ), - num_kv_heads=(llm_config.num_kv_heads if hasattr(llm_config, "num_kv_heads") else 0), - kv_channels=(llm_config.kv_channels if hasattr(llm_config, "kv_channels") else None), - moe_num_experts=(llm_config.moe_num_experts if hasattr(llm_config, "moe_num_experts") else None), - moe_top_k=(llm_config.moe_top_k if hasattr(llm_config, "moe_top_k") else None), - moe_tp_mode=(llm_config.moe_tp_mode if hasattr(llm_config, "moe_tp_mode") else None), - moe_renorm_mode=(llm_config.moe_renorm_mode if hasattr(llm_config, "moe_renorm_mode") else None), - vocab_size=llm_config.vocab_size, - norm_epsilon=llm_config.norm_epsilon, - ) - layer_config.input_layernorm = LayernormConfig() - layer_config.input_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - layer_config.input_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.input_layernorm.weight", - ) - layer_config.input_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.input_layernorm.bias", - ) - - layer_config.mlp_layernorm = LayernormConfig() - layer_config.mlp_layernorm.layernorm_type = LAYERNORM_DEFAULT # Falcon uses default layernorm - layer_config.mlp_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.pre_mlp_layernorm.weight", - ) - layer_config.mlp_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.pre_mlp_layernorm.bias", - ) - - layer_config.post_layernorm = LayernormConfig() - layer_config.post_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - - layer_config.post_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.post_attention_layernorm.weight", - ) - layer_config.post_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.post_attention_layernorm.bias", - ) - - if layer_config.post_layernorm.weight is None: # Falcon doesn't have post layernorm - layer_config.post_layernorm = None - - if layer_config.mlp_layernorm.weight is None: - layer_config.mlp_layernorm = None - - layer_config.attention = AttentionConfig.from_nemo( - weights_dict, - layer_id, - rank, - ) - - moe = False - if llm_config.moe_num_experts is not None: - if llm_config.moe_num_experts > 1: - moe = True - - if moe: - layer_config.mlp = MoEMLPConfig.from_nemo(weights_dict, llm_config, layer_id, rank, is_mcore) - else: - layer_config.mlp = MLPConfig.from_nemo(weights_dict, llm_config, layer_id, rank, is_mcore) - - return layer_config - - -def _from_dict(class_type, data): - """Helper function to load the data as a class_type. 
class_type must be a dataclass.""" - if data is None: - return None - - if dataclasses.is_dataclass(class_type): - fieldtypes = {f.name: f.type for f in dataclasses.fields(class_type)} - return class_type(**{f: _from_dict(fieldtypes[f], data[f]) for f in data}) - elif get_origin(class_type) == list and dataclasses.is_dataclass(get_args(class_type)[0]): - list_value = [] - for child in data: - child_class_type = get_args(class_type)[0] - list_value.append(_from_dict(child_class_type, child)) - return list_value - else: - return data - - -@dataclass -class ModelConfig: - """The full LLM model config that includes the full information needed for tensorrt_llm engine building. - - This class includes all the fields that tensorrt_llm supports, but not all of the fields are required. - """ - - # Global metadata - quantization: str = QUANTIZATION_NONE - dtype: str = "float16" - - # Model structure and weights - vocab_embedding: EmbeddingConfig = None - positional_embedding: EmbeddingConfig = None - layers: List[DecoderLayerConfig] = field(default_factory=list) - final_layernorm: LayernormConfig = None - lm_head: LinearConfig = None - - # Ptuning metadata - use_prompt_tuning: bool = False - use_parallel_embedding: bool = False - max_lora_rank: int = 64 - - # Parallel metadata - mapping = None - - def to_dict(self) -> dict: - """Converts the instance to a python dict.""" - return dataclasses.asdict(self) - - @staticmethod - def from_dict(d: dict): - """Load a dict to a `ModelConfig` instance.""" - return _from_dict(ModelConfig, d) - - @property - def vocab_size(self): - """Returns the vocab_size of the model.""" - return ( - self.vocab_embedding.local_vocab_size * self.mapping.tp_size - if self.vocab_embedding.is_local - else self.vocab_embedding.local_vocab_size - ) - - @property - def vocab_size_padded(self): - """Returns the padded vocab_size of the model rounds to the tensor_parallel.""" - return pad_vocab_size(self.vocab_size, self.mapping.tp_size) - - @property - def hidden_size(self): - """Returns the hidden_size of the model.""" - return self.vocab_embedding.hidden_size - - @property - def max_position_embeddings(self): - """Returns the max_position_embedding of the model.""" - return self.layers[0].max_position_embeddings - - @property - def num_attention_heads(self): - """Returns the num_attention_heads of the model.""" - return self.layers[0].num_attention_heads - - @property - def num_kv_heads(self): - """Returns the num_key_value_heads of the model.""" - return self.layers[0].num_kv_heads if self.layers[0].num_kv_heads > 0 else self.num_attention_heads - - @property - def head_size(self): - """Returns the head_size of the model.""" - return self.layers[0].kv_channels - - @property - def hidden_act(self): - """Returns the hidden_act of the model.""" - return self.layers[0].mlp.hidden_act diff --git a/nemo/export/trt_llm/model_config_trt.py b/nemo/export/trt_llm/model_config_trt.py deleted file mode 100644 index 635f6ae4d807..000000000000 --- a/nemo/export/trt_llm/model_config_trt.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -from pathlib import Path -from typing import List, Union - -from nemo.export.trt_llm.model_config import ModelConfig -from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder - - -def model_config_to_tensorrt_llm( - model_configs: List[ModelConfig], - engine_dir: Union[str, Path], - world_size: int = 1, - max_input_len: int = 200, - max_output_len: int = 200, - max_batch_size: int = 1, - max_beam_width: int = 1, - max_prompt_embedding_table_size: int = 0, - use_inflight_batching: bool = False, - paged_kv_cache: bool = False, - enable_context_fmha: bool = True, - enable_multi_block_mode: bool = False, - use_refit: bool = False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, -): - """The API to convert a torch or huggingface model represented as ModelConfig to tensorrt_llm. - - Args: - model_configs: The list of ModelConfig converted, 1 for each GPU. - engine_dir: The target output directory to save the built tensorrt_llm engines. - gpus: the number of inference gpus for multi gpu inferencing. - max_input_len: The max input sequence length. - max_output_len: The max output sequence length. - max_batch_size: The max batch size. - max_beam_width: The max beam search width. - max_prompt_embedding_table_size: max size of the prompt embedding table. - use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend. - paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. - enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention. - enable_multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. 
- """ - engine_dir = Path(engine_dir) - if os.path.exists(engine_dir): - shutil.rmtree(engine_dir) - - for rank in range(world_size): - model_configs[rank].use_prompt_tuning = max_prompt_embedding_table_size > 0 - model_configs[rank].max_lora_rank = max_lora_rank - builder = LMHeadModelBuilder(model_configs[rank]) - builder.build( - output_dir=engine_dir, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - parallel_build=False, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_refit=use_refit, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 7598b3f6825f..aa2a29888703 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -23,21 +23,6 @@ weights_dict = {} -def cpu_map_location(storage, loc): - return storage.cpu() - - -def gpu_map_location(storage, loc): - if loc.startswith("cuda"): - training_gpu_idx = int(loc.split(":")[1]) - inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() - return storage.cuda(inference_gpu_idx) - elif loc.startswith("cpu"): - return storage.cpu() - else: - raise ValueError(f"Not handled {loc}") - - def save_val(val, dir, key, tp_num=None): suffix = "" if tp_num is None else f".{tp_num}.bin" # Transpose linear layer weights to the correct shape. @@ -411,137 +396,3 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t global weights_dict return weights_dict - - -# Similar to split_save_weight but done on GPU for performance -@torch.no_grad() -def save_weight_torch(tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config): - def save_tranpose(val, key, shared=False): - if shared or tp_rank is None: - suffix = "bin" - else: - suffix = f"{tp_rank}.bin" - - # Transpose linear layer weights to the correct shape. 
- assert torch.is_tensor(val) - if len(val.shape) >= 2: - val = val.reshape(val.shape[0], -1) - val = torch.transpose(val, 0, 1) - val = val.contiguous().to("cpu", non_blocking=True) - - if type(saved_dir) is dict: - saved_dir[f"model.{key}.{suffix}"] = val - else: - global weights_dict - weights_dict[f"model.{key}.{suffix}"] = val - - use_attention_nemo_shape = config.get("use_attention_nemo_shape", False) - split_gated_activation = config.get("split_gated_activation", False) - num_attention_heads = config.get("num_attention_heads", 0) - tp_size = config.get("tp_size", 1) - num_kv_heads = config.get("num_kv_heads", num_attention_heads) - - if not isinstance(vals, list): - vals = [vals] - - if config.get("transpose_weights", False) and vals[0].ndim == 2: - vals = [val.T for val in vals] - if "layernorm.weight" in key and config.get("apply_layernorm_1p", False): - vals = [val + 1.0 for val in vals] - - gpu_vals = [val.to(storage_type) for val in vals] - gpu_val = gpu_vals[0] - - if ( - "input_layernorm.weight" in key - or "input_layernorm.bias" in key - or "pre_mlp_layernorm.weight" in key - or "pre_mlp_layernorm.bias" in key - or "attention.dense.bias" in key - or "attention.linear_proj.bias" in key - or "post_attention_layernorm.weight" in key - or "post_attention_layernorm.bias" in key - or "post_self_attn_layernorm.weight" in key - or "mlp.dense_4h_to_h.bias" in key - or "mlp.linear_fc2.bias" in key - or "final_layernorm.weight" in key - or "final_layernorm.bias" in key - ): - if "post_self_attn_layernorm.weight" in key: - key = key.replace("post_self_attn_layernorm.weight", "post_attention_layernorm.weight") - elif "mlp.linear_fc2.bias" in key: - key = key.replace("mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias") - elif "attention.linear_proj.bias" in key: - key = key.replace("attention.linear_proj.bias", "attention.dense.bias") - - save_tranpose(gpu_val, key, shared=True) - elif ( - "attention.dense.weight" in key - or "mlp.dense_4h_to_h.weight" in key - or "attention.linear_proj.weight" in key - or "mlp.linear_fc2.weight" in key - ): - if "attention.linear_proj.weight" in key: - key = key.replace("attention.linear_proj.weight", "attention.dense.weight") - elif "mlp.linear_fc2.weight" in key: - key = key.replace("mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight") - save_tranpose(gpu_val, key) - elif ( - "mlp.dense_h_to_4h.weight" in key - or "mlp.dense_h_to_4h.bias" in key - or "mlp.linear_fc1.weight" in key - or "mlp.linear_fc1.bias" in key - ): - if split_gated_activation: - val, gate = torch.chunk(gpu_val, 2, axis=-1) - else: - val, gate = None, None - - if "mlp.linear_fc1" in key: - key = key.replace("mlp.linear_fc1", "mlp.dense_h_to_4h") - - save_tranpose(val, key) - - if split_gated_activation: - prefix, dot, suffix = key.rpartition(".") - key = prefix + ".gate" + dot + suffix - save_tranpose(gate, key) - - elif "mlp.dense_h_to_4h_2.weight" in key or "mlp.dense_h_to_4h_2.bias" in key: - save_tranpose(gpu_val, key) - - elif "attention.query_key_value.bias" in key or "attention.linear_qkv.bias" in key: - raise NotImplementedError("Attention QKV bias not implemented") - - elif "attention.query_key_value.weight" in key or "attention.linear_qkv.weight" in key: - assert use_attention_nemo_shape, "Only support NEMO shape for QKV weights" - hidden_dim = vals[0].shape[0] - size_per_head = hidden_dim // num_attention_heads - q_num = num_attention_heads // num_kv_heads - - len_vals = len(vals) - gpu_val = gpu_val.reshape(hidden_dim, num_kv_heads * len_vals // tp_size, q_num + 2, 
size_per_head) - - # Split the QKV to separate variables. - # [qqqqkkvv] - > [qqqq,kk,vv] - qkv = torch.split(gpu_val, [q_num, 1, 1], dim=2) - split_vals = torch.concatenate( - [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1 - ) - - if "attention.linear_qkv.weight" in key: - key = key.replace("attention.linear_qkv.weight", "attention.query_key_value.weight") - save_tranpose(split_vals, key) - - elif ( - "attention.query.weight" in key - or "attention.query.bias" in key - or "attention.key_value.weight" in key - or "attention.key_value.bias" in key - ): - pass - else: - print(f"[WARNING] {key} not handled by converter") - - global weights_dict - return weights_dict diff --git a/nemo/export/trt_llm/nemo/nemo.py b/nemo/export/trt_llm/nemo/nemo.py index c3564f1c4e8e..6276de5dddd9 100644 --- a/nemo/export/trt_llm/nemo/nemo.py +++ b/nemo/export/trt_llm/nemo/nemo.py @@ -23,11 +23,25 @@ from transformers import FalconConfig, GPT2Config, LlamaConfig from nemo.export.tarutils import TarPath -from nemo.export.trt_llm.nemo.convert import cpu_map_location, gpu_map_location LOGGER = logging.getLogger("NeMo") +def cpu_map_location(storage, loc): + return storage.cpu() + + +def gpu_map_location(storage, loc): + if loc.startswith("cuda"): + training_gpu_idx = int(loc.split(":")[1]) + inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() + return storage.cuda(inference_gpu_idx) + elif loc.startswith("cpu"): + return storage.cpu() + else: + raise ValueError(f"Not handled {loc}") + + def nemo_to_llm_config(nemo_model_config, vocab_size, eos_id, bos_id, decoder_type): convertion_dict = { "activation_function": "activation", diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index 8112bb8755e3..d83129b43fab 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -35,7 +35,7 @@ from transformers import AutoTokenizer, GPT2Tokenizer, LlamaConfig from nemo.export.tarutils import TarPath, ZarrPathStore -from nemo.export.trt_llm.nemo.convert import save_weight_torch, split_and_save_weight +from nemo.export.trt_llm.nemo.convert import split_and_save_weight from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir, extract_layers_with_prefix, nemo_to_llm_config from nemo.export.trt_llm.nemo.sentencepiece_tokenizer import SentencePieceTokenizer diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index d735cab36b00..7e687ce020da 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -14,20 +14,16 @@ import argparse -import copy import csv import datetime import logging import os -import shutil import sys -import tempfile from pathlib import Path from typing import Dict, List, Tuple, Union import numpy as np import tensorrt_llm -from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig @@ -35,24 +31,79 @@ from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer from nemo.export.tarutils import TarPath -from nemo.export.trt_llm.decoder import DECODER_MODEL_TYPE -from nemo.export.trt_llm.model_config import ( - LAYERNORM_DEFAULT, - LAYERNORM_RMS, - LINEAR_COLUMN, - DecoderLayerConfig, - EmbeddingConfig, - LayernormConfig, - LinearConfig, - ModelConfig, -) from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir -from 
nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint, convert_nemo_model -from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, get_tensor_parallel_group, split +from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint + + +DECODER_MODEL_TYPE = { + "gptj": 'GPTForCausalLM', + "gptnext": 'GPTForCausalLM', + "llama": 'LLaMAForCausalLM', + "gemma": 'GemmaForCausalLM', + "falcon": 'FalconForCausalLM', +} LOGGER = logging.getLogger("NeMo") +def prompt_convert(prompt_config, prompt_weights): + if "task_templates" in prompt_config: + prompt_templates = prompt_config["task_templates"] + actual_task_id = 0 + vtokens_embeddings = [] + vtokens_len = [] + for task_name_id, prompt_task in enumerate(prompt_templates): + prompt_task_name = prompt_task["taskname"] + LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") + prompt_task_weights = prompt_weights["prompt_table"].get( + f"prompt_table.{prompt_task_name}.prompt_embeddings.weight" + ) + if prompt_task_weights is None: + continue + vtokens_embeddings.append(prompt_task_weights) + vtokens_len.append(prompt_task_weights.shape[0]) + actual_task_id += 1 + + max_vtoken_len = max(vtokens_len) + embedding_dim = vtokens_embeddings[0].shape[1] + + # pad tasks to longest task embedding table + for i, vtoken_emb_table in enumerate(vtokens_embeddings): + padded_table = torch.zeros((max_vtoken_len, embedding_dim)) + padded_table[: vtoken_emb_table.shape[0], :] = vtoken_emb_table + vtokens_embeddings[i] = padded_table + + vtokens_embeddings = torch.stack(vtokens_embeddings) + else: + vtokens_embeddings = prompt_weights["prompt_embeddings_weights"] + + return vtokens_embeddings + + +def is_nemo_file(path): + flag = False + + if path is not None: + if len(path) > 5: + pc = pathlib.Path(path) + if pc.exists(): + if pc.is_file(): + if path[-5 : len(path)] == ".nemo": + flag = True + + return flag + + +def split(v, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx]) + else: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) + + def _nemo_llm_decode( in_file: str, out_dir: str, @@ -123,83 +174,6 @@ def get_tokenzier(tokenizer_dir_or_path: Path) -> PreTrainedTokenizer: return build_tokenizer(tokenizer_config) -def nemo_llm_to_model_config( - in_file: str, - decoder_type: str, - nemo_export_dir: Union[str, Path], - dtype: str = "bfloat16", - tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, - save_nemo_model_config: bool = False, -) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: - """Converts the NEMO file and construct the `ModelConfig` before tensorrt_llm deployment.""" - dtype_str = dtype - - weights_dict, llm_model_config, tokenizer = _nemo_llm_decode( - in_file=in_file, - out_dir=nemo_export_dir, - tensor_parallelism=tensor_parallel_size, - processes=1, - storage_type=dtype_str, - load_checkpoints_on_gpu=False, - decoder_type=decoder_type, - save_nemo_model_config=save_nemo_model_config, - ) - - world_size = tensor_parallel_size * pipeline_parallel_size - model_config_template = ModelConfig() - model_config_template.dtype = dtype_str - - str_dtype_to_trt(dtype_str) - - model_configs = [] - for i in range(world_size): - - model_configs.append(copy.deepcopy(model_config_template)) - - model_configs[i].vocab_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wte")) - - 
model_configs[i].positional_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wpe")) - - model_configs[i].final_layernorm = LayernormConfig( - weight=get_tensor_from_dict(weights_dict, "final_layernorm.weight"), - bias=get_tensor_from_dict(weights_dict, "final_layernorm.bias"), - ) - model_configs[i].final_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_model_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - model_configs[i].mapping = tensorrt_llm.Mapping( - world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size - ) - - for i in range(llm_model_config.n_layer): - for j in range(world_size): - model_configs[j].layers.append( - DecoderLayerConfig.from_nemo( - weights_dict=weights_dict, - llm_config=llm_model_config, - decoder_type=decoder_type, - layer_id=i, - rank=model_configs[j].mapping.tp_rank, - is_mcore=llm_model_config.is_mcore, - ) - ) - - lm_head_weight = get_tensor_from_dict(weights_dict, "lm_head.weight") - - if model_configs[0].vocab_size_padded != model_configs[0].vocab_size: - pad_width = model_configs[0].vocab_size_padded - model_configs[0].vocab_size - lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) - - for i in range(world_size): - model_configs[i].lm_head = LinearConfig(linear_type=LINEAR_COLUMN) - model_configs[i].lm_head.weight = np.ascontiguousarray( - split(lm_head_weight, model_configs[i].mapping.tp_size, model_configs[i].mapping.tp_rank) - ) - - return model_configs, tokenizer - - def to_word_list_format( word_dict: List[List[str]], tokenizer=None, @@ -258,83 +232,6 @@ def to_word_list_format( return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) -def nemo_llm_model_to_model_config( - nemo_model: str, - decoder_type: str, - nemo_model_config: str, - dtype_str: str = "float32", -) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: - """Converts the NEMO model object and construct the `ModelConfig` before tensorrt_llm deployment.""" - from megatron.core import parallel_state - - assert nemo_model_config is not None, "gpt_model_config must be provided when in is a nemo model" - - weights_dict, llm_model_config = convert_nemo_model(nemo_model, nemo_model_config, dtype_str, decoder_type) - is_mcore = nemo_model_config.get("mcore_gpt", False) - llm_model_config.is_mcore = is_mcore - - model_config = ModelConfig() - model_config.use_prompt_tuning = False - model_config.dtype = dtype_str - model_config.use_parallel_embedding = True - str_dtype_to_trt(dtype_str) - - model_config.vocab_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wte"), is_local=True) - - model_config.positional_embedding = EmbeddingConfig( - weight=get_tensor_from_dict(weights_dict, "wpe"), is_local=True - ) - - model_config.final_layernorm = LayernormConfig( - weight=get_tensor_from_dict(weights_dict, "final_layernorm.weight"), - bias=get_tensor_from_dict(weights_dict, "final_layernorm.bias"), - ) - model_config.final_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_model_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - - tensor_parallel_size = nemo_model_config.tensor_model_parallel_size - pipeline_parallel_size = 1 - world_size = tensor_parallel_size * pipeline_parallel_size - - # hack since tensorrt_llm doesnt support DP natively so init all ranks with DP=1 - model_config.mapping = tensorrt_llm.Mapping( - world_size=tensor_parallel_size * pipeline_parallel_size, - rank=tensorrt_llm.mpi_rank() % world_size, - 
tp_size=tensor_parallel_size, - pp_size=pipeline_parallel_size, - ) - model_config.mapping.rank = tensorrt_llm.mpi_rank() - model_config.mapping.tp_group = get_tensor_parallel_group(tensor_parallel_size) - - LOGGER.info( - f'''Resharing: Rank {tensorrt_llm.mpi_rank()} mapping: - tp_rank {parallel_state.get_tensor_model_parallel_rank()} -> {model_config.mapping.tp_rank}, - pp_rank {parallel_state.get_pipeline_model_parallel_rank()} -> {model_config.mapping.pp_rank}, - tp_group {model_config.mapping.tp_group}''' - ) - - for i in range(llm_model_config.n_layer): - model_config.layers.append( - DecoderLayerConfig.from_nemo( - weights_dict=weights_dict, - llm_config=llm_model_config, - decoder_type=decoder_type, - layer_id=i, - rank=model_config.mapping.tp_rank, - is_mcore=llm_model_config.is_mcore, - ) - ) - lm_head_weight = get_tensor_from_dict(weights_dict, "lm_head.weight") - - assert model_config.vocab_size_padded == model_config.vocab_size - - model_config.lm_head = LinearConfig(linear_type=LINEAR_COLUMN) - model_config.lm_head.weight = lm_head_weight - - return [model_config] - - def nemo_to_trtllm_config( in_file: str, decoder_type: str, diff --git a/nemo/export/trt_llm/quantization_utils.py b/nemo/export/trt_llm/quantization_utils.py deleted file mode 100644 index 86365f774bb7..000000000000 --- a/nemo/export/trt_llm/quantization_utils.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np -from tensorrt_llm.layers import Linear, RowLinear -from tensorrt_llm.quantization.layers import FP8Linear, FP8RowLinear, Int8SmoothQuantLinear, Int8SmoothQuantRowLinear - -from nemo.export.trt_llm.model_config import ( - QUANTIZATION_FP8, - QUANTIZATION_INT8_SQ, - QUANTIZATION_NONE, - LinearConfig, - ModelConfig, -) - - -def quantize_linear(tensorrt_llm_layer, quantization: str, layer_config: LinearConfig): - """Returns the quantized tensorrt_llm linear layer.""" - if quantization == QUANTIZATION_NONE: - return tensorrt_llm_layer - - if quantization == QUANTIZATION_FP8: - # FP8 is not sensitive to scaling factors. So we just quantize all layers possible. 
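The `nemo_llm_to_model_config` function removed above pads the LM head along the vocabulary dimension and then splits it column-parallel, so each tensor-parallel rank receives an equally sized shard. A minimal standalone sketch of that step, with made-up sizes and no TensorRT-LLM dependency, could look like this:

```python
import numpy as np

hidden_size = 8
vocab_size = 50257   # illustrative GPT-style vocabulary size
tp_size = 2

# Pad the vocabulary dimension up to a multiple of tp_size so the split is even.
vocab_size_padded = ((vocab_size + tp_size - 1) // tp_size) * tp_size
pad_width = vocab_size_padded - vocab_size

lm_head_weight = np.random.rand(vocab_size, hidden_size).astype(np.float32)
if pad_width > 0:
    # Zero-fill the extra rows, matching the constant padding in the removed code.
    lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)

# Column-parallel split: each tensor-parallel rank owns a contiguous block of vocab rows.
shards = [np.ascontiguousarray(s) for s in np.split(lm_head_weight, tp_size, axis=0)]
assert all(s.shape == (vocab_size_padded // tp_size, hidden_size) for s in shards)
```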
- default_scaling_factor = np.array([1], dtype=np.float32) - if layer_config.activation_scaling_factor is None: - layer_config.activation_scaling_factor = default_scaling_factor - if layer_config.weights_scaling_factor is None: - layer_config.weights_scaling_factor = default_scaling_factor - - if layer_config.activation_scaling_factor is None or layer_config.weights_scaling_factor is None: - print(f"No valid scaling factors in {tensorrt_llm_layer._get_name()}, skipping quantization" " on this layer") - return tensorrt_llm_layer - else: - assert np.all(layer_config.activation_scaling_factor > 0) - assert np.all(layer_config.weights_scaling_factor > 0) - - bias = tensorrt_llm_layer.bias is not None - - linear_layer_type = type(tensorrt_llm_layer) - if linear_layer_type == Linear: - if quantization == QUANTIZATION_FP8: - linear = FP8Linear - elif quantization == QUANTIZATION_INT8_SQ: - linear = Int8SmoothQuantLinear - else: - assert False, f"{quantization} is not supported." - quantized_linear_layer = linear( - in_features=tensorrt_llm_layer.in_features, - out_features=tensorrt_llm_layer.out_features * tensorrt_llm_layer.tp_size, - bias=bias, - dtype=tensorrt_llm_layer.dtype, - tp_group=tensorrt_llm_layer.tp_group, - tp_size=tensorrt_llm_layer.tp_size, - gather_output=tensorrt_llm_layer.gather_output, - ) - elif linear_layer_type == RowLinear: - if quantization == QUANTIZATION_FP8: - row_linear = FP8RowLinear - elif quantization == QUANTIZATION_INT8_SQ: - row_linear = Int8SmoothQuantRowLinear - else: - assert False, f"{quantization} is not supported." - quantized_linear_layer = row_linear( - in_features=tensorrt_llm_layer.in_features * tensorrt_llm_layer.tp_size, - out_features=tensorrt_llm_layer.out_features, - bias=bias, - dtype=tensorrt_llm_layer.dtype, - tp_group=tensorrt_llm_layer.tp_group, - tp_size=tensorrt_llm_layer.tp_size, - ) - else: - assert False, f"{linear_layer_type} is not supported." - - quantized_linear_layer.weight = tensorrt_llm_layer.weight - quantized_linear_layer.bias = tensorrt_llm_layer.bias - - quantized_linear_layer.activation_scaling_factor.value = layer_config.activation_scaling_factor - quantized_linear_layer.weights_scaling_factor.value = layer_config.weights_scaling_factor - - if hasattr(quantized_linear_layer, "prequant_scaling_factor"): - quantized_linear_layer.prequant_scaling_factor.value = layer_config.prequant_scaling_factor - - return quantized_linear_layer - - -def naive_quantization(config: ModelConfig, quantization: str): - """Generates a constant scaling factor (1) with target quantization. - - This is for debugging and performance measurement only. - """ - config.quantization = quantization - # Here the scaling factor is not inversed. 
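The `quantize_linear` helper being deleted here is essentially a two-way dispatch: the replacement class depends on the original layer type (`Linear` vs `RowLinear`) and on the quantization mode (FP8 vs INT8 SmoothQuant). The table below is a hypothetical restatement of that dispatch using placeholder class names rather than the real TensorRT-LLM layers:

```python
# Placeholder names standing in for the real TensorRT-LLM quantized layer classes.
QUANTIZED_LINEAR_CLASSES = {
    ("Linear", "fp8"): "FP8Linear",
    ("Linear", "int8_sq"): "Int8SmoothQuantLinear",
    ("RowLinear", "fp8"): "FP8RowLinear",
    ("RowLinear", "int8_sq"): "Int8SmoothQuantRowLinear",
}


def pick_quantized_class(layer_type: str, quantization: str) -> str:
    """Return the name of the quantized replacement for a given linear layer type."""
    try:
        return QUANTIZED_LINEAR_CLASSES[(layer_type, quantization)]
    except KeyError:
        raise ValueError(f"{quantization} is not supported for {layer_type}") from None


assert pick_quantized_class("RowLinear", "fp8") == "FP8RowLinear"
```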
- # In nvidia systems: - # pytorch_quantization uses inv scale - # onnx & trt uses non-inv scale - # cask uses inv scale - default_scaling_factor = np.array([1], dtype=np.float32) - - if quantization == QUANTIZATION_FP8: - for layer in config.layers: - linear_layers = [ - layer.attention.qkv, - layer.attention.dense, - layer.mlp.fc, - layer.mlp.proj, - layer.mlp.gate, - ] - for linear_layer in linear_layers: - if linear_layer: - linear_layer.activation_scaling_factor = default_scaling_factor - linear_layer.weights_scaling_factor = default_scaling_factor - config.lm_head.activation_scaling_factor = default_scaling_factor - config.lm_head.weights_scaling_factor = default_scaling_factor - - else: - assert False, f"{quantization} not supported" diff --git a/nemo/export/trt_llm/tensor_utils.py b/nemo/export/trt_llm/tensor_utils.py deleted file mode 100644 index 2fce81b91647..000000000000 --- a/nemo/export/trt_llm/tensor_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict - -import numpy as np -import tensorrt as trt -import tensorrt_llm -import torch - - -def torch_to_numpy_with_dtype(tensor, dtype=trt.float16): - """Converts a torch tensor to numpy array with the dtype.""" - if dtype == trt.float16: - torch_dtype = torch.float16 - elif dtype == trt.float32: - torch_dtype = torch.float32 - elif dtype == trt.bfloat16: - torch_dtype = torch.bfloat16 - else: - assert False, f"{dtype} not supported" - return tensorrt_llm._utils.torch_to_numpy(tensor.detach().to(torch_dtype)) - - -def split(v, tp_size, idx, dim=0): - """Splits the np tensor v on dim and return the idx's slice.""" - if tp_size == 1: - return v - if len(v.shape) == 1: - return np.ascontiguousarray(np.split(v, tp_size)[idx]) - else: - return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) - - -def get_tensor_parallel_group(tensor_parallel: int): - """Returns the tensor_parallel_group config based on tensor_parallel.""" - from mpi4py import MPI - - mpi_rank = MPI.COMM_WORLD.Get_rank() - offset = mpi_rank - mpi_rank % tensor_parallel - tp_group = list(range(offset, offset + tensor_parallel)) - return None if tensor_parallel == 1 else tp_group - - -def get_tensor_from_dict(weights_dict: Dict[str, np.ndarray], name: str) -> np.array: - """Loads tensor from the weights_dict.""" - return weights_dict.get(f"model.{name}.bin", None) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 2336b8eb38ce..30490cc91254 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -20,10 +20,7 @@ from pathlib import Path from typing import List -import tensorrt as trt import tensorrt_llm -import torch -from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._common import check_max_num_tokens from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import BuildConfig, Builder @@ -41,323 +38,6 @@ LOGGER = 
logging.getLogger("NeMo") -def get_engine_name(model, dtype, tp_size, pp_size, rank): - """Returns the engine file name based on the provided info.""" - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, pp_size, rank) - - -def serialize_engine(engine, path): - """Serializes the engine to path.""" - logger.info(f"Serializing engine to {path}...") - tik = time.time() - with open(path, "wb") as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) - logger.info(f"Engine serialized. Total time: {t}") - - -def refit_runtime_engine(params, cuda_engine): - ''' - @brief: Inplace refit one TensorRT cuda engine using weights from the network, - user should guarantee that the engine is built with REFIT flag, and the network has the same structure with the engine. - @param engine_buffer: A serialized TensorRT engine. - @param network: Network object. - @return: A serialized TRT engine if refit successfully, None otherwise - ''' - logger.info(f'Refit runtime engine') - tik = time.time() - - # Refit engine - assert params is not None - refitter = trt.Refitter(cuda_engine, logger.trt_logger) - for name, param in params: - trt_param = trt.Weights(np_dtype_to_trt(param._value.dtype), param._value.ctypes.data, param._value.size) - - if trt_param is None or not refitter.set_named_weights(name, trt_param): - logger.error(f'Failed to refit weight: {name}') - return None - - if not refitter.refit_cuda_engine(): - logger.error(f'Failed to refit engine.') - return None - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of refitting {cuda_engine.name}: {t}') - - return cuda_engine - - -def build_rank_engine( - tensorrt_llm_gpt, - builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, - args, -): - - str_dtype_to_trt(args.dtype) - ootb = os.getenv("OOTB", False) - - network = builder.create_network() - network.trt_network.name = engine_name - - # We have to use the attention plugin for most of the models. 
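For reference, the removed `get_engine_name` helper produced engine file names of the form shown below; the model name, dtype, and parallelism values are examples only.

```python
def get_engine_name(model, dtype, tp_size, pp_size, rank):
    """Engine file naming used by the removed build path (pp size is omitted when it is 1)."""
    if pp_size == 1:
        return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank)
    return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, pp_size, rank)


print(get_engine_name("NeMo", "float16", 2, 1, 0))   # NeMo_float16_tp2_rank0.engine
print(get_engine_name("NeMo", "bfloat16", 4, 2, 3))  # NeMo_bfloat16_tp4_pp2_rank3.engine
```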
- if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin(dtype=args.use_gpt_attention_plugin) - - if not ootb: - network.plugin_config.use_custom_all_reduce = False - - if args.use_gemm_plugin: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - if args.use_layernorm_plugin: - network.plugin_config.set_layernorm_plugin(dtype=args.use_layernorm_plugin) - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled_with_fp32_acc) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - else: - network.plugin_config.remove_input_padding = False - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache() - else: - network.plugin_config.paged_kv_cache = False - if args.use_ib_gpt_attention_plugin: - network.plugin_config.set_inflight_batching_gpt_attention_plugin(dtype=args.use_ib_gpt_attention_plugin) - if args.enable_multi_block_mode: - network.plugin_config.enable_mmha_multi_block_mode() - - if args.use_lora_plugin: - network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) - - if args.use_lookup_plugin: - # Use the plugin for the embedding parallelism and sharing - network.plugin_config.set_lookup_plugin(dtype=args.dtype) - else: - LOGGER.warning("Build engine in OOTB mode, disable all plugins except nccl.") - - if args.mapping.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype) - - with net_guard(network): - # Prepare - network.set_named_parameters(tensorrt_llm_gpt.named_parameters()) - - # Forward - inputs = tensorrt_llm_gpt.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_new_tokens=args.max_input_len + args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - paged_kv_cache=args.paged_kv_cache, - tokens_per_block=args.tokens_per_block, - prompt_embedding_table_size=args.max_prompt_embedding_table_size, - lora_target_modules=args.lora_target_modules, - ) - tensorrt_llm_gpt(*inputs) - - # Network -> Engine - engine = builder.build_engine(network, builder_config) - if args.mapping.rank == 0 or args.use_refit: - config_path = args.output_dir / "config.json" - builder.save_config(builder_config, config_path) - return engine - - -def _build_impl(tensorrt_llm_model, args): - torch.cuda.set_device(args.mapping.rank % args.gpus_per_node) - tensorrt_llm.logger.set_level(args.log_level) - args.output_dir.mkdir(parents=True, exist_ok=True) - timing_cache_file = args.timing_cache if args.timing_cache else args.output_dir / "model.cache" - timing_cache = timing_cache_file - - if args.use_lora_plugin is not None: - add_lora(tensorrt_llm_model, args.max_lora_rank) - - builder = Builder() - apply_query_key_layer_scaling = False - - builder_config = builder.create_builder_config( - name=MODEL_NAME, - precision=args.dtype, - timing_cache=timing_cache, - tensor_parallel=args.mapping.tp_size, - pipeline_parallel=args.mapping.pp_size, - world_size=args.mapping.tp_size * args.mapping.pp_size, - parallel_build=args.parallel_build, - num_layers=tensorrt_llm_model._num_layers, - num_heads=tensorrt_llm_model._num_heads, - num_kv_heads=tensorrt_llm_model._num_kv_heads, - head_size=tensorrt_llm_model._head_size, - hidden_size=tensorrt_llm_model._hidden_size, - vocab_size=tensorrt_llm_model._vocab_size, - 
hidden_act=tensorrt_llm_model.hidden_act, - max_position_embeddings=tensorrt_llm_model.max_position_embeddings, - add_bos=tensorrt_llm_model._add_bos, - apply_query_key_layer_scaling=apply_query_key_layer_scaling, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_beam_width=args.max_beam_width, - max_num_tokens=None, - max_draft_len=0, - int8="int8" in args.quantization, - opt_level=args.builder_opt, - paged_kv_cache=args.paged_kv_cache, - tokens_per_block=args.tokens_per_block, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - use_parallel_embedding=args.use_parallel_embedding, - embedding_sharding_dim=args.embedding_sharding_dim, - fp8="fp8" in args.quantization, - use_refit=args.use_refit, - gather_context_logits=False, - gather_generation_logits=False, - quant_mode=args.quant_mode, - lora_target_modules=args.lora_target_modules, - max_lora_rank=args.max_lora_rank, - ) - - tp_size = args.mapping.tp_size - pp_size = args.mapping.pp_size - rank = args.mapping.rank - engine_name = get_engine_name(MODEL_NAME, args.dtype, tp_size, pp_size, rank) - engine = build_rank_engine(tensorrt_llm_model, builder, builder_config, engine_name, args) - assert engine is not None, f"Failed to build engine for rank {rank}" - - serialize_engine(engine, args.output_dir / engine_name) - - if args.mapping.rank == 0: - ok = builder.save_timing_cache(builder_config, timing_cache_file) - assert ok, "Failed to save timing cache." - - -def build( - tensorrt_llm_model, - output_dir: Path, - mapping=None, - dtype="float16", - timing_cache="", - log_level="info", - max_batch_size=1, - max_input_len=200, - max_output_len=200, - max_beam_width=1, - max_prompt_embedding_table_size=0, - parallel_build=False, - gpus_per_node=1, - quantization=None, - use_inflight_batching=False, - paged_kv_cache=False, - enable_context_fmha: bool = True, - enable_multi_block_mode=False, - use_refit=False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, -): - """Builds the tensorrt_llm_model to engine.""" - args = argparse.Namespace() - args.mapping = mapping - args.dtype = dtype - args.timing_cache = timing_cache - args.log_level = log_level - args.max_batch_size = max_batch_size - args.max_input_len = max_input_len - args.max_output_len = max_output_len - args.max_beam_width = max_beam_width - args.use_gpt_attention_plugin = dtype - args.use_gemm_plugin = dtype - args.use_layernorm_plugin = False - args.parallel_build = parallel_build - args.enable_context_fmha = enable_context_fmha - args.enable_context_fmha_fp32_acc = False - args.gpus_per_node = gpus_per_node - args.builder_opt = None - args.output_dir = Path(output_dir) - args.remove_input_padding = True - args.use_smooth_quant = False - args.use_weight_only = False - args.weight_only_precision = "int8" - args.per_channel = False - args.per_token = False - args.int8_kv_cache = False - args.random_seed = None - args.paged_kv_cache = paged_kv_cache - args.max_prompt_embedding_table_size = max_prompt_embedding_table_size - args.use_inflight_batching = use_inflight_batching - args.use_ib_gpt_attention_plugin = False - args.use_parallel_embedding = False - args.embedding_sharding_dim = 0 - args.use_lookup_plugin = False - args.tokens_per_block = 64 - args.quantization = quantization - args.enable_multi_block_mode = enable_multi_block_mode - args.use_refit = use_refit - args.use_lora_plugin = use_lora_plugin - args.lora_target_modules = 
lora_target_modules - args.max_lora_rank = max_lora_rank - - logger.set_level(args.log_level) - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." - - if args.use_ib_gpt_attention_plugin: - logger.warning( - "use_ib_gpt_attention_plugin is deprecated. Use combination of" - " --use_gpt_attention_plugin=dtype --use_inflight_batching instead." - ) - - if args.use_inflight_batching: - assert args.use_gpt_attention_plugin, "You have to use GPT attention plugin for in-flight batching mode" - - if not args.paged_kv_cache: - logger.warning("Paged kv cache feature will enabled for in-flight batching mode.") - args.paged_kv_cache = True - - if not args.remove_input_padding: - logger.warning("Remove input padding feature will enabled for in-flight batching mode.") - args.remove_input_padding = True - - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only(args.weight_only_precision == "int4") - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - - if args.random_seed is not None: - torch.manual_seed(args.random_seed) - - if args.mapping.is_first_pp_rank(): - if tensorrt_llm_model._modules['vocab_embedding'].tp_size > 1: - args.use_parallel_embedding = True - args.embedding_sharding_dim = tensorrt_llm_model._modules['vocab_embedding'].sharding_dim - - tik = time.time() - _build_impl(tensorrt_llm_model, args) - - tok = time.time() - t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) - logger.info(f"Total time of building all {args.mapping.world_size} engines: {t}") - - def build_and_save_engine( max_input_len=1024, max_output_len=1024, diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py deleted file mode 100644 index f4b44552af63..000000000000 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
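The argument handling in the removed `build` wrapper couples several flags: SmoothQuant and INT8 weight-only are mutually exclusive, and in-flight batching forces both the paged KV cache and input-padding removal. A plain-Python sketch of that normalization (flag names kept for readability, no TensorRT-LLM imports) is:

```python
def normalize_build_flags(use_smooth_quant=False, use_weight_only=False,
                          use_inflight_batching=False, paged_kv_cache=False,
                          remove_input_padding=True):
    """Mirror the flag coupling enforced by the removed build() wrapper."""
    if use_smooth_quant and use_weight_only:
        raise ValueError("You cannot enable both SmoothQuant and INT8 weight-only together.")
    if use_inflight_batching:
        # In-flight batching requires the paged KV cache and removal of input padding.
        paged_kv_cache = True
        remove_input_padding = True
    return paged_kv_cache, remove_input_padding


assert normalize_build_flags(use_inflight_batching=True, paged_kv_cache=False) == (True, True)
```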
- - -from pathlib import Path -from typing import List - -import numpy as np -import torch -from tensorrt_llm import default_net, str_dtype_to_trt -from tensorrt_llm._utils import trt_dtype_to_str -from tensorrt_llm.functional import expand_mask, gather_last_token_logits, recv, send, shape -from tensorrt_llm.layers import AttentionParams, ColumnLinear, KeyValueCacheParams, LoraParams -from tensorrt_llm.models.generation_mixin import GenerationMixin -from tensorrt_llm.module import Module, ModuleList - -from nemo.export.trt_llm.decoder import build_decoder_layer -from nemo.export.trt_llm.model_config import DECODER_GEMMA, DECODER_LLAMA, ModelConfig -from nemo.export.trt_llm.quantization_utils import quantize_linear -from nemo.export.trt_llm.tensorrt_llm_build import build -from nemo.export.trt_llm.tensorrt_llm_utils import ( - build_embedding_from_config, - build_layernorm_from_config, - print_tensorrt_llm, -) - - -def get_transformer_layers(mapping, num_layers): - layers_per_pipeline_stage = num_layers // mapping.pp_size - layers_range = list( - range(mapping.pp_rank * layers_per_pipeline_stage, (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1) - ) - return layers_range - - -class ModelBuilder(Module): - def __init__(self, model_config: ModelConfig): - super().__init__() - self.quantization = model_config.quantization - self.max_position_embeddings = model_config.max_position_embeddings - self.hidden_act = model_config.hidden_act - - self._dtype = str_dtype_to_trt(model_config.dtype) - self._kv_dtype = self._dtype - self._tensor_parallel = model_config.mapping.tp_size - self._vocab_size = model_config.vocab_size - self._hidden_size = model_config.hidden_size - self._num_layers = len(model_config.layers) - self._num_heads = model_config.num_attention_heads - self._num_kv_heads = model_config.num_kv_heads - self._head_size = ( - model_config.hidden_size // model_config.num_attention_heads - if model_config.head_size is None - else model_config.head_size - ) - self._use_prompt_tuning = model_config.use_prompt_tuning - self._add_bos = model_config.layers[0].decoder_type in (DECODER_GEMMA, DECODER_LLAMA) - self._mapping = model_config.mapping - self.rank = model_config.mapping.rank - self.max_lora_rank = model_config.max_lora_rank - - if self._mapping.is_first_pp_rank(): - self.vocab_embedding = build_embedding_from_config( - model_config.vocab_embedding, - self._dtype, - use_prompt_tuning=self._use_prompt_tuning, - tensor_parallel=model_config.mapping.tp_size, - tensor_parallel_rank=model_config.mapping.tp_rank, - ) - - if model_config.positional_embedding.weight is not None: - self.positional_embedding = build_embedding_from_config( - model_config.positional_embedding, - self._dtype, - tensor_parallel=model_config.mapping.tp_size, - tensor_parallel_rank=model_config.mapping.tp_rank, - ) - - self.layers = [] - for layer_id in get_transformer_layers(self._mapping, self._num_layers): - model_config.layers[layer_id].max_lora_rank = self.max_lora_rank - self.layers.append( - build_decoder_layer( - model_config.layers[layer_id], - layer_id, - self._num_layers, - dtype=self._dtype, - quantization=model_config.quantization, - rank=self.rank, - tensor_parallel=self._tensor_parallel, - tp_group=model_config.mapping.tp_group, - ) - ) - - self.layers = ModuleList(self.layers) - - if self._mapping.is_last_pp_rank(): - self.ln_f = build_layernorm_from_config(model_config.final_layernorm, self._dtype) - - def forward( - self, - input_ids, - position_ids, - use_cache=False, - attention_mask=None, - 
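The `get_transformer_layers` helper in the deleted `tensorrt_llm_model.py` assigns each pipeline-parallel rank a contiguous block of decoder layers. A worked example of that arithmetic (ranks passed directly instead of a Mapping object, layer counts invented) is:

```python
def get_transformer_layers(pp_rank: int, pp_size: int, num_layers: int):
    """Contiguous layer ids owned by pipeline rank `pp_rank`; assumes num_layers % pp_size == 0."""
    layers_per_stage = num_layers // pp_size
    return list(range(pp_rank * layers_per_stage, (pp_rank + 1) * layers_per_stage))


# 8 decoder layers split across 2 pipeline stages:
assert get_transformer_layers(0, 2, 8) == [0, 1, 2, 3]
assert get_transformer_layers(1, 2, 8) == [4, 5, 6, 7]
```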
kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - inflight_batching_args=None, - hidden_states=None, - lora_params=None, - ): - ptuning_args = [] - if self._use_prompt_tuning: - ptuning_args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size] - - if self._mapping.is_first_pp_rank(): - x = self.vocab_embedding(input_ids, *ptuning_args) - if hasattr(self, "positional_embedding") and self.positional_embedding: - assert position_ids - x = x + self.positional_embedding(position_ids) - hidden_states = x - else: - hidden_states = recv(hidden_states, self._mapping.prev_pp_rank()) - - kv_cache_params.fill_none_tensor_list(len(self.layers)) - - if use_cache: - presents = [] - - if attention_mask is not None: - attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - - for layer_idx, (layer, past) in enumerate( - zip( - self.layers, - kv_cache_params.past_key_value, - ) - ): - - decoder_params = { - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "use_cache": use_cache, - "kv_cache_params": KeyValueCacheParams( - past_key_value=[past], - host_past_key_value_lengths=kv_cache_params.host_past_key_value_lengths, - kv_cache_block_pointers=kv_cache_params.kv_cache_block_pointers, - host_max_attention_window_sizes=kv_cache_params.host_max_attention_window_sizes, - cache_indirection=kv_cache_params.cache_indirection, - host_sink_token_length=kv_cache_params.host_sink_token_length, - host_kv_cache_block_pointers=kv_cache_params.host_kv_cache_block_pointers, - ), - "attention_params": attention_params, - } - - if lora_params.lora_ranks is not None: - decoder_params["lora_layer_params"] = lora_params.get_layer_params(layer_idx) - - hidden_states = layer(**decoder_params) - - if use_cache: - presents.append(hidden_states[1]) - hidden_states = hidden_states[0] - - if self._mapping.is_last_pp_rank(): - hidden_states = self.ln_f(hidden_states) - else: - hidden_states = send(hidden_states, self._mapping.next_pp_rank()) - - if use_cache: - return hidden_states, tuple(presents) - return hidden_states - - -class LMHeadModelBuilder(ModelBuilder, GenerationMixin): - def __init__(self, model_config: ModelConfig): - super().__init__(model_config) - - if self._mapping.is_last_pp_rank(): - self.lm_head = ColumnLinear( - self._hidden_size, - model_config.vocab_size_padded, - bias=False, - dtype=self._dtype, - tp_group=self._mapping.tp_group, - tp_size=self._tensor_parallel, - gather_output=True, - share_weight=None, - ) - self.lm_head.weight.value = model_config.lm_head.weight - if model_config.quantization: - self.lm_head = quantize_linear(self.lm_head, model_config.quantization, model_config.lm_head) - - def forward( - self, - input_ids, - position_ids, - use_cache=False, - last_token_ids=None, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - inflight_batching_args=None, - hidden_states=None, - lora_params=None, - ): - - hidden_states = super().forward( - input_ids, - position_ids, - use_cache, - attention_mask, - kv_cache_params, - attention_params, - prompt_embedding_table, - prompt_tasks, - prompt_vocab_size, - inflight_batching_args, - hidden_states, - lora_params, - ) - - if use_cache: - hidden_states, presents = hidden_states - - if self._mapping.is_last_pp_rank(): - assert last_token_ids is not None, "Expecting last token ids to be not None" - hidden_states = gather_last_token_logits( - 
hidden_states, last_token_ids, default_net().plugin_config.remove_input_padding - ) - - # [batch_size, hidden_size] -> [batch_size, vocab_size] - lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output("logits", str_dtype_to_trt("float16")) - else: - hidden_states.mark_output('hidden_states_output', self._dtype) - - if use_cache: - if not default_net().plugin_config.paged_kv_cache: - for i, present in zip(self._mapping.pp_layers(self._num_layers), presents): - present.mark_output(f'present_key_value_{i}', self._kv_dtype) - if self._mapping.is_last_pp_rank(): - return (lm_logits, presents) - return (hidden_states, presents) - else: - if self._mapping.is_last_pp_rank(): - return lm_logits - return hidden_states - - def prepare_inputs( - self, - max_batch_size, - max_input_len, - max_new_tokens, - use_cache=True, - max_beam_width: int = 1, - paged_kv_cache: bool = False, - tokens_per_block: int = 64, - prompt_embedding_table_size: int = 0, - lora_target_modules: List[str] = None, - ): - - # Prepare inputs - head_size = self._head_size - num_heads_kv = self._num_kv_heads - remove_input_padding = default_net().plugin_config.remove_input_padding - use_gpt_attention_plugin = default_net().plugin_config.gpt_attention_plugin - use_gemm_plugin = default_net().plugin_config.gemm_plugin - use_custom_all_reduce = default_net().plugin_config.use_custom_all_reduce - use_lora_plugin = default_net().plugin_config.lora_plugin - - model_inputs = self.prepare_basic_inputs( - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - max_input_len=max_input_len, - max_seq_len=max_new_tokens, - num_kv_heads=num_heads_kv, - head_size=head_size, - num_layers=self._num_layers, - kv_dtype=self._kv_dtype, - remove_input_padding=remove_input_padding, - use_gpt_attention_plugin=use_gpt_attention_plugin, - use_gemm_plugin=use_gemm_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - gather_context_logits=False, - gather_generation_logits=False, - dtype=self._dtype, - num_heads=self._num_heads, - mapping=self._mapping, - max_num_tokens=None, - prompt_embedding_table_size=prompt_embedding_table_size, - position_encoding_2d=False, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_draft_len=0, - use_custom_all_reduce=use_custom_all_reduce, - ) - - inflight_batching_args = None - - return ( - model_inputs["input_ids"], - model_inputs["position_ids"], - use_cache, - model_inputs["last_token_ids"], - model_inputs["attention_mask"], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs['host_past_key_value_lengths'], - host_max_attention_window_sizes=model_inputs['host_max_attention_window_sizes'], - kv_cache_block_pointers=model_inputs['kv_cache_block_pointers'], - host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers'], - cache_indirection=model_inputs['cache_indirection'], - host_sink_token_length=model_inputs['host_sink_token_length'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - ), - model_inputs['prompt_embedding_table'], - model_inputs['tasks'], - model_inputs['prompt_vocab_size'], - inflight_batching_args, - model_inputs["hidden_states_input"], - LoraParams( - model_inputs['lora_ranks'], - model_inputs['lora_weights_pointers'], - 
host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - ), - ) - - def build( - self, - output_dir: Path, - timing_cache: str = "", - log_level: str = "info", - max_batch_size: int = 1, - max_input_len: int = 200, - max_output_len: int = 200, - max_beam_width: int = 1, - parallel_build: bool = False, - max_prompt_embedding_table_size: int = 0, - use_inflight_batching: bool = False, - paged_kv_cache: bool = False, - enable_context_fmha: bool = True, - enable_multi_block_mode: bool = False, - use_refit: bool = False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, - ): - - if self.rank > torch.cuda.device_count(): - print(f"warning: Rank {self.rank} larger than GPUs available ({torch.cuda.device_count()})") - - build( - tensorrt_llm_model=self, - output_dir=output_dir, - mapping=self._mapping, - dtype=trt_dtype_to_str(self._dtype), - timing_cache=timing_cache, - log_level=log_level, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_beam_width=max_beam_width, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - parallel_build=parallel_build, - gpus_per_node=torch.cuda.device_count(), - quantization=self.quantization, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_refit=use_refit, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) - - def print(self): - np.set_printoptions(threshold=36) - print_tensorrt_llm(f"rank.{self.rank}", self) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 1bdfd5237caf..f79d6ddce4bc 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -25,16 +25,11 @@ import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group -from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder - -from nemo.export.trt_llm.tensorrt_llm_build import get_engine_name, MODEL_NAME, refit_runtime_engine # isort:skip from nemo.export.trt_llm.nemo_utils import to_word_list_format # isort:skip @@ -330,110 +325,6 @@ def load( ) -def load_refit( - tokenizer, - engine_dir: str, - lora_ckpt_list: List[str] = None, - num_beams: int = 1, - model_configs: List = None, - stream=None, -) -> TensorrtLLMHostContext: - """Loaded the compiled LLM model and run it. - - It also supports running the TRT LLM model on multi-GPU. 
- """ - - config_path = os.path.join(engine_dir, "config.json") - with open(config_path, "r") as f: - config = json.load(f) - """The impl of `load` API for on a single GPU worker.""" - tensorrt_llm.logger.set_level("error") - - engine_dir = Path(engine_dir) - config_path = engine_dir / "config.json" - - ( - model_config, - world_size, - tensor_parallel_size, - pipeline_parallel_size, - dtype, - max_input_len, - max_batch_size, - ) = _read_config(config_path) - - runtime_rank = torch.cuda.current_device() - assert runtime_rank < torch.cuda.device_count(), f"Rank {runtime_rank} out of bound" - - # Manipulate the tensorrt_llm mapping to make it compatible with the multiprocessed env. - assert tensorrt_llm.mpi_world_size() == torch.distributed.get_world_size(), "MPI world size mismatch" - runtime_mapping = tensorrt_llm.Mapping( - world_size=tensorrt_llm.mpi_world_size(), - rank=runtime_rank, - tp_size=tensorrt_llm.mpi_world_size(), - pp_size=1, - ) - - engine_name = get_engine_name( - MODEL_NAME, dtype, tensor_parallel_size, pipeline_parallel_size, tensorrt_llm.mpi_rank() - ) - - logger.info(f"Loading engine: Rank ({tensorrt_llm.mpi_rank()} -> {engine_dir}/{engine_name}") - - serialize_path = os.path.join(engine_dir, engine_name) - with open(serialize_path, "rb") as f: - engine_buffer = f.read() - - decoder = tensorrt_llm.runtime.GenerationSession( - model_config, engine_buffer, runtime_mapping, debug_mode=False, stream=stream - ) - runtime_mapping.rank = runtime_rank - runtime_mapping.tp_group = get_tensor_parallel_group( - tensor_parallel_size - ) # Override the tp_group to support TP+DP - runtime_mapping.tp_rank = runtime_rank - runtime_mapping.tp_size = tensor_parallel_size - runtime_mapping.pp_group = [runtime_rank] - runtime_mapping.pp_rank = 0 - - sampling_config = SamplingConfig(end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams) - - if decoder.use_lora_plugin: - lora_manager = LoraManager() - if lora_ckpt_list is not None: - lora_manager.load_from_nemo( - model_files=lora_ckpt_list, - model_config=model_config, - runtime_mapping=runtime_mapping, - ) - else: - lora_manager = None - - # create a new builder and refit the current engine - new_builder = LMHeadModelBuilder(model_configs[0]) - engine = decoder.runtime.engine - refit_runtime_engine(new_builder.named_parameters(), engine) - - # Initialize the global context so it can be used during `run` API. - global tensorrt_llm_worker_context - tensorrt_llm_worker_context.decoder = decoder - tensorrt_llm_worker_context.sampling_config = sampling_config - tensorrt_llm_worker_context.lora_manager = lora_manager - tensorrt_llm_worker_context.max_batch_size = max_batch_size - tensorrt_llm_worker_context.max_input_len = max_input_len - - max_batch_size = config["builder_config"]["max_batch_size"] - max_input_len = config["builder_config"]["max_input_len"] - - return TensorrtLLMHostContext( - executor=None, - world_size=world_size, - tokenizer=tokenizer, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - ) - - def forward( input_tensors: List[torch.IntTensor], max_output_len: int, diff --git a/nemo/export/trt_llm/tensorrt_llm_utils.py b/nemo/export/trt_llm/tensorrt_llm_utils.py deleted file mode 100644 index b732daca2525..000000000000 --- a/nemo/export/trt_llm/tensorrt_llm_utils.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -import tensorrt as trt -from tensorrt_llm.layers import Embedding, LayerNorm, PromptTuningEmbedding, RmsNorm -from tensorrt_llm.module import Module - -from nemo.export.trt_llm.model_config import LAYERNORM_DEFAULT, LAYERNORM_RMS, EmbeddingConfig, LayernormConfig -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group - -LOGGER = logging.getLogger("NeMo") - - -def build_embedding_from_config( - config: EmbeddingConfig, - dtype: trt.DataType, - tensor_parallel: int = 1, - tensor_parallel_rank: int = 0, - use_prompt_tuning: bool = False, -): - """Returns the tensorrt_llm embedding layer from the embedding config.""" - # If the config is empty, return an empty impl. - if config is None: - return None - EmbeddingCls = PromptTuningEmbedding if use_prompt_tuning else Embedding - - trt_embedding = EmbeddingCls( - config.weight.shape[0] * tensor_parallel, - config.weight.shape[1], - dtype=dtype, - tp_size=tensor_parallel, - tp_rank=tensor_parallel_rank, - tp_group=get_tensor_parallel_group(tensor_parallel), - ) - trt_embedding.weight.value = config.weight - return trt_embedding - - -def build_layernorm_from_config(config: LayernormConfig, dtype: trt.DataType): - """Returns the tensorrt_llm layernorm layer from the torch layernorm.""" - # If the config is empty, return an empty impl. - if config is None: - return None - - if config.layernorm_type == LAYERNORM_DEFAULT: - trt_layernorm = LayerNorm(normalized_shape=config.weight.shape[0], dtype=dtype) - trt_layernorm.weight.value = config.weight - trt_layernorm.bias.value = config.bias - elif config.layernorm_type == LAYERNORM_RMS: - trt_layernorm = RmsNorm(normalized_shape=config.weight.shape[0], dtype=dtype) - trt_layernorm.weight.value = config.weight - else: - raise NotImplementedError(f"{config.layernorm_type} not supported") - return trt_layernorm - - -def print_tensorrt_llm(name: str, tensorrt_llm_module: Module): - """Prints the tensorrt llm structure including weights and related data for debugging purpose.""" - for tensor_name in [ - "weight", - "bias", - "activation_scaling_factor", - "weights_scaling_factor", - "prequant_scaling_factor", - ]: - if hasattr(tensorrt_llm_module, tensor_name): - tensor = getattr(tensorrt_llm_module, tensor_name) - if tensor is not None: - LOGGER.info(f"{name}.{tensor_name}:{tensor._value.dtype}:{tensor._value.shape}:\n{tensor._value}") - - for k, v in tensorrt_llm_module.named_children(): - print_tensorrt_llm(f"{name}.{k}({v._get_name()})", v) diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py deleted file mode 100644 index 0f9fb66313b9..000000000000 --- a/nemo/export/trt_llm/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import pathlib -import numpy as np -import torch - -log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" -logging.basicConfig(format=log_format) -LOGGER = logging.getLogger("NeMo") - -# numpy doesn't know bfloat16, define abstract binary type instead -np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) - - -def prompt_convert(prompt_config, prompt_weights): - if "task_templates" in prompt_config: - prompt_templates = prompt_config["task_templates"] - actual_task_id = 0 - vtokens_embeddings = [] - vtokens_len = [] - for task_name_id, prompt_task in enumerate(prompt_templates): - prompt_task_name = prompt_task["taskname"] - LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") - prompt_task_weights = prompt_weights["prompt_table"].get( - f"prompt_table.{prompt_task_name}.prompt_embeddings.weight" - ) - if prompt_task_weights is None: - continue - vtokens_embeddings.append(prompt_task_weights) - vtokens_len.append(prompt_task_weights.shape[0]) - actual_task_id += 1 - - max_vtoken_len = max(vtokens_len) - embedding_dim = vtokens_embeddings[0].shape[1] - - # pad tasks to longest task embedding table - for i, vtoken_emb_table in enumerate(vtokens_embeddings): - padded_table = torch.zeros((max_vtoken_len, embedding_dim)) - padded_table[: vtoken_emb_table.shape[0], :] = vtoken_emb_table - vtokens_embeddings[i] = padded_table - - vtokens_embeddings = torch.stack(vtokens_embeddings) - else: - vtokens_embeddings = prompt_weights["prompt_embeddings_weights"] - - return vtokens_embeddings - - -def cpu_map_location(storage, loc): - return storage.cpu() - - -def is_nemo_file(path): - flag = False - - if path is not None: - if len(path) > 5: - pc = pathlib.Path(path) - if pc.exists(): - if pc.is_file(): - if path[-5 : len(path)] == ".nemo": - flag = True - - return flag From 4a263e7f257d7e04f8e5d71756abeb4d9f4cfc60 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 14:05:18 -0700 Subject: [PATCH 39/47] add large model stable training fix and contrastive loss update for variable seq (#9259) (#9348) * add stable training fix and contrastive loss update for variable seq length input * Apply isort and black reformatting * replace remove_bias with use_bias --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Co-authored-by: nithinraok --- .../fastconformer_ctc_bpe_streaming.yaml | 1 + .../fastconformer_ctc_char_streaming.yaml | 1 + ...astconformer_transducer_bpe_streaming.yaml | 1 + ...stconformer_transducer_char_streaming.yaml | 1 + ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 1 + ..._hybrid_transducer_ctc_char_streaming.yaml | 1 + ...stconformer_hybrid_transducer_ctc_bpe.yaml | 1 + ...tconformer_hybrid_transducer_ctc_char.yaml | 1 + .../fast-conformer-long_ctc_bpe.yaml | 1 + .../fast-conformer-long_transducer_bpe.yaml | 1 + .../ssl/fastconformer/fast-conformer.yaml | 1 + .../asr/losses/ssl_losses/contrastive.py | 23 +++--- .../asr/modules/conformer_encoder.py | 4 ++ .../asr/parts/submodules/conformer_modules.py | 71 
++++++++++++++----- .../parts/submodules/multi_head_attention.py | 41 +++++++---- 15 files changed, 110 insertions(+), 40 deletions(-) diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml index a59a2628cd2f..acb499f18ffb 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -80,6 +80,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml index 8f8f7e40e39a..8dd978bb00e4 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -78,6 +78,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml index 69b21b496ddd..9f199c2dd488 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -85,6 +85,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml index 8fd096525e74..c7f83216aa0b 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -84,6 +84,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 
b0965b580d5b..6f356ce91caa 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -90,6 +90,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml index 9c144d22edec..870bb0190c03 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -88,6 +88,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml index 69e4546b77a7..3fc91cc1e436 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml @@ -87,6 +87,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml index ea98d13e62da..e99ba69df57a 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml @@ -85,6 +85,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index 2fab24fa6373..3e3d2bf6788e 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -88,6 +88,7 @@ model: feat_out: 
-1 # you may set it if you need different output size other than the default d_model n_layers: 18 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index 4d5f4dbcbd06..5f6c37288ae9 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -90,6 +90,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml index 47ad5aa458ca..6e7b5e107629 100644 --- a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml +++ b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml @@ -79,6 +79,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/nemo/collections/asr/losses/ssl_losses/contrastive.py b/nemo/collections/asr/losses/ssl_losses/contrastive.py index bab691913c0a..16a70925ac9b 100644 --- a/nemo/collections/asr/losses/ssl_losses/contrastive.py +++ b/nemo/collections/asr/losses/ssl_losses/contrastive.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from math import ceil + import torch import torch.nn.functional as F from torch import nn @@ -25,8 +27,7 @@ class ContrastiveLoss(Loss): @property def input_types(self): - """Input types definitions for Contrastive. 
- """ + """Input types definitions for Contrastive.""" return { "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()), "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), @@ -147,13 +148,17 @@ def sample_negatives(self, y, num): @typecheck() def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None): - spec_in = spectrograms.transpose(-2, -1) + targets = spectrograms.transpose(-2, -1) masks = spec_masks.transpose(-2, -1) - targets = spec_in # BxTxC + diff = int(ceil(targets.shape[1] / decoder_outputs.shape[1]) * decoder_outputs.shape[1]) - targets.shape[1] + + if diff > 0: + targets = F.pad(targets, (0, 0, 0, diff)) + masks = F.pad(masks, (0, 0, 0, diff)) - targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1) - masks = masks.reshape(targets.shape[0], targets.shape[1], -1) + targets = targets.reshape(targets.shape[0], decoder_outputs.shape[1], -1) + masks = masks.reshape(targets.shape[0], decoder_outputs.shape[1], -1) if self.quantized_targets: if self.store_ids: @@ -198,7 +203,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non if self.sample_from_non_masked: # sample from all steps in utterance negatives, _ = self.sample_negatives( - targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T' + targets.transpose(0, 1), + targets_masked_only.size(0), # TxBxC # T' ) else: # only sample from masked steps in utterance @@ -239,7 +245,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non elif self.sample_from_non_masked: # sample from all steps in batch negatives, _ = self.sample_negatives( - targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC + targets.reshape(targets.shape[0] * targets.shape[1], -1), + targets_masked_only.size(0), # BTxC ) # T' else: # only sample from masked steps diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index d0e014e42a37..d8f0e58833f7 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -118,6 +118,8 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to None. conv_dual_mode (bool): specifies if convolution should be dual mode when dual_offline mode is being used. When enables, the left half of the convolution kernel would get masked in streaming cases. Defaults to False + use_bias (bool): Use bias in all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. dropout (float): the dropout rate used in all layers except the attention layers Defaults to 0.1. 
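The contrastive-loss change above pads the spectrogram targets along time so they can be reshaped into exactly as many steps as the decoder produced, which is what makes variable-length inputs work. A worked example of the padding arithmetic, with invented shapes, is sketched below:

```python
from math import ceil

import torch
import torch.nn.functional as F

batch, time_in, feat = 2, 101, 80
decoder_steps = 25  # stands in for decoder_outputs.shape[1] in the loss above

targets = torch.randn(batch, time_in, feat)

# Pad time up to the next multiple of the decoder length: ceil(101 / 25) * 25 = 125.
diff = int(ceil(targets.shape[1] / decoder_steps) * decoder_steps) - targets.shape[1]
if diff > 0:
    targets = F.pad(targets, (0, 0, 0, diff))  # right-pad the time dimension

# Each decoder step now owns an equal slice of (125 // 25) * 80 = 400 target features.
targets = targets.reshape(batch, decoder_steps, -1)
assert targets.shape == (2, 25, 400)
```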
dropout_pre_encoder (float): the dropout rate used before the encoder @@ -282,6 +284,7 @@ def __init__( conv_kernel_size=31, conv_norm_type='batch_norm', conv_context_size=None, + use_bias=True, dropout=0.1, dropout_pre_encoder=0.1, dropout_emb=0.1, @@ -426,6 +429,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, att_context_size=self.att_context_size, + use_bias=use_bias, ) self.layers.append(layer) diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index aed6cc16245c..efd23ef44628 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -56,6 +56,8 @@ class ConformerLayer(torch.nn.Module, AdapterModuleMixin, AccessMixin): conv_kernel_size (int): kernel size for depthwise convolution in convolution module dropout (float): dropout probabilities for linear layers dropout_att (float): dropout probabilities for attention distributions + use_bias (bool): Apply bias to all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. """ def __init__( @@ -75,6 +77,7 @@ def __init__( pos_bias_u=None, pos_bias_v=None, att_context_size=[-1, -1], + use_bias=True, ): super(ConformerLayer, self).__init__() @@ -84,7 +87,7 @@ def __init__( # first feed forward module self.norm_feed_forward1 = LayerNorm(d_model) - self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) # convolution module self.norm_conv = LayerNorm(d_model) @@ -93,6 +96,7 @@ def __init__( kernel_size=conv_kernel_size, norm_type=conv_norm_type, conv_context_size=conv_context_size, + use_bias=use_bias, ) # multi-headed self-attention module @@ -107,6 +111,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) elif self_attention_model == 'rel_pos_local_attn': self.self_attn = RelPositionMultiHeadAttentionLongformer( @@ -120,10 +125,15 @@ def __init__( global_tokens=global_tokens, global_tokens_spacing=global_tokens_spacing, global_attn_separate=global_attn_separate, + use_bias=use_bias, ) elif self_attention_model == 'abs_pos': self.self_attn = MultiHeadAttention( - n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att, max_cache_len=MHA_max_cache_len + n_head=n_heads, + n_feat=d_model, + dropout_rate=dropout_att, + max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) else: raise ValueError( @@ -133,7 +143,7 @@ def __init__( # second feed forward module self.norm_feed_forward2 = LayerNorm(d_model) - self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) self.dropout = nn.Dropout(dropout) self.norm_out = LayerNorm(d_model) @@ -280,16 +290,25 @@ class ConformerConvolution(nn.Module): pointwise_activation (str): name of the activation function to be used for the pointwise conv. Note that Conformer uses a special key `glu_` which is treated as the original default from the paper. + use_bias (bool): Use bias in all Linear and Conv1d layers improve activation flow and stabilize training of huge models. 
+ Defaults to True """ def __init__( - self, d_model, kernel_size, norm_type='batch_norm', conv_context_size=None, pointwise_activation='glu_' + self, + d_model, + kernel_size, + norm_type='batch_norm', + conv_context_size=None, + pointwise_activation='glu_', + use_bias=True, ): super(ConformerConvolution, self).__init__() assert (kernel_size - 1) % 2 == 0 self.d_model = d_model self.kernel_size = kernel_size self.norm_type = norm_type + self.use_bias = use_bias if conv_context_size is None: conv_context_size = (kernel_size - 1) // 2 @@ -305,7 +324,12 @@ def __init__( dw_conv_input_dim = d_model self.pointwise_conv1 = nn.Conv1d( - in_channels=d_model, out_channels=d_model * 2, kernel_size=1, stride=1, padding=0, bias=True + in_channels=d_model, + out_channels=d_model * 2, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) self.depthwise_conv = CausalConv1D( @@ -315,7 +339,7 @@ def __init__( stride=1, padding=conv_context_size, groups=dw_conv_input_dim, - bias=True, + bias=self.use_bias, ) if norm_type == 'batch_norm': @@ -334,7 +358,12 @@ def __init__( self.activation = Swish() self.pointwise_conv2 = nn.Conv1d( - in_channels=dw_conv_input_dim, out_channels=d_model, kernel_size=1, stride=1, padding=0, bias=True + in_channels=dw_conv_input_dim, + out_channels=d_model, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) def forward(self, x, pad_mask=None, cache=None): @@ -370,31 +399,34 @@ def forward(self, x, pad_mask=None, cache=None): return x, cache def reset_parameters_conv(self): - pw1_max = pw2_max = self.d_model ** -0.5 - dw_max = self.kernel_size ** -0.5 + pw1_max = pw2_max = self.d_model**-0.5 + dw_max = self.kernel_size**-0.5 with torch.no_grad(): nn.init.uniform_(self.pointwise_conv1.weight, -pw1_max, pw1_max) - nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) nn.init.uniform_(self.pointwise_conv2.weight, -pw2_max, pw2_max) - nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) nn.init.uniform_(self.depthwise_conv.weight, -dw_max, dw_max) - nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) + if self.use_bias: + nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) + nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) + nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) class ConformerFeedForward(nn.Module): """ feed-forward module of Conformer model. + use_bias (bool): Apply bias to all Linear and Conv1d layers improve activation flow and stabilize training of huge models. 
""" - def __init__(self, d_model, d_ff, dropout, activation=Swish()): + def __init__(self, d_model, d_ff, dropout, activation=Swish(), use_bias=True): super(ConformerFeedForward, self).__init__() self.d_model = d_model self.d_ff = d_ff - self.linear1 = nn.Linear(d_model, d_ff) + self.use_bias = use_bias + self.linear1 = nn.Linear(d_model, d_ff, bias=self.use_bias) self.activation = activation self.dropout = nn.Dropout(p=dropout) - self.linear2 = nn.Linear(d_ff, d_model) + self.linear2 = nn.Linear(d_ff, d_model, bias=self.use_bias) def forward(self, x): x = self.linear1(x) @@ -404,10 +436,11 @@ def forward(self, x): return x def reset_parameters_ff(self): - ffn1_max = self.d_model ** -0.5 - ffn2_max = self.d_ff ** -0.5 + ffn1_max = self.d_model**-0.5 + ffn2_max = self.d_ff**-0.5 with torch.no_grad(): nn.init.uniform_(self.linear1.weight, -ffn1_max, ffn1_max) - nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) nn.init.uniform_(self.linear2.weight, -ffn2_max, ffn2_max) - nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) + if self.use_bias: + nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) + nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index 6a866a617f35..19d713405953 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -55,21 +55,23 @@ class MultiHeadAttention(nn.Module): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to remove bias in linear and conv layers """ - def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0, use_bias=True): """Construct an MultiHeadedAttention object.""" super(MultiHeadAttention, self).__init__() self.cache_drop_size = None + self.use_bias = use_bias assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head self.s_d_k = math.sqrt(self.d_k) self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_v = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_out = nn.Linear(n_feat, n_feat, bias=use_bias) self.dropout = nn.Dropout(p=dropout_rate) self._max_cache_len = max_cache_len @@ -161,11 +163,18 @@ class RelPositionMultiHeadAttention(MultiHeadAttention): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ - def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0, use_bias=True): """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=max_cache_len) + super().__init__( + n_head=n_head, + n_feat=n_feat, + dropout_rate=dropout_rate, + max_cache_len=max_cache_len, + use_bias=use_bias, + ) # linear transformation for positional encoding self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) # these two 
learnable biases are used in matrix c and matrix d @@ -253,7 +262,7 @@ def forward(self, query, key, value, mask, pos_emb, cache=None): class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): """Multi-Head Attention layer of Transformer-XL with sliding window local+global attention from Longformer. Partially adapted from allenai (https://github.com/allenai/longformer/blob/master/longformer/sliding_chunks.py) - and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) + and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) Paper: https://arxiv.org/abs/1901.02860 (Transformer-XL), https://arxiv.org/abs/2004.05150 (Longformer) Args: @@ -267,6 +276,7 @@ class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): global_tokens (int): number of tokens to be used for global attention global_tokens_spacing (int): how far apart the global tokens are global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ def __init__( @@ -281,6 +291,7 @@ def __init__( global_tokens=0, global_tokens_spacing=1, global_attn_separate=False, + use_bias=True, ): """Construct an RelPositionMultiHeadAttentionLongformer object.""" super().__init__( @@ -290,6 +301,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=max_cache_len, + use_bias=use_bias, ) self.att_context_size = att_context_size self.global_tokens = global_tokens @@ -297,9 +309,9 @@ def __init__( self.global_attn_separate = global_attn_separate if self.global_attn_separate: - self.global_q = nn.Linear(n_feat, n_feat) - self.global_k = nn.Linear(n_feat, n_feat) - self.global_v = nn.Linear(n_feat, n_feat) + self.global_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_v = nn.Linear(n_feat, n_feat, bias=use_bias) def forward(self, query, key, value, pad_mask, pos_emb, cache=None): """Compute Scaled Dot Product Local Attention with rel. positional encoding. 
using overlapping chunks @@ -650,7 +662,8 @@ def _compute_out_global_to_all( global_attn_scores = global_attn_scores.transpose(1, 2) global_attn_scores = global_attn_scores.masked_fill( - is_index_masked.transpose(2, 3), torch.finfo(global_attn_scores.dtype).min, + is_index_masked.transpose(2, 3), + torch.finfo(global_attn_scores.dtype).min, ) global_attn_scores = global_attn_scores.view(batch_size * self.h, max_num_global_attn_indices, seq_len) @@ -747,7 +760,9 @@ def _get_invalid_locations_mask(self, w: int, device: str): return mask.bool().to(device), ending_mask def mask_invalid_locations( - self, input_tensor: torch.Tensor, w: int, + self, + input_tensor: torch.Tensor, + w: int, ): """ Mask locations invalid for the sliding window attention From 006bd7f0614f963aea09cee4ffcff25afa8dd0db Mon Sep 17 00:00:00 2001 From: jgerh <163925524+jgerh@users.noreply.github.com> Date: Fri, 31 May 2024 14:35:15 -0700 Subject: [PATCH 40/47] Nemo readme revisions (#9129) * REvisions to NeMo ReadMe * NeMo Readme.rst revisions * Update README.rst Co-authored-by: Eric Harper Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> * ReadMe updates * ReadMe Updates * Updates to NeMo Readme with new license information * NeMo Framework ReadMe Revisions Updates Signed-off-by: Jennifer Gerhold * NeMo Framework ReadMe Revisions 2 Signed-off-by: Jennifer Gerhold --------- Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Eric Harper Signed-off-by: Jennifer Gerhold Co-authored-by: Eric Harper --- README.rst | 287 ++++++++++++++++++++++++++--------------------------- 1 file changed, 143 insertions(+), 144 deletions(-) diff --git a/README.rst b/README.rst index 121c82b8590f..4a68acc286cd 100644 --- a/README.rst +++ b/README.rst @@ -108,57 +108,51 @@ Latest News Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and PyTorch developers -working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), -and text-to-speech synthesis (TTS). -The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia -to more easily implement and design new generative AI models by being able to leverage existing code and pretrained models. +NVIDIA NeMo Framework is a scalable and cloud-native generative AI framework built for researchers and PyTorch developers working on Large Language Models (LLMs), Multimodal Models (MMs), Automatic Speech Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV) domains. It is designed to help you efficiently create, customize, and deploy new generative AI models by leveraging existing code and pre-trained model checkpoints. For technical documentation, please see the `NeMo Framework User Guide `_. -All NeMo models are trained with `Lightning `_ and -training is automatically scalable to 1000s of GPUs. +LLMs and MMs Training, Alignment, and Customization +################################################### -When applicable, NeMo models take advantage of the latest possible distributed training techniques, -including parallelism strategies such as +All NeMo models are trained with `Lightning `_. +Training is automatically scalable to 1000s of GPUs. 
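As a rough sketch of what this scaling looks like in practice (illustrative values only; the exact arguments come from each example script's YAML config and your cluster setup), a Lightning ``Trainer`` is typically configured along these lines:

.. code-block:: python

    import pytorch_lightning as pl

    # Illustrative: 4 nodes x 8 GPUs with bf16 mixed precision.
    # NeMo example scripts build an equivalent Trainer from their YAML configs.
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=8,
        num_nodes=4,
        strategy="ddp",
        precision="bf16-mixed",
    )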
-* data parallelism -* tensor parallelism -* pipeline model parallelism -* fully sharded data parallelism (FSDP) -* sequence parallelism -* context parallelism -* mixture-of-experts (MoE) +When applicable, NeMo models leverage cutting-edge distributed training techniques, incorporating `parallelism strategies `_ to enable efficient training of very large models. These techniques include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed Precision Training with BFloat16 and FP8, as well as others. -and mixed precision training recipes with bfloat16 and FP8 training. +NeMo Transformer-based LLMs and MMs utilize `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs, while leveraging `NVIDIA Megatron Core `_ for scaling Transformer model training. -NeMo's Transformer based LLM and Multimodal models leverage `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs -and leverages `NVIDIA Megatron Core `_ for scaling transformer model training. +NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM, Direct Preference Optimization (DPO), and Reinforcement Learning from Human Feedback (RLHF). See `NVIDIA NeMo Aligner `_ for more information. -NeMo LLMs can be aligned with state of the art methods such as SteerLM, DPO and Reinforcement Learning from Human Feedback (RLHF), -see `NVIDIA NeMo Aligner `_ for more details. +In addition to supervised fine-tuning (SFT), NeMo also supports the latest parameter efficient fine-tuning (PEFT) techniques such as LoRA, P-Tuning, Adapters, and IA3. Refer to the `NeMo Framework User Guide `_ for the full list of supported models and techniques. -NeMo LLM and Multimodal models can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_. +LLMs and MMs Deployment and Optimization +######################################## -NeMo ASR and TTS models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. +NeMo LLMs and MMs can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_, in short, NIMs. -For scaling NeMo LLM and Multimodal training on Slurm clusters or public clouds, please see the `NVIDIA Framework Launcher `_. -The NeMo Framework launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and Multimodal models and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. -To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_ -The NeMo Framework Launcher does not currently support ASR and TTS training but will soon. +NeMo ASR and TTS models can be optimized for inference and deployed for production use cases with `NVIDIA Riva `_. -Getting started with NeMo is simple. -State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and +NeMo Framework Launcher +####################### + +`NeMo Framework Launcher `_ is a cloud-native tool that streamlines the NeMo Framework experience. It is used for launching end-to-end NeMo Framework training jobs on CSPs and Slurm clusters. + +The NeMo Framework Launcher includes extensive recipes, scripts, utilities, and documentation for training NeMo LLMs. It also includes the NeMo Framework `Autoconfigurator `_, which is designed to find the optimal model parallel configuration for training on a specific cluster. 
+ +To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_. The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. + +Get Started with NeMo Framework +------------------------------- + +Getting started with NeMo Framework is easy. State-of-the-art pretrained NeMo models are freely available on `Hugging Face Hub `_ and `NVIDIA NGC `_. These models can be used to generate text or images, transcribe audio, and synthesize speech in just a few lines of code. We have extensive `tutorials `_ that -can be run on `Google Colab `_ or with our `NGC NeMo Framework Container. `_ -and we have `playbooks `_ for users that want to train NeMo models with the NeMo Framework Launcher. +can be run on `Google Colab `_ or with our `NGC NeMo Framework Container `_. We also have `playbooks `_ for users who want to train NeMo models with the NeMo Framework Launcher. -For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. +For advanced users who want to train NeMo models from scratch or fine-tune existing NeMo models, we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. Key Features ------------ @@ -172,9 +166,9 @@ Key Features Requirements ------------ -1) Python 3.10 or above -2) Pytorch 1.13.1 or above -3) NVIDIA GPU, if you intend to do model training +* Python 3.10 or above +* Pytorch 1.13.1 or above +* NVIDIA GPU (if you intend to do model training) Developer Documentation ----------------------- @@ -197,54 +191,48 @@ Developer Documentation | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. `_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ - -Getting help with NeMo +Install NeMo Framework ---------------------- -FAQ can be found on NeMo's `Discussions board `_. You are welcome to ask questions or start discussions there. - - -Installation ------------- The NeMo Framework can be installed in a variety of ways, depending on your needs. Depending on the domain, you may find one of the following installation methods more suitable. -* Conda / Pip - Refer to the `Conda <#conda>`_ and `Pip <#pip>`_ sections for installation instructions. +* Conda / Pip - Refer to `Conda <#conda>`_ and `Pip <#pip>`_ for installation instructions. - * This is recommended for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. - * When using a Nvidia PyTorch container as the base, this is the recommended installation method for all domains. + * This is the recommended method for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. + * When using a Nvidia PyTorch container as the base, this is the recommended method for all domains. -* Docker Containers - Refer to the `Docker containers <#docker-containers>`_ section for installation instructions. +* Docker Containers - Refer to `Docker containers <#docker-containers>`_ for installation instructions. - * This is recommended for Large Language Models (LLM), Multimodal and Vision domains. 
- * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` - * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` + * NeMo Framework container - `nvcr.io/nvidia/nemo:24.05` -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for installation instructions. - * It's highly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +* LLMs and MMs Dependencies - Refer to `LLMs and MMs Dependencies <#install-llms-and-mms-dependencies>`_ for installation instructions. + +**Important: We strongly recommended that you start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3`** Conda -~~~~~ +^^^^^^ -We recommend installing NeMo in a fresh Conda environment. +Install NeMo in a fresh Conda environment: .. code-block:: bash conda create --name nemo python==3.10.12 conda activate nemo -Install PyTorch using their `configurator `_. +Install PyTorch using their `configurator `_: .. code-block:: bash conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system. +The command to install PyTorch may depend on your system. Use the configurator linked above to find the right command for your system. Then, install NeMo via Pip or from Source. We do not provide NeMo on the conda-forge or any other Conda channel. Pip -~~~ -Use this installation mode if you want the latest released version. +^^^ + +To install the nemo_toolkit, use the following installation method: .. code-block:: bash @@ -252,12 +240,12 @@ Use this installation mode if you want the latest released version. pip install Cython pip install nemo_toolkit['all'] -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. +Depending on the shell used, you may need to use the ``"nemo_toolkit[all]"`` specifier instead in the above command. -Pip (Domain Specific) -~~~~~~~~~~~~~~~~~~~~~ +Pip from a Specific Domain +^^^^^^^^^^^^^^^^^^^^^^^^^^ -To install only a specific domain of NeMo, use the following commands. Note: It is required to install the above pre-requisites before installing a specific domain of NeMo. +To install a specific domain of NeMo, you must first install the nemo_toolkit using the instructions listed above. Then, you run the following domain-specific commands: .. code-block:: bash @@ -267,9 +255,10 @@ To install only a specific domain of NeMo, use the following commands. Note: It pip install nemo_toolkit['vision'] pip install nemo_toolkit['multimodal'] -Pip from source -~~~~~~~~~~~~~~~ -Use this installation mode if you want the version from a particular GitHub branch (e.g main). +Pip from a Source Branch +^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want to work with a specific version of NeMo from a particular GitHub branch (e.g main), use the following installation method: .. code-block:: bash @@ -278,9 +267,10 @@ Use this installation mode if you want the version from a particular GitHub bran python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all] -From source -~~~~~~~~~~~ -Use this installation mode if you are contributing to NeMo. +Build from Source +^^^^^^^^^^^^^^^^^ + +If you want to clone the NeMo GitHub repository and contribute to NeMo open-source development work, use the following installation method: .. 
code-block:: bash @@ -289,18 +279,16 @@ Use this installation mode if you are contributing to NeMo. cd NeMo ./reinstall.sh -If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh`` -with ``pip install -e .`` when your PWD is the root of the NeMo repository. +If you only want the toolkit without the additional Conda-based dependencies, you can replace ``reinstall.sh`` with ``pip install -e .`` when your PWD is the root of the NeMo repository. -Mac computers with Apple silicon -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To install NeMo on Mac with Apple M-Series GPU: +Mac Computers with Apple Silicon +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- create a new Conda environment +To install NeMo on Mac computers with the Apple M-Series GPU, you need to create a new Conda environment, install PyTorch 2.0 or higher, and then install the nemo_toolkit. -- install PyTorch 2.0 or higher +**Important: This method is only applicable to the ASR domain.** -- run the following code: +Run the following code: .. code-block:: shell @@ -322,24 +310,22 @@ To install NeMo on Mac with Apple M-Series GPU: # Note that only the ASR toolkit is guaranteed to work on MacBook - so for MacBook use pip install 'nemo_toolkit[asr]' Windows Computers -~~~~~~~~~~~~~~~~~ - -One of the options is using Windows Subsystem for Linux (WSL). +^^^^^^^^^^^^^^^^^ -To install WSL: - -- In PowerShell, run the following code: +To install the Windows Subsystem for Linux (WSL), run the following code in PowerShell: .. code-block:: shell wsl --install # [note] If you run wsl --install and see the WSL help text, it means WSL is already installed. -Learn more about installing WSL at `Microsoft's official documentation `_. +To learn more about installing WSL, refer to `Microsoft's official documentation `_. + +After installing your Linux distribution with WSL, two options are available: -After Installing your Linux distribution with WSL: - - **Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. - - **Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. +**Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. + +**Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. Next, follow the instructions for Linux systems, as provided above. For example: @@ -351,8 +337,11 @@ Next, follow the instructions for Linux systems, as provided above. For example: ./reinstall.sh RNNT -~~~~ -Note that RNNT requires numba to be installed from conda. +^^^^ + +For optimal performance of a Recurrent Neural Network Transducer (RNNT), install the Numba package from Conda. + +Run the following code: .. code-block:: bash @@ -360,14 +349,12 @@ Note that RNNT requires numba to be installed from conda. pip uninstall numba conda install -c conda-forge numba -LLM and Multimodal Dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Install LLMs and MMs Dependencies +--------------------------------- -The LLM and Multimodal domains require three additional dependencies: -NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. +If you work with the LLM and MM domains, three additional dependencies are required: NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. When working with the `main` branch, these dependencies may require a recent commit. 
-When working with the `main` branch these dependencies may require a recent commit. -The most recent working versions of these dependencies are: +The most recent working versions of these dependencies are here: .. code-block:: bash @@ -376,11 +363,14 @@ The most recent working versions of these dependencies are: export mcore_commit=fbb375d4b5e88ce52f5f7125053068caff47f93f export nv_pytorch_tag=24.02-py3 -When using a released version of NeMo, -please refer to the `Software Component Versions `_ -for the correct versions. +When using a released version of NeMo, please refer to the `Software Component Versions `_ for the correct versions. + +PyTorch Container +^^^^^^^^^^^^^^^^^ + +We recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3. -If starting with a base NVIDIA PyTorch container first launch the container: +If starting with a base NVIDIA PyTorch container, you must first launch the container: .. code-block:: bash @@ -393,15 +383,14 @@ If starting with a base NVIDIA PyTorch container first launch the container: --ulimit stack=67108864 \ nvcr.io/nvidia/pytorch:$nv_pytorch_tag -Then install the dependencies: +Next, you need to install the dependencies. Apex -~~~~ -NeMo LLM Multimodal Domains require that NVIDIA Apex to be installed. -Apex comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need to be updated to a newer version. +^^^^ -To install Apex, run +NVIDIA Apex is required for LLM and MM domains. Although Apex is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. + +To install Apex, run the following code: .. code-block:: bash @@ -410,35 +399,32 @@ To install Apex, run git checkout $apex_commit pip install . -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm" +When attempting to install Apex separately from the NVIDIA PyTorch container, you might encounter an error if the CUDA version on your system is different from the one used to compile PyTorch. To bypass this error, you can comment out the relevant line in the setup file located in the Apex repository on GitHub here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32. -While installing Apex outside of the NVIDIA PyTorch container, -it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. -This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 +cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using. -cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using: +To install cuda-nvprof, run the following code: .. code-block:: bash conda install -c nvidia cuda-nvprof=11.8 -packaging is also needed: +Finally, install the packaging: .. code-block:: bash pip install packaging -With the latest versions of Apex, the `pyproject.toml` file in Apex may need to be deleted in order to install locally. - +To install the most recent versions of Apex locally, it might be necessary to remove the `pyproject.toml` file from the Apex directory. Transformer Engine -~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^ + +NVIDIA Transformer Engine is required for LLM and MM domains. 
Although the Transformer Engine is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -The NeMo LLM Multimodal Domains require that NVIDIA Transformer Engine to be installed. -Transformer Engine comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need Transformer Engine to be updated to a newer version. +The Transformer Engine facilitates training with FP8 precision on NVIDIA Hopper GPUs and introduces many enhancements for the training of Transformer-based models. Refer to `Transformer Enginer `_ for information. -Transformer Engine enables FP8 training on NVIDIA Hopper GPUs and many performance optimizations for transformer-based model training. -Documentation for installing Transformer Engine can be found `here `_. +To install Transformer Engine, run the following code: .. code-block:: bash @@ -451,14 +437,15 @@ Documentation for installing Transformer Engine can be found `here `_. +-------------------- + +NeMo Text Processing, specifically Inverse Text Normalization, is now a separate repository. It is located here: `https://github.com/NVIDIA/NeMo-text-processing `_. + +Docker Containers +----------------- + +NeMo containers are launched concurrently with NeMo version updates. For example, the release of NeMo ``r1.23.0`` comes with the container ``nemo:24.01.speech``. The latest containers are: + +* NeMo LLM and MM container - `nvcr.io/nvidia/nemo:24.03.framework` +* NeMo Speech container - `nvcr.io/nvidia/nemo:24.01.speech` -Docker containers -~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.23.0`` comes with container ``nemo:24.01.speech``, you may find more details about released containers in `releases page `_. +You can find additional information about released containers on the `NeMo releases page `_. -To use a pre-built container, please run +To use a pre-built container, run the following code: .. code-block:: bash docker pull nvcr.io/nvidia/nemo:24.01.speech -To build a nemo container with Dockerfile from a branch, please run +To build a nemo container with Dockerfile from a branch, run the following code: .. code-block:: bash - DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . - + DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.10-py3 and then installing from GitHub. @@ -499,25 +491,32 @@ If you choose to work with the main branch, we recommend using NVIDIA's PyTorch -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.10-py3 -Examples --------- -Many examples can be found under the `"Examples" `_ folder. +Future Work +----------- +The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. -Contributing ------------- +Discussions Board +----------------- + +FAQ can be found on the NeMo `Discussions board `_. You are welcome to ask questions or start discussions on the board. + +Contribute to NeMo +------------------ We welcome community contributions! Please refer to `CONTRIBUTING.md `_ for the process. Publications ------------- +------------------ We provide an ever-growing list of `publications `_ that utilize the NeMo Framework. -If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. 
-Please refer to the instructions in the `README of that branch `_. +To contribute an article to the collection, please submit a pull request to the ``gh-pages-src`` branch of this repository. For detailed information, please consult the README located at the `gh-pages-src branch `_. + +Licenses +-------- + +* `NeMo GitHub Apache 2.0 license `__ -License -------- -NeMo is released under an `Apache 2.0 license `_. +* NeMo is licensed under the `NVIDIA AI PRODUCT AGREEMENT `__. By pulling and using the container, you accept the terms and conditions of this license. \ No newline at end of file From 64c2812a2537f29e7e6a62780207f2749ec17ed1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 21:37:37 -0700 Subject: [PATCH 41/47] Guard cuda memory allocator update (#9312) (#9313) * Guard cuda memory allocator update * Apply isort and black reformatting --------- Signed-off-by: smajumdar Signed-off-by: titu1994 Co-authored-by: Somshubra Majumdar --- nemo/collections/common/data/lhotse/dataloader.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 32bbc1f3e8f4..01bf51b0e2c6 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -531,7 +531,13 @@ def maybe_set_cuda_expandable_segments(enabled: bool): warnings.warn( "You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration." ) - torch.cuda.memory._set_allocator_settings("expandable_segments:True") + + try: + torch.cuda.memory._set_allocator_settings("expandable_segments:True") + except RuntimeError: + logging.info( + "Failed to set expandable_segments:True for PyTorch CUDA allocator. You may get training speed improvements if you enable this" + ) def _select_channel(cut, channel_selector: int | str) -> list: From 28ccec727cb76ba14d7a55061c290906a7dc6664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Sat, 1 Jun 2024 00:40:41 -0400 Subject: [PATCH 42/47] Prompt formatter API and canary transcribe tensor input support (#9206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Apply CanaryPromptFormatter in dataset/inference Signed-off-by: Piotr Żelasko * Working inference with CanaryPromptFormatter Signed-off-by: Piotr Żelasko * Minimum working example of Canary.transcribe() with tensors Signed-off-by: Piotr Żelasko * training fix Signed-off-by: Piotr Żelasko * Update to the new 'chat' based prompt formatting API Signed-off-by: Piotr Żelasko * Prompt formatters for popular models and partial unit test coverage Signed-off-by: Piotr Żelasko * Updated documentation Signed-off-by: Piotr Żelasko * Improved test coverage + proper preamble support Signed-off-by: Piotr Żelasko * Fix usage of PromptFormatter for MT-AED class + fix tokenization/formatting issues Signed-off-by: Piotr Żelasko * Move some canary hacks to canary prompt formatter, improve validation, add tests for aggtok Signed-off-by: Piotr Żelasko * aed_model.transcribe(**slots) support, rename all slots to lowercase and drop pipes everywhere except template definition. 
Signed-off-by: Piotr Żelasko * truly generic version Signed-off-by: Piotr Żelasko * making transcribe_speech.py work prompt slots + syntactic sugar Signed-off-by: Piotr Żelasko * update streaming_utils.py Signed-off-by: Piotr Żelasko * fix Signed-off-by: Piotr Żelasko * code review: partial Signed-off-by: Piotr Żelasko * Accept multi-turn, single-turn, and legacy prompt format in transcribe() and transcribe_speech.py Signed-off-by: Piotr Żelasko * Address code reviews Signed-off-by: Piotr Żelasko * Add support for SPE special tokens bos/eos in prompt templates and ensure Llama2 format gives identical results with the reference implementation Signed-off-by: Piotr Żelasko * Fix tests and add llama2 prompt formatter tests Signed-off-by: Piotr Żelasko * Fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- examples/asr/transcribe_speech.py | 13 +- nemo/collections/asr/data/audio_to_text.py | 45 +-- .../asr/data/audio_to_text_lhotse_prompted.py | 158 +++----- .../asr/models/aed_multitask_models.py | 172 +++++++-- .../asr/parts/mixins/transcription.py | 8 +- .../asr/parts/utils/streaming_utils.py | 66 +++- nemo/collections/common/prompts/__init__.py | 0 nemo/collections/common/prompts/canary.py | 71 ++++ nemo/collections/common/prompts/example.py | 36 ++ nemo/collections/common/prompts/formatter.py | 347 ++++++++++++++++++ nemo/collections/common/prompts/gemma.py | 29 ++ nemo/collections/common/prompts/llama.py | 72 ++++ nemo/collections/common/prompts/mistral.py | 33 ++ nemo/collections/common/prompts/phi2.py | 62 ++++ .../common/tokenizers/aggregate_tokenizer.py | 9 +- .../common/tokenizers/canary_tokenizer.py | 49 ++- .../tokenizers/sentencepiece_tokenizer.py | 3 +- tests/collections/__init__.py | 0 .../asr/test_asr_multitask_model_bpe.py | 25 +- .../collections/asr/test_custom_tokenizer.py | 12 +- .../common/prompt_formatters/conftest.py | 51 +++ .../test_canary_prompt_formatter.py | 50 +++ .../test_gemma_prompt_formatter.py | 40 ++ .../test_llama2_prompt_formatter.py | 63 ++++ .../test_mistral_prompt_formatter.py | 32 ++ .../test_prompt_formatter_api.py | 147 ++++++++ 26 files changed, 1382 insertions(+), 211 deletions(-) create mode 100644 nemo/collections/common/prompts/__init__.py create mode 100644 nemo/collections/common/prompts/canary.py create mode 100644 nemo/collections/common/prompts/example.py create mode 100644 nemo/collections/common/prompts/formatter.py create mode 100644 nemo/collections/common/prompts/gemma.py create mode 100644 nemo/collections/common/prompts/llama.py create mode 100644 nemo/collections/common/prompts/mistral.py create mode 100644 nemo/collections/common/prompts/phi2.py create mode 100644 tests/collections/__init__.py create mode 100644 tests/collections/common/prompt_formatters/conftest.py create mode 100644 tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_prompt_formatter_api.py diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 1763c2035805..b63e9db5fef1 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -16,7 +16,7 @@ import glob import json import os -from dataclasses import 
dataclass, is_dataclass +from dataclasses import dataclass, field, is_dataclass from tempfile import NamedTemporaryFile from typing import List, Optional, Union @@ -25,6 +25,7 @@ from omegaconf import OmegaConf, open_dict from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel +from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig @@ -169,6 +170,14 @@ class TranscriptionConfig: # Decoding strategy for AED models multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig() + # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs: + # Implicit single-turn assuming default role='user' (works with Canary-1B) + # +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes + # Explicit single-turn prompt: + # +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es +prompt.slots.task=s2t_translation +prompt.slots.pnc=yes + # Explicit multi-turn prompt: + # +prompt.turns='[{role:user,slots:{source_lang:en,target_lang:es,task:asr,pnc:yes}}]' + prompt: dict = field(default_factory=dict) # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models decoder_type: Optional[str] = None @@ -411,6 +420,8 @@ def autocast(dtype=None): override_cfg.augmentor = augmentor override_cfg.text_field = cfg.gt_text_attr_name override_cfg.lang_field = cfg.gt_lang_attr_name + if hasattr(override_cfg, "prompt"): + override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt)) transcriptions = asr_model.transcribe( audio=filepaths, override_config=override_cfg, diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 00c15109b64f..e0bb63ad18cd 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -75,7 +75,9 @@ def _speech_collate_fn(batch, pad_id): has_audio = audio_lengths[0] is not None if has_audio: max_audio_len = max(audio_lengths).item() - max_tokens_len = max(tokens_lengths).item() + has_tokens = tokens_lengths[0] is not None + if has_tokens: + max_tokens_len = max(tokens_lengths).item() audio_signal, tokens = [], [] for b in batch: @@ -89,19 +91,24 @@ def _speech_collate_fn(batch, pad_id): pad = (0, max_audio_len - sig_len) sig = torch.nn.functional.pad(sig, pad) audio_signal.append(sig) - tokens_i_len = tokens_i_len.item() - if tokens_i_len < max_tokens_len: - pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) - tokens.append(tokens_i) + if has_tokens: + tokens_i_len = tokens_i_len.item() + if tokens_i_len < max_tokens_len: + pad = (0, max_tokens_len - tokens_i_len) + tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) + tokens.append(tokens_i) if has_audio: audio_signal = torch.stack(audio_signal) audio_lengths = torch.stack(audio_lengths) else: audio_signal, audio_lengths = None, None - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) + if has_tokens: + tokens = torch.stack(tokens) + tokens_lengths = torch.stack(tokens_lengths) + else: + tokens = None + tokens_lengths = None if sample_ids is None: return audio_signal, 
audio_lengths, tokens, tokens_lengths else: @@ -256,8 +263,7 @@ def cache_datastore_manifests( if num_datastore_manifests > 0: # Local utility function def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): - """Cache manifests and audio data from object store. - """ + """Cache manifests and audio data from object store.""" # Determine the number of workers to use if num_workers is None: num_workers = os.cpu_count() - 1 @@ -421,8 +427,7 @@ class _AudioTextDataset(Dataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -546,8 +551,7 @@ class AudioToCharDataset(_AudioTextDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -640,8 +644,7 @@ class AudioToBPEDataset(_AudioTextDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -910,8 +913,7 @@ def __next__(self): return TarredAudioFilter(self.manifest_processor.collection) def _loop_offsets(self, iterator): - """This function is used to iterate through utterances with different offsets for each file. - """ + """This function is used to iterate through utterances with different offsets for each file.""" class TarredAudioLoopOffsets: def __init__(self, collection): @@ -944,8 +946,7 @@ def _collate_fn(self, batch): return _speech_collate_fn(batch, self.pad_id) def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. - """ + """Builds the training sample by combining the data from the WebDataset with the manifest info.""" audio_bytes, audio_filename, offset_id = tup # Grab manifest entry from self.manifest_preprocessor.collection @@ -1316,7 +1317,9 @@ class BucketingDataset(IterableDataset): """ def __init__( - self, dataset: IterableDataset, bucketing_batch_size: int, + self, + dataset: IterableDataset, + bucketing_batch_size: int, ): self.wrapped_dataset = dataset self.bucketing_batch_size = bucketing_batch_size diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index 000b1a8f0839..e9e97d3d32d7 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -13,7 +13,6 @@ # limitations under the License. 
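# --- Illustrative aside (not part of the patch): the collate change above allows a batch to
# --- carry audio without token targets. A minimal, self-contained sketch of the same
# --- pad-and-stack behavior, assuming 1-D float audio signals:
import torch
import torch.nn.functional as F

def pad_and_stack(signals, pad_value=0.0):
    # Right-pad every signal to the longest one in the batch, then stack into (B, T).
    max_len = max(s.size(0) for s in signals)
    return torch.stack([F.pad(s, (0, max_len - s.size(0)), value=pad_value) for s in signals])

audio = pad_and_stack([torch.randn(16000), torch.randn(12000)])  # -> shape (2, 16000)
tokens, tokens_lengths = None, None  # token-less batches (e.g. audio-only inference) are now allowed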
from typing import Callable, Sequence -import omegaconf import torch.utils.data from lhotse import CutSet from lhotse.cut import MixedCut, MonoCut @@ -21,7 +20,9 @@ from lhotse.dataset.collation import collate_vectors from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper +from nemo.collections.common.prompts.canary import CanaryPromptFormatter from nemo.collections.common.tokenizers import CanaryTokenizer, TokenizerSpec +from nemo.collections.common.tokenizers.canary_tokenizer import CANARY_SPECIAL_TOKENIZER class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): @@ -57,21 +58,21 @@ def __init__( def __getitem__(self, cuts: CutSet) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: audio, audio_lens, cuts = self.load_audio(cuts) - tokens, prompt_tokens = self.prompt_format_fn(cuts, self.tokenizer, inference=self.inference) + prompts_with_answers, prompts = self.prompt_format_fn(cuts, self.tokenizer, inference=self.inference) - tokens = [torch.as_tensor(t) for t in tokens] - token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) - tokens = collate_vectors(tokens, padding_value=self.padding_value) + prompts_with_answers = [torch.as_tensor(t) for t in prompts_with_answers] + prompts_with_answers_lens = torch.tensor([t.size(0) for t in prompts_with_answers], dtype=torch.long) + prompts_with_answers = collate_vectors(prompts_with_answers, padding_value=self.padding_value) if self.inference: - prompt_tokens = [torch.as_tensor(t) for t in prompt_tokens] - prompt_token_lens = torch.tensor([t.size(0) for t in prompt_tokens], dtype=torch.long) - prompt_tokens = collate_vectors(prompt_tokens, padding_value=self.padding_value) + prompts = [torch.as_tensor(t) for t in prompts] + prompts_lens = torch.tensor([t.size(0) for t in prompts], dtype=torch.long) + prompts = collate_vectors(prompts, padding_value=self.padding_value) else: - prompt_tokens = None - prompt_token_lens = None + prompts = None + prompts_lens = None - return audio, audio_lens, tokens, token_lens, prompt_tokens, prompt_token_lens + return audio, audio_lens, prompts_with_answers, prompts_with_answers_lens, prompts, prompts_lens # Mapping from a string name to a known prompt formatter function. @@ -105,7 +106,9 @@ def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper, bool] @registered_prompt_format_fn -def canary(cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False) -> Sequence[Sequence[int]]: +def canary( + cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: """ Prepend and append control tokens to the token sequence as per Canary format. @@ -132,116 +135,53 @@ def canary(cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False) - assert isinstance( tokenizer._tokenizer, CanaryTokenizer ), "To use 'canary' prompt format, you must use the CanaryTokenizer." - tokenizer = tokenizer._tokenizer + formatter = CanaryPromptFormatter(tokenizer._tokenizer) - tokens, prompts = [], [] + prompts_with_answers, prompts = [], [] for cut in cuts: if isinstance(cut, MixedCut): cut = cut._first_non_padding_cut - assert isinstance(cut, MonoCut), "Expected MonoCut." 
+ if not isinstance(cut, MonoCut): + raise TypeError( + f"Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: {cut=})" + ) # first, validate the utterance - missing_keys = [k for k in ("source_lang", "target_lang", "taskname", "pnc") if k not in cut.custom] + expected_slots = set(formatter.get_slots("user")) + missing_keys = expected_slots - set(cut.custom) + if "task" in missing_keys and "taskname" in cut.custom: + # Compatibility with "old" Canary manifest format. + # For compatbility with inference options, this slot is now called "task". + cut.custom["task"] = cut.custom["taskname"] + missing_keys.remove("task") if missing_keys: raise RuntimeError( f"We found cut with ID {cut.id} that is missing the following keys: {missing_keys}" f"Please ensure that every utterance in the input manifests contains these keys." ) - # Actual tokenization. If a cut has multiple supervisions, we'll stitch their tokenized texts together. - texts = [sup.text for sup in cut.supervisions] - langs = [sup.language for sup in cut.supervisions] - taskname = cut.custom['taskname'] - pnc = cut.custom['pnc'] - source_lang = cut.custom['source_lang'] - target_lang = cut.custom['target_lang'] - - tokens.append(canary_prompt(tokenizer, texts, langs, source_lang, target_lang, taskname, pnc)) - if inference: - prompts.append(canary_prompt(tokenizer, None, None, source_lang, target_lang, taskname, pnc)) - return tokens, prompts - - -def canary_prompt( - tokenizer: CanaryTokenizer, - text: str | list[str] | None, - language: str | list[str] | None, - source_language: str, - target_language: str, - taskname: str, - pnc: str, -) -> list[int]: - if isinstance(text, str): - text = [text] - if isinstance(language, str): - language = [language] - - if text is not None: - try: - tokens = sum((tokenizer.text_to_ids(text_, lang_) for text_, lang_ in zip(text, language)), start=[]) - except omegaconf.errors.KeyValidationError as e: - raise ProbablyIncorrectLanguageKeyError( - "We couldn't select the right tokenizer, which could be due to issues with reading " - "the language from the manifest. " - "If you're training, try setting lang_field='' to a different value (probably 'target_lang' or 'lang'). " - "If you're using model.transcribe() directly, please use override_config kwarg to set this. " - "If you're using transcribe_speech.py, use option gt_lang_attr_name='...' " - ) from e - else: - tokens = None # create prompt for inference - - # bos - prompted_tokens = [tokenizer.bos_id] - - if tokens is not None and len(tokens) == 0: - # no speech token - prompted_tokens.append(tokenizer.nospeech_id) - else: - # first, validate the utterance - if source_language is None or target_language is None or taskname is None or pnc is None: - raise RuntimeError( - f"Missing keys provided to prompt: " - f"source_langauge={source_language},\n" - f"target_language={target_language},\n" - f"taskname={taskname},\n" - f"pnc={pnc}\n" - f"Please ensure that every utterance in the input manifests contains these keys." 
- ) - - # src_lang_id/no_speech - src_lang_id = tokenizer.spl_token_to_id(source_language) - prompted_tokens.append(src_lang_id) - - # task - task = taskname - if task == 'asr' or task == "transcribe": - prompted_tokens.append(tokenizer.spl_token_to_id("transcribe")) - elif task == 's2t_translation' or task == 'ast' or task == "translate": - prompted_tokens.append(tokenizer.spl_token_to_id("translate")) - else: - raise ValueError(f"Unknown task: {task}") - - # tgt_lang_id - tgt_lang_id = tokenizer.spl_token_to_id(target_language) - prompted_tokens.append(tgt_lang_id) - - # PnC - pnc = f"{pnc}".lower().strip() # to account for bool or str - if pnc in {'yes', 'true'}: - prompted_tokens.append(tokenizer.spl_token_to_id("pnc")) - elif pnc in {'no', 'false'}: - prompted_tokens.append(tokenizer.spl_token_to_id("nopnc")) - else: - raise ValueError(f"Unknown value for key 'pnc': {pnc}") - - # text (only in training) - if tokens is not None: - prompted_tokens.extend(tokens) + encoded = formatter.encode_dialog( + turns=[ + dict( + role="user", + slots={ + **{slot: cut.custom[slot] for slot in expected_slots}, + formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER, + }, + ), + dict( + role="assistant", + slots={ + "text": ' '.join(s.text for s in cut.supervisions), + formatter.PROMPT_LANGUAGE_SLOT: cut.custom["target_lang"], + }, + ), + ] + ) + prompts_with_answers.append(encoded["input_ids"]) + prompts.append(encoded["context_ids"]) - # eos (only in training) - if tokens is not None: - prompted_tokens.append(tokenizer.eos_id) - return prompted_tokens + return prompts_with_answers, prompts class ProbablyIncorrectLanguageKeyError(RuntimeError): diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index b11d744a7e6a..880f8bb3a004 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import warnings from dataclasses import dataclass, field from math import ceil from typing import Any, Dict, List, Optional, Union @@ -45,6 +46,7 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path +from nemo.collections.common.prompts.formatter import PromptFormatter from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, @@ -100,10 +102,7 @@ class MultiTaskTranscriptionConfig(TranscribeConfig): Configuration for Multi Task Transcription """ - task: Optional[str] = None - pnc: Optional[bool] = None - source_lang: Optional[str] = None - target_lang: Optional[str] = None + prompt: list[dict[str, dict[str, str]]] | None = None text_field: str = "answer" lang_field: str = "target_lang" @@ -112,10 +111,7 @@ class MultiTaskTranscriptionConfig(TranscribeConfig): ) def __post_init__(self): - required_fields = ['task', 'pnc', 'source_lang', 'target_lang', 'text_field', 'lang_field'] - for field in required_fields: - if not hasattr(self, field): - raise ValueError(f"`{field}` must be present in the transcription config: {self}") + self.prompt = parse_multitask_prompt(self.prompt) class EncDecMultiTaskModel(ASRModel, ExportableEncDecModel, ASRBPEMixin, ASRTranscriptionMixin): @@ -134,6 +130,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) + prompt_cls = PromptFormatter.resolve(self.prompt_format) + self.prompt = prompt_cls( + tokenizer=self.tokenizer, + defaults=OmegaConf.to_container(cfg.get("prompt_defaults")), + ) + # Setup audio preprocessor self.preprocessor = EncDecMultiTaskModel.from_config_dict(self.cfg.preprocessor) # Setup audio encoder @@ -391,15 +393,12 @@ def transcribe( audio: Union[str, List[str], np.ndarray, DataLoader], batch_size: int = 4, return_hypotheses: bool = False, - task: Optional[str] = None, - pnc: Optional[bool] = None, - source_lang: Optional[str] = None, - target_lang: Optional[str] = None, num_workers: int = 0, channel_selector: Optional[ChannelSelectorType] = None, augmentor: DictConfig = None, verbose: bool = True, override_config: Optional[MultiTaskTranscriptionConfig] = None, + **prompt, ) -> Union[List[str], List[Hypothesis]]: """ Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. @@ -412,15 +411,12 @@ def transcribe( Bigger will result in better throughput performance but would use more memory. return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring - task: (str) task name. Defaults to `asr`. - pnc: (bool) whether to apply punctuation and capitalization or not. Defaults to True. - source_lang: (str) source language. Defaults to `en`. - target_lang: (str) target language. Defaults to `en`. num_workers: (int) number of workers for DataLoader channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. verbose: (bool) whether to display tqdm progress bar override_config: (Optional[MultiTaskTranscriptionConfig]) A config to override the default config. 
+ **prompt: Optional input to construct the prompts for the model. Accepted formats are: 1) legacy Canary-1B API source_lang=, target_lang=, etc. 2) explicit single-turn role=, slots={: , ...} 3) explicit multi-turn: turns=[{"role": , "slots": {: , ...}}] Returns: A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files @@ -433,10 +429,7 @@ def transcribe( channel_selector=channel_selector, augmentor=augmentor, verbose=verbose, - task=task, - pnc=pnc, - source_lang=source_lang, - target_lang=target_lang, + prompt=prompt, ) else: if not isinstance(override_config, MultiTaskTranscriptionConfig): @@ -738,9 +731,6 @@ def _transcribe_on_begin(self, audio, trcfg: MultiTaskTranscriptionConfig): if hasattr(trcfg, '_internal') and hasattr(trcfg._internal, 'manifest_path'): trcfg._internal.manifest_filepath = manifest_path - elif isinstance(audio, (np.ndarray, torch.Tensor)): - raise NotImplementedError("Transcribing from numpy or torch tensors is not supported yet.") - def _transcribe_input_manifest_processing( self, audio_files: List[str], temp_dir: str, trcfg: MultiTaskTranscriptionConfig ) -> Dict[str, Any]: @@ -792,7 +782,47 @@ def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=batch[0], input_signal_length=batch[1] ) - decoder_input_ids = batch[-2].to(trcfg._internal.device) + if len(batch) == 6: + # Prompt provided by the dataloader. + decoder_input_ids = batch[4] + else: + # The dataloader provided only audio + audio_lens, so we + # are constructing the prompt dynamically using TranscribeConfig. + + # Now ask the prompt formatter about which slots are required. + # It will return a default prompt structure with default slot values (if available, None otherwise). + # We iterate over that structure and update slot values based on ``trcfg.prompt``. + default_turns = self.prompt.get_default_dialog_slots() + if not trcfg.prompt: + # No turns were provided, use defaults. + turns = default_turns + else: + # Turns were provided, iterate over them and fill missing slot values using defaults.. + turns = trcfg.prompt.copy() # shallow copy #1: don't override the config + for turn in turns: + role = turn["role"] + # Check if we have defaults for this role. + # There shouldn't be more than a single turn for a given role, but if there are, + # we'll emit a warning. + if default_turns_for_role := [t for t in default_turns if t["role"] == role]: + if len(default_turns_for_role) > 1: + warnings.warn( + f"More than one default turn detected for {role=}. " + f"We'll be using default slot values for the first turn of {role=} only." + ) + default_slots = default_turns_for_role[0]["slots"] + turn["slots"] = turn["slots"].copy() # shallow copy #1: don't override the config + # fill missing slots using defaults + for slot, val in default_slots.items(): + if turn["slots"].get(slot) is None: + turn["slots"][slot] = val + + decoder_input_ids = ( + self.prompt.encode_dialog(turns=turns)["context_ids"] + .unsqueeze(0) + .repeat(batch[0].shape[0], 1) + .to(trcfg._internal.device) + ) output = dict( log_probs=log_probs, encoded_lengths=encoded_len, @@ -906,6 +936,8 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult Returns: A list of dictionaries with the audio file paths fixed. 
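        For illustration (a sketch with a hypothetical path): with an empty ``trcfg.prompt`` and the
        default ``text_field='answer'``, a bare string item such as ``"/data/utt1.wav"`` is expanded
        roughly to::

            {'audio_filepath': '/data/utt1.wav', 'duration': 100000, 'answer': 'nothing',
             'source_lang': 'en', 'target_lang': 'en', 'taskname': 'asr', 'pnc': 'yes'}

        The slot values come from the user turn configured in ``trcfg.prompt`` when one is present.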
""" + # This method is a legacy helper for Canary that checks whether prompt slot values were provided + # in the input manifest and if not, it injects the defaults. out_json_items = [] for item in json_items: if isinstance(item, str): @@ -913,28 +945,21 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult entry = { 'audio_filepath': item, 'duration': 100000, - 'source_lang': 'en' if trcfg.source_lang is None else trcfg.source_lang, - 'taskname': 'asr' if trcfg.task is None else trcfg.task, - 'target_lang': 'en' if trcfg.target_lang is None else trcfg.target_lang, - 'pnc': 'yes' if trcfg.pnc is None else 'yes' if trcfg.pnc else 'no', trcfg.text_field: 'nothing', } elif isinstance(item, dict): entry = item entry['audio_filepath'] = get_full_path(entry['audio_filepath'], manifest_file=manifest_path) - - if 'source_lang' not in entry: - entry['source_lang'] = 'en' if trcfg.source_lang is None else trcfg.source_lang - if 'taskname' not in entry: - entry['taskname'] = 'asr' if trcfg.task is None else trcfg.task - if 'target_lang' not in entry: - entry['target_lang'] = 'en' if trcfg.target_lang is None else trcfg.target_lang - if 'pnc' not in entry: - entry['pnc'] = 'yes' if trcfg.pnc is None else 'yes' if trcfg.pnc else 'no' if trcfg.text_field not in entry: entry[trcfg.text_field] = 'nothing' else: raise ValueError(f"Expected str or dict, got {type(item)}") + default_turn = [t for t in trcfg.prompt if t["role"] == "user"] + default_turn = default_turn[0]["slots"] if default_turn else {} + for k, dv in (("source_lang", "en"), ("target_lang", "en"), ("taskname", "asr"), ("pnc", "yes")): + if k not in entry: + # last-chance fallback injecting legacy Canary defaults if none were provided. + entry[k] = default_turn.get(k, dv) out_json_items.append(entry) return out_json_items @@ -977,3 +1002,76 @@ def predict_step(self, batch, batch_idx=0, dataloader_idx=0, has_processed_signa text = [self.decoding.strip_special_tokens(t) for t in text] return text + + +def parse_multitask_prompt(prompt: dict | None) -> list[dict]: + if prompt is None or not prompt: + return [] + + # Case 1. + # Multi-turn prompting format. This format conforms to PromptFormatter API and needs no further modification. + # This format allows to condition the model on chat history, system+user prompts, etc. + # Example: + # model.transcribe( + # audio, + # turns=[ + # dict( + # role="user", + # slots=dict( + # source_lang='en', target_lang='de', task='asr', pnc=True, context='translate this text' + # ), + # ), + # dict( + # role="assistant", + # slots=dict(message="Calculating the translation of given text. Do you want to proceed?"), + # ), + # dict( + # role="user", + # slots=dict( + # source_lang='en', target_lang='de', task='asr', pnc=True, context='Yes, please proceed.' + # ), + # ), + # ], + # ) + if 'turns' in prompt: + assert ( + len(prompt) == 1 + and isinstance(prompt["turns"], list) + and all(isinstance(t, dict) and "role" in t and "slots" in t for t in prompt["turns"]) + ), ( + f"When providing a multi-turn prompt through 'turns', no other keys are allowed " + f"and the value under prompt['turns'] must be a list of dicts with roles and slot values " + f"(we received {prompt=})" + ) + return prompt["turns"] + + values_are_dicts = any(isinstance(v, dict) for k, v in prompt.items() if k != "slots") + assert not values_are_dicts, ( + f"We don't support dict values for prompt keys other than 'slots'. " f"We received {prompt=}" + ) + + # Case 2. 
+ # Single-turn prompting format with explicitly provided role and slot names and values. + # We create a 1-item multi-turn prompt from this input. + # Example: + # model.transcribe( + # audio, + # role="user", + # slots=dict(source_lang='en', target_lang='de', task='asr', pnc=True, context='translate this text'), + # ) + if "role" in prompt and "slots" in prompt: + assert isinstance(prompt["slots"], dict), ( + f"When providing a single-turn prompt through 'role', 'slots' must also be provided " + f"(we received {prompt=})." + ) + return [prompt] + + # Case 3. + # Legacy prompting format for Canary-1B preserved for backward compatibility. + # Extra fields are converted to a single-turn prompt with role "user" (unless overridden with 'role'). + # Example: + # model.transcribe( + # audio, pnc=True, source_lang='en', target_lang='de', task='asr', context='translate this text' + # ) + role = prompt.pop("role", "user") + return [dict(role=role, slots=prompt)] diff --git a/nemo/collections/asr/parts/mixins/transcription.py b/nemo/collections/asr/parts/mixins/transcription.py index df8d6bac50a9..261e97a225dd 100644 --- a/nemo/collections/asr/parts/mixins/transcription.py +++ b/nemo/collections/asr/parts/mixins/transcription.py @@ -148,11 +148,9 @@ def get_item(self, index): # Calculate seq length seq_len = torch.tensor(samples.shape[0], dtype=torch.long) - # Dummy text tokens - text_tokens = torch.tensor([0], dtype=torch.long) - text_tokens_len = torch.tensor(1, dtype=torch.long) - - return (samples, seq_len, text_tokens, text_tokens_len) + # Typically NeMo ASR models expect the mini-batch to be a 4-tuple of (audio, audio_len, text, text_len). + # For inference, we set text and text_len to None to not disrupt the shape of the tuple. + return samples, seq_len, None, None class TranscriptionMixin(ABC): diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 71c945b66255..51a46184e66f 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -21,7 +21,6 @@ from omegaconf import OmegaConf from torch.utils.data import DataLoader -from nemo.collections.asr.data.audio_to_text_lhotse_prompted import canary_prompt from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.preprocessing.features import normalize_batch @@ -444,7 +443,10 @@ def _convert_buffer_to_features(self): device = self.asr_model.device audio_signal = samples.unsqueeze_(0).to(device) audio_signal_len = torch.Tensor([samples.shape[1]]).to(device) - features, features_len = self.raw_preprocessor(input_signal=audio_signal, length=audio_signal_len,) + features, features_len = self.raw_preprocessor( + input_signal=audio_signal, + length=audio_signal_len, + ) features = features.squeeze() self._update_feature_buffer(features[:, -self.feature_chunk_len :]) @@ -479,7 +481,10 @@ def __init__(self, samples, frame_len, preprocessor, device, pad_to_frame_len=Tr self._feature_frame_len = frame_len / timestep_duration audio_signal = torch.from_numpy(self._samples).unsqueeze_(0).to(device) audio_signal_len = torch.Tensor([self._samples.shape[0]]).to(device) - self._features, self._features_len = preprocessor(input_signal=audio_signal, length=audio_signal_len,) + self._features, self._features_len = preprocessor( + input_signal=audio_signal, + length=audio_signal_len, + ) self._features = 
self._features.squeeze() def __iter__(self): @@ -701,7 +706,12 @@ class for streaming frame-based ASR use reset() method to reset FrameASR's """ def __init__( - self, asr_model, frame_len=1.6, total_buffer=4.0, batch_size=4, pad_to_buffer_len=True, + self, + asr_model, + frame_len=1.6, + total_buffer=4.0, + batch_size=4, + pad_to_buffer_len=True, ): ''' Args: @@ -1183,7 +1193,9 @@ def _get_batch_preds(self): del best_hyp, pred def transcribe( - self, tokens_per_chunk: int, delay: int, + self, + tokens_per_chunk: int, + delay: int, ): """ Performs "middle token" alignment prediction using the buffered audio chunk. @@ -1210,7 +1222,12 @@ def transcribe( ids, toks = self._alignment_decoder(alignment, self.asr_model.tokenizer, self.blank_id) if len(ids) > 0 and a_idx < signal_end_idx: - self.unmerged[idx] = inplace_buffer_merge(self.unmerged[idx], ids, delay, model=self.asr_model,) + self.unmerged[idx] = inplace_buffer_merge( + self.unmerged[idx], + ids, + delay, + model=self.asr_model, + ) output = [] for idx in range(self.batch_size): @@ -1276,7 +1293,9 @@ def __init__( self.alignment_basepath = alignment_basepath def transcribe( - self, tokens_per_chunk: int, delay: int, + self, + tokens_per_chunk: int, + delay: int, ): if self.lcs_delay < 0: raise ValueError( @@ -1302,7 +1321,10 @@ def transcribe( if len(ids) > 0: self.unmerged[idx] = inplace_buffer_merge( - self.unmerged[idx], ids, delay, model=self.asr_model, + self.unmerged[idx], + ids, + delay, + model=self.asr_model, ) else: @@ -1588,15 +1610,17 @@ def get_input_tokens(self, sample: dict): f"We found sample that is missing the following keys: {missing_keys}" f"Please ensure that every utterance in the input manifests contains these keys. Sample: {sample}" ) - tokens = canary_prompt( - tokenizer=self.asr_model.tokenizer, - text=None, - language=None, - source_language=sample['source_lang'], - target_language=sample['target_lang'], - taskname=sample['taskname'], - pnc=sample['pnc'], - ) + tokens = self.asr_model.prompt.encode_dialog( + turns=[ + { + "role": "user", + "slots": { + **sample, + self.asr_model.prompt.PROMPT_LANGUAGE_SLOT: "spl_tokens", + }, + } + ] + )["context_ids"] else: raise ValueError(f"Unknown prompt format: {self.asr_model.prompt_format}") return torch.tensor(tokens, dtype=torch.long, device=self.asr_model.device).unsqueeze(0) # [1, T] @@ -1712,12 +1736,16 @@ def _get_batch_preds(self, keep_logits=False): encoded, encoded_len = results log_probs = self.asr_model.ctc_decoder(encoder_output=encoded) transcribed_texts, _ = self.asr_model.ctc_decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + decoder_outputs=log_probs, + decoder_lengths=encoded_len, + return_hypotheses=False, ) else: log_probs, encoded_len, predictions = results transcribed_texts, _ = self.asr_model.decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + decoder_outputs=log_probs, + decoder_lengths=encoded_len, + return_hypotheses=False, ) self.all_preds.extend(transcribed_texts) diff --git a/nemo/collections/common/prompts/__init__.py b/nemo/collections/common/prompts/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py new file mode 100644 index 000000000000..aadc976ba474 --- /dev/null +++ b/nemo/collections/common/prompts/canary.py @@ -0,0 +1,71 @@ +from nemo.collections.common.prompts.formatter 
import Modality, PromptFormatter +from nemo.collections.common.tokenizers.canary_tokenizer import ( + CANARY_BOS, + CANARY_EOS, + CANARY_NOPNC, + CANARY_PNC, + CANARY_SPECIAL_TOKENIZER, +) + + +class CanaryPromptFormatter(PromptFormatter): + NAME = "canary" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"{CANARY_BOS}|source_lang||task||target_lang||pnc|", + "slots": { + "source_lang": Modality.Text, + "task": Modality.Text, + "target_lang": Modality.Text, + "pnc": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|text|{CANARY_EOS}", + "slots": { + "text": Modality.Text, + }, + }, + } + + def encode_turn(self, prompt_template: str, expected_slots: dict, slot_values: dict) -> list[int]: + # This method handles a level of indirection for Canary. + # It maps values provided in trcfg to the actual special tokens + # expected to be present in canary prompt. + # It used to be done in prompt_format_fn inside Dataset class corresponding to Canary, + # but we are not using it here anymore. + # This maps things such as '|task|: "asr"' to '|TASK|: "<|transcribe|>"'. + slot_values = map_manifest_values_to_special_tokens(slot_values) + return super().encode_turn( + prompt_template=prompt_template, expected_slots=expected_slots, slot_values=slot_values + ) + + +def map_manifest_values_to_special_tokens(slot_values: dict[str, str]) -> dict[str, str]: + slot_values = slot_values.copy() + + any_special_token_present = False + + for k in ("source_lang", "target_lang"): + if k in slot_values and not ((v := slot_values[k]).startswith("<|") and v.endswith("|>")): + slot_values[k] = "<|" + slot_values[k] + "|>" + any_special_token_present = True + + k = "pnc" + if k in slot_values and slot_values[k] not in (CANARY_PNC, CANARY_NOPNC): + slot_values[k] = CANARY_PNC if slot_values[k] in ("yes", "1", "True", "true") else CANARY_NOPNC + any_special_token_present = True + + # Note: we re-map 'taskname' to 'task' for compatibility with earlier versions of Canary training. + for k in ("task", "taskname"): + if k in slot_values and slot_values[k] not in ("<|transcribe|>", "<|translate|>"): + slot_values["task"] = "<|transcribe|>" if slot_values[k] == "asr" else "<|translate|>" + any_special_token_present = True + + # Auto-inject which tokenizer to look up in CanaryTokenizer if not provided, + # and slots for this turn correspond to user prompt. + if any_special_token_present and PromptFormatter.PROMPT_LANGUAGE_SLOT not in slot_values: + slot_values[PromptFormatter.PROMPT_LANGUAGE_SLOT] = CANARY_SPECIAL_TOKENIZER + + return slot_values diff --git a/nemo/collections/common/prompts/example.py b/nemo/collections/common/prompts/example.py new file mode 100644 index 000000000000..3589efb938f4 --- /dev/null +++ b/nemo/collections/common/prompts/example.py @@ -0,0 +1,36 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/phi-2#phi-2-usage +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + + +class ExamplePromptFormatter(PromptFormatter): + """ + The simplest possible prompt formatter implementation. + + It defines a dialog of the form: + + User: Hi. + Assistant: Hi, how can I help you? + User: What's the time? + Assistant: It's 9 o'clock. 
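    A rough usage sketch (``tokenizer`` is assumed to be any ``TokenizerSpec`` instance)::

        >>> formatter = ExamplePromptFormatter(tokenizer)
        ... encoded = formatter.encode_dialog(
        ...     turns=[
        ...         {"role": "user", "slots": {"message": "What's the time?"}},
        ...         {"role": "assistant", "slots": {"message": "It's 9 o'clock."}},
        ...     ]
        ... )

    Because the last turn uses OUTPUT_ROLE, ``encoded`` contains ``input_ids``, ``context_ids``,
    ``answer_ids`` and ``mask``, as documented in :class:`PromptFormatter`.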
+ + """ + + NAME = "example_prompt_format" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"User: |message|\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"Assistant: |message|\n", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/formatter.py b/nemo/collections/common/prompts/formatter.py new file mode 100644 index 000000000000..524b2e62c5a3 --- /dev/null +++ b/nemo/collections/common/prompts/formatter.py @@ -0,0 +1,347 @@ +from abc import ABC +from enum import Enum +from functools import lru_cache +from typing import Any, Type + +import torch + +from nemo.collections.common.tokenizers import AggregateTokenizer, TokenizerSpec + +PREAMBLE_ROLE = "preamble" + +# Slots used to define when special tokens bos/eos should be inserted. +# These are special in the sense of how sentencepiece defines special tokens: +# They have to be specially inserted into the token sequence, and if they appear in the tokenized string, +# SPE wouldn't use the special token ids but rather tokenize them as if they were normal strings. +# We mimic SPE's behavior if these special slots are present in the template definition. +# To achieve that, insert |bos| / |eos| at the beginning/end of template. +# E.g., inserting only bos in llama2 user role: "template": "|bos|[INST] |message| [\INST]" +BOS_SLOT = "|bos|" +EOS_SLOT = "|eos|" + + +class Modality(Enum): + """ + Modalities supported as PromptFormatter slot values. + """ + + Text = "text" + + def matches(self, value: Any) -> bool: + """ + Checks if the provided value is compatible with an instance of Modality. + """ + match self: + case Modality.Text: + return isinstance(value, str) + case _: + return False + + +class PromptFormatter(ABC): + """ + :class:`~nemo.collections.common.prompts.formatter.PromptFormatter` is intended to simplify + working with various prompt format templates and encoding them into token ID tensors. + + It assumes a dialog-like structure, which is a list of turns, with each turn assigned to a role. + Sub-classes of PromptFormatter define turn templates for each role under TEMPLATE class attribute. + Each template may define some constant parts (e.g. begin-of-turn or end-of-turn tokens, whitespaces, etc.) + and variable parts which we call "slots", that will be provided by the user during training or inference. + + A role is typically "user" and "assistant", and some popular models also use a "system" role. + Other roles may be defined as well. We expect the role corresponding to the model's responses + will be registered under class attribute called OUTPUT_ROLE. + We reserve a special "preamble" role with no slots that will be inserted at the beginning of + the formatted prompt, if "preamble" is present in TEMPLATE. + + A turn is a dict with keys "role" and "slots", where "slots" are a dict that maps slot names + to values that should be filled in the template. + For example, a user role template may be ``"Question: |message|"`` and corresponding ``slots`` would then be + ``{"message": "What time is it?"}``. + + There is a special slot called ``|prompt_language|`` that's used to select the sub-tokenizer in + :class:`~nemo.collections.common.tokenizers.aggregate_tokenizer.AggregateTokenizer`. + It's only used when the tokenizer is aggregate; otherwise it's discarded. + + PromptFormatter supports constructing prompts for training (complete context and answers) + and for inference (context-only). 
+ Training/inference is determined automatically; if the last role in a dialog is the OUTPUT_ROLE, + that's an 'asked-and-answered' scenario, so we assume it's inteded for training. + We'll create a dict with tokenized results available under the following keys: + + * ``context_ids`` (all turns minus last one), + * ``answer_ids`` (last turn) + * ``input_ids`` (previous two values concatenated) + * ``mask`` (boolean mask tensor of the same lenth as ``input_ids`` that's set to True on OUTPUT_ROLE turns) + + Typically, the user will use the ``encode_dialog`` method providing a list of turns to it. + Example showing how to construct model inputs/outputs for training:: + + >>> formatter = PromptFormatter(tokenizer) + ... encoded_for_training = formatter.encode_dialog( + ... turns=[ + ... {"role": "user", "slots": {"message": "What time is it?"}}, + ... {"role": "assistant", "slots": {"message": "Ten o'clock."}}, + ... {"role": "user", "slots": {"message": "PM or AM?"}}, + ... {"role": "assistant", "slots": {"message": "AM, naturally! It's bright outside"}}, + ... ] + ... ) + + Another example that shows how to use the same method to generate prompts for inference:: + + + >>> formatter = PromptFormatter(tokenizer) + ... encoded_for_training = formatter.encode_dialog( + ... turns=[ + ... {"role": "user", "slots": {"message": "What time is it?"}}, + ... {"role": "assistant", "slots": {"message": "Ten o'clock."}}, + ... {"role": "user", "slots": {"message": "PM or AM?"}}, + ... ] + ... ) + + """ + + # Used to support AggregateTokenizer; this key selects the right sub-tokenizer for each turn. + PROMPT_LANGUAGE_SLOT = "prompt_language" + + # Subclasses will be registered under this name, to be used via PromptFormatter.resolve(name). + NAME = None + + # Template is a dict that maps: + # * from a role name string (system/user/assistant/etc) + # * to a dict with keys + # * "template" that has a string value (the prompt template) + # * "slots" that has a value of dict[str, Modality] + # * keys of slots are the names of formattable slots in the prompt template + # * values of slots are :class:`Modality` objects that can be used to check + # whether a specific value conforms to a given modality requirements + # (e.g., Modality.Text may expect string objects). + # Template is intended to be defined by the child classes. + TEMPLATE = None + + # Turns under this role indicate responses by the model; if the last turn in + # PromptFormatter.encode_dialog() ends with this role, it indicates a training example. + OUTPUT_ROLE = None + + # Internal reserved field. + _REGISTERED_FORMATTERS = {} + + def __init__(self, tokenizer: TokenizerSpec, defaults: list[dict] | None = None) -> None: + self.tokenizer = tokenizer + self._defaults = defaults if defaults is not None else [] + self._validate_defaults() + + def __init_subclass__(cls, **kwargs) -> None: + ERR = "PromptFormatter subclass definition error:" + if cls.__name__ not in cls._REGISTERED_FORMATTERS: + for attr in ("NAME", "TEMPLATE", "OUTPUT_ROLE"): + assert ( + getattr(cls, attr, None) is not None + ), f"{ERR} PromptFormatter subclass {cls} did not define a class attribute {attr}" + assert cls.NAME not in cls._REGISTERED_FORMATTERS, ( + f"Cannot register {cls.__name__} under {cls.NAME}: another prompt formatter of type " + f"{cls._REGISTERED_FORMATTERS[cls.NAME]} has already been registered under this name." 
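# For illustration, a subclass picked up by this registration hook could look like
# the sketch below (the class name and NAME value are hypothetical):
#
#     class MyPromptFormatter(PromptFormatter):
#         NAME = "my_format"
#         OUTPUT_ROLE = "assistant"
#         TEMPLATE = {
#             "user": {"template": "Q: |message|\n", "slots": {"message": Modality.Text}},
#             "assistant": {"template": "A: |message|\n", "slots": {"message": Modality.Text}},
#         }
#
# after which ``PromptFormatter.resolve("my_format")`` returns ``MyPromptFormatter``.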
+ ) + cls._REGISTERED_FORMATTERS[cls.NAME] = cls + if "preamble" in cls.TEMPLATE: + assert ( + len(cls.TEMPLATE["preamble"].get("slots", [])) == 0 + ), f"{ERR} Slots are not allowed for preamble template, but we found: '{cls.TEMPLATE['preamble']}'" + for role in cls.get_roles(): + template = cls.get_template(role) + for slot in cls.get_slots(role): + assert ( + _mangled(slot) in template + ), f"{ERR} Slot '{slot}' not found in template '{template}' for role '{role}'" + super().__init_subclass__(**kwargs) + + @classmethod + def resolve(cls, name: str) -> Type["PromptFormatter"]: + if name not in cls._REGISTERED_FORMATTERS: + raise RuntimeError( + f"Unknown prompt formatter: '{name}' (known formats: {', '.join(cls._REGISTERED_FORMATTERS.keys())})" + ) + return cls._REGISTERED_FORMATTERS[name] + + @classmethod + @lru_cache(1) + def get_roles(cls) -> list[str]: + return list(cls.TEMPLATE.keys()) + + @classmethod + def get_slots(cls, role: str) -> dict[str, Modality]: + # returns a copy to avoid accidential mutation of a global object by the user + return cls.TEMPLATE[role].get("slots", {}).copy() + + @classmethod + def get_template(cls, role: str) -> str: + return cls.TEMPLATE[role]["template"] + + def get_default_dialog_slots(self) -> list[dict]: + """ + Returns a list of dialog turns that can be used as a skeleton to fill with actual slot values. + If ``PromptFormatter`` was initialized with ``defaults`` argument, this method will return the + defaults. Otherwise, every slot is pre-filled with ``None``. + """ + + def _get_default_for_role(role: str) -> dict: + for turn in self._defaults: + if turn["role"] == role: + return turn + return {} + + return [ + { + "role": role, + "slots": { + slot: _get_default_for_role(role).get("slots", {}).get(slot) for slot in self.get_slots(role) + }, + } + for role in self.get_roles() + if role != self.OUTPUT_ROLE + ] + + def encode_turn( + self, prompt_template: str, expected_slots: dict[str, Modality], slot_values: dict[str, Any] + ) -> list[int]: + prompt = prompt_template + for slot in expected_slots: + # For the final substitution of 'slot' in the template we have to mangle it to '|slot|' anyway, + # but 'slot' form enables to use valid python identifiers as **kwargs + # for passing slots around in user functions. + value = slot_values.get(slot) + assert value is not None, f"Missing required {slot=} in {slot_values=} for {prompt_template=}" + prompt = prompt.replace(_mangled(slot), value) + return self._apply_tokenizer(prompt, lang=slot_values.get(self.PROMPT_LANGUAGE_SLOT)) + + def encode_dialog(self, turns: list[dict]) -> dict[str, torch.Tensor]: + assert len(turns) > 0, "Empty dialog is not supported." + roles = self.get_roles() + + turn_tokens = [] + turn_token_counts = [] + turn_mask_values = [] + + if "preamble" in self.TEMPLATE: + preamble_turns = [idx for idx, t in enumerate(turns) if t["role"] == "preamble"] + if not preamble_turns: + turns = [{"role": "preamble", **self.TEMPLATE["preamble"]}] + turns + else: + assert ( + len(preamble_turns) == 1 and preamble_turns[0] == 0 + ), f"Preamble can only be presented at turn 0, but we found preamble turns at indexes {preamble_turns}." + + for turn in turns: + assert "role" in turn, f"A turn must have have a 'role' key. 
We received {turn=}" + role = turn["role"] + assert role in roles, f"Found turn with {role=}, but availables roles are {roles}" + expected_slots = self.get_slots(role) + slot_values = turn.get("slots", {}) + if expected_slots: + assert ( + slot_values + ), f"A turn for role {role} must have have a non-empty value under 'slots' key. We received {turn=}" + self._validate_slot_values(expected_slots, slot_values) + template = self.get_template(role) + tokens = self.encode_turn(template, expected_slots, slot_values) + turn_tokens.extend(tokens) + turn_token_counts.append(len(tokens)) + turn_mask_values.append(role == self.OUTPUT_ROLE) + + ans = {"input_ids": torch.tensor(turn_tokens, dtype=torch.long)} + if turn_mask_values[-1]: + # The last turn comes from OUTPUT_ROLE, i.e. it's a response from the system. + # This indicates it's a training example for which we provide context/answer/mask. + ans["context_ids"] = ans["input_ids"][: -turn_token_counts[-1]] + ans["answer_ids"] = ans["input_ids"][-turn_token_counts[-1] :] + ans["mask"] = torch.tensor( + [ + turn_mask_values[turn_idx] + for turn_idx, turn_len in enumerate(turn_token_counts) + for _ in range(turn_len) + ], + dtype=torch.bool, + ) + else: + ans["context_ids"] = ans["input_ids"] # context == input for inference + return ans + + def _apply_tokenizer(self, text: str, lang: str | None = None) -> list[int]: + # Check if the tokenizer is aggregate and perform extra checks. + is_agg = isinstance(self.tokenizer, AggregateTokenizer) + if is_agg: + assert lang is not None, ( + f"Missing key '{self.PROMPT_LANGUAGE_SLOT}' in slot_values -- cannot resolve " + f"the correct sub-tokenizer in the aggregate tokenizer." + ) + + # Strip bos/eos if present and remember to apply them later. + has_bos = text.startswith(BOS_SLOT) + has_eos = text.endswith(EOS_SLOT) + if has_bos: + text = text[len(BOS_SLOT) :] + if has_eos: + text = text[: -len(EOS_SLOT)] + + # Tokenize, selecting the right API depending on aggregate/normal tokenizer. + if is_agg: + tokens = self.tokenizer.text_to_ids(text, lang) + else: + tokens = self.tokenizer.text_to_ids(text) + + # Lazily look up bos/eos and apply them. Lazy has the advantage that if a tokenizer + # doesn't define bos/eos and the prompt format does not request them, everything just works. + if has_eos: + eos_id = self.tokenizer.get_eos(lang) if is_agg else self.tokenizer.eos + tokens.append(eos_id) + if has_bos: + bos_id = self.tokenizer.get_bos(lang) if is_agg else self.tokenizer.bos + tokens = [bos_id] + tokens + + return tokens + + def _validate_slot_values(self, expected: dict[str, Modality], received: dict[str, Any]) -> None: + missing = set(expected) - set(received) + assert not missing, f"The following slot values were not provided: {missing}" + for slot in expected: + expected_modality = expected[slot] + value = received[slot] + assert expected_modality.matches( + value + ), f"{slot=} received {value=} which does not match modality {expected_modality}" + + def _validate_defaults(self): + if not self._defaults: + return + + err = "Error in default prompt definition:" + assert isinstance(self._defaults, list) + for turn in self._defaults: + assert isinstance(turn, dict) + assert "role" in turn, f"{err} Missing required 'role' key. We received {turn=}" + role = turn["role"] + assert role in self.get_roles(), ( + f"{err} Invalid {role=} in {turn=} - " f"supported roles are: {self.get_roles()}." 
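# For illustration, a ``defaults`` value that passes these checks for the Canary
# template could be (the slot values here are example choices, not requirements):
#
#     defaults = [
#         {"role": "user",
#          "slots": {"source_lang": "en", "target_lang": "en", "task": "asr", "pnc": "yes"}},
#     ]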
+ ) + if expected_slots := self.get_slots(role): + assert "slots" in turn, ( + f"{err} Missing required 'slots' key in {turn=} - " + f"we expected the following slots to be provided: {expected_slots}." + ) + for slot in turn["slots"]: + assert slot in expected_slots, ( + f"{err} Invalid {slot=} in {turn=}. " + f"The following slots are supported for {role=}: {expected_slots}" + ) + + +def _mangled(slot: str) -> str: + if not (slot[0] == "|" and slot[-1] == "|"): + return f"|{slot}|" + return slot + + +def _unmangled(slot: str) -> str: + if slot[0] == "|" and slot[-1] == "|": + return slot[1:-1] + return slot diff --git a/nemo/collections/common/prompts/gemma.py b/nemo/collections/common/prompts/gemma.py new file mode 100644 index 000000000000..e3b81c848a3e --- /dev/null +++ b/nemo/collections/common/prompts/gemma.py @@ -0,0 +1,29 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/gemma#gemma-7b-prompt-format +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + +GEMMA_BOS = "" +GEMMA_END_OF_TURN = "" +GEMMA_NL = "\n\n" + + +class GemmaPromptFormatter(PromptFormatter): + NAME = "gemma" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"{GEMMA_BOS}user\n|message|{GEMMA_END_OF_TURN}\n{GEMMA_BOS}model\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + # Note: that trailing NL is bothering me. + "template": f"|message|{GEMMA_END_OF_TURN}\n", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/llama.py b/nemo/collections/common/prompts/llama.py new file mode 100644 index 000000000000..fdaccfaa846e --- /dev/null +++ b/nemo/collections/common/prompts/llama.py @@ -0,0 +1,72 @@ +from nemo.collections.common.prompts.formatter import BOS_SLOT, EOS_SLOT, Modality, PromptFormatter + + +class Llama2PromptFormatter(PromptFormatter): + """ + This template has been validated to provide identical tokenized results to the official code + in https://github.com/meta-llama/llama/blob/main/llama/generation.py + """ + + NAME = "llama2" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "system_and_user": { + "template": f"{BOS_SLOT}[INST] <>\n|system|\n<>\n\n|message| [/INST]", + "slots": { + "system": Modality.Text, + "message": Modality.Text, + }, + }, + "user": { + "template": "|bos|[INST] |message| [/INST]", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message| {EOS_SLOT}", + "slots": { + "message": Modality.Text, + }, + }, + } + + +LLAMA3_BOS = "<|begin_of_text|>" +LLAMA3_HEADER_BEGIN = "<|start_header_id|>" +LLAMA3_HEADER_END = "<|end_header_id|>" +LLAMA3_END_OF_TURN = "<|eot_id|>" +LLAMA3_NL = "\n\n" + + +class Llama3PromptFormatter(PromptFormatter): + """ + Implemented following the code at: + https://github.com/meta-llama/llama3/blob/main/llama/test_tokenizer.py#L56 + """ + + NAME = "llama3" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "preamble": { + "template": LLAMA3_BOS, + }, + "system": { + "template": f"{LLAMA3_HEADER_BEGIN}system{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + "user": { + "template": f"{LLAMA3_HEADER_BEGIN}user{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"{LLAMA3_HEADER_BEGIN}assistant{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git 
a/nemo/collections/common/prompts/mistral.py b/nemo/collections/common/prompts/mistral.py new file mode 100644 index 000000000000..e882ac5973b1 --- /dev/null +++ b/nemo/collections/common/prompts/mistral.py @@ -0,0 +1,33 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/mistral-7b#chat-template-for-mistral-7b-instruct +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + +MISTRAL_BOS = "" +MISTRAL_PROMPT_BEGIN = "[INST]" +MISTRAL_PROMPT_END = "[/INST]" +MISTRAL_END_OF_TURN = "" +MISTRAL_NL = "\n\n" + + +class MistralPromptFormatter(PromptFormatter): + NAME = "mistral" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "preamble": { + "template": MISTRAL_BOS, + }, + "user": { + "template": f"{MISTRAL_PROMPT_BEGIN} |message| {MISTRAL_PROMPT_END} ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|{MISTRAL_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/phi2.py b/nemo/collections/common/prompts/phi2.py new file mode 100644 index 000000000000..67dad8d5dd82 --- /dev/null +++ b/nemo/collections/common/prompts/phi2.py @@ -0,0 +1,62 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/phi-2#phi-2-usage +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + + +class Phi2QAPromptFormatter(PromptFormatter): + NAME = "phi2_qa" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"Instruct: |message|\nOutput: ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } + + +class Phi2ChatPromptFormatter(PromptFormatter): + NAME = "phi2_chat" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"Human: |message|\nAI: ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } + + +class Phi2CodePromptFormatter(PromptFormatter): + NAME = "phi2_code" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"|message|\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/tokenizers/aggregate_tokenizer.py b/nemo/collections/common/tokenizers/aggregate_tokenizer.py index 9c003c37525a..66ec28ebda4d 100644 --- a/nemo/collections/common/tokenizers/aggregate_tokenizer.py +++ b/nemo/collections/common/tokenizers/aggregate_tokenizer.py @@ -15,6 +15,7 @@ from typing import Dict, List, Union import numpy as np +import torch from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.utils import logging @@ -124,7 +125,7 @@ def tokens_to_text(self, tokens, lang_id): return tokenizer.decode_pieces(tokens) def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): + if isinstance(ids, (np.ndarray, torch.Tensor)): ids = ids.tolist() tokens = [] @@ -224,6 +225,12 @@ def tokens_to_ids(self, tokens: Union[str, List[str]], langs: Union[str, List[st ids.append(self.token_to_id(token, lang_id)) return ids + def get_bos(self, lang_id: str) -> int: + return self.tokenizers_dict[lang_id].bos + self.token_id_offset[lang_id] + + def get_eos(self, lang_id: str) -> int: + return self.tokenizers_dict[lang_id].eos + self.token_id_offset[lang_id] + @property def vocab(self): return self.vocabulary diff --git 
a/nemo/collections/common/tokenizers/canary_tokenizer.py b/nemo/collections/common/tokenizers/canary_tokenizer.py index aed95c1f9312..6adcdd8cf734 100644 --- a/nemo/collections/common/tokenizers/canary_tokenizer.py +++ b/nemo/collections/common/tokenizers/canary_tokenizer.py @@ -24,7 +24,15 @@ __all__ = ['CanaryTokenizer'] # Default tokens for compatibility with Canary. -DEFAULT_TOKENS = ["<|nospeech|>", "", "<|endoftext|>", "<|startoftranscript|>", "<|pnc|>", "<|nopnc|>"] +CANARY_BOS = "<|startoftranscript|>" +CANARY_EOS = "<|endoftext|>" +CANARY_PAD = "" +CANARY_NOSPEECH = "<|nospeech|>" +CANARY_PNC = "<|pnc|>" +CANARY_NOPNC = "<|nopnc|>" +DEFAULT_TOKENS = [CANARY_NOSPEECH, CANARY_PAD, CANARY_EOS, CANARY_BOS, CANARY_PNC, CANARY_NOPNC] + +CANARY_SPECIAL_TOKENIZER = "spl_tokens" class CanaryTokenizer(AggregateTokenizer): @@ -37,26 +45,51 @@ def __init__(self, tokenizers: Dict): # for easy access of special tokens self.special_tokens = {} - for special in tokenizers['spl_tokens'].vocab: + for special in tokenizers[CANARY_SPECIAL_TOKENIZER].vocab: # Search for special prompting tokens - if (special.startswith("<|") and special.endswith("|>")) or special == "": - self.special_tokens[special] = self.token_to_id(special, lang_id='spl_tokens') + if (special.startswith("<|") and special.endswith("|>")) or special == CANARY_PAD: + self.special_tokens[special] = self.token_to_id(special, lang_id=CANARY_SPECIAL_TOKENIZER) @cached_property def eos_id(self) -> int: - return self.special_tokens["<|endoftext|>"] + return self.special_tokens[CANARY_EOS] @cached_property def bos_id(self) -> int: - return self.special_tokens["<|startoftranscript|>"] + return self.special_tokens[CANARY_BOS] @cached_property def nospeech_id(self) -> int: - return self.special_tokens["<|nospeech|>"] + return self.special_tokens[CANARY_NOSPEECH] @cached_property def pad_id(self) -> int: - return self.special_tokens[""] + return self.special_tokens[CANARY_PAD] + + def text_to_ids(self, text, lang_id) -> list[int]: + if lang_id == CANARY_SPECIAL_TOKENIZER: + return self._tokenize_special_prompt(text) + if text.endswith(CANARY_EOS): + return super().text_to_ids(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id] + return super().text_to_ids(text[-len(CANARY_EOS) :], lang_id) + + def _tokenize_special_prompt(self, text: str) -> list[int]: + """ + Tokenize the input special prompt of the following schema: + + <|startoftranscript|><|source_lang|><|taskname|><|target_lang|><|pnc|> + + Required because otherwise self.text_to_ids() returns a different result than what Canary had been trained with. + """ + ans = [] + assert text.count('>') == 5, f"Expected exactly 5 special tokens in Canary's prompt, got: {text}." 
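        # For example, a training-time prompt such as
        #     "<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>"
        # is peeled apart one "<|...|>" token at a time by the loop below and mapped
        # through self.special_tokens, yielding exactly five IDs (the concrete values
        # depend on the special-token vocabulary).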
+ assert text.startswith(CANARY_BOS), text + for _ in range(5): + token = text[: text.find(">") + 1] + ans.append(self.special_tokens[token]) + text = text[len(token) :] + assert len(text) == 0, text + return ans def spl_token_to_id(self, token): if token_id := self.special_tokens.get(f"<|{token}|>", None): diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index aed05673f6fa..4a47f0e49b1e 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -17,6 +17,7 @@ import numpy as np import sentencepiece +import torch from nemo.collections.common.parts.utils import if_exist from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -127,7 +128,7 @@ def tokens_to_text(self, tokens): return self.tokenizer.decode_pieces(tokens) def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): + if isinstance(ids, (np.ndarray, torch.Tensor)): ids = ids.tolist() if self.legacy: diff --git a/tests/collections/__init__.py b/tests/collections/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py index d250fbcf74a1..986df09deacb 100644 --- a/tests/collections/asr/test_asr_multitask_model_bpe.py +++ b/tests/collections/asr/test_asr_multitask_model_bpe.py @@ -80,9 +80,18 @@ def asr_model(test_data_dir): 'dir': None, 'type': 'agg', 'langs': { - 'spl_tokens': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "canary"), 'type': 'bpe',}, - 'en': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe',}, - 'de': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe',}, + 'spl_tokens': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "canary"), + 'type': 'bpe', + }, + 'en': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), + 'type': 'wpe', + }, + 'de': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), + 'type': 'wpe', + }, }, 'custom_tokenizer': { '_target_': 'nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer', @@ -98,6 +107,9 @@ def asr_model(test_data_dir): modelConfig = DictConfig( { 'prompt_format': 'canary', + 'prompt_defaults': [ + {"role": "user", "slots": {"source_lang": "en", "target_lang": "en", "task": "asr", "pnc": "yes"}} + ], 'sample_rate': 16000, 'preprocessor': DictConfig(preprocessor), 'model_defaults': DictConfig(model_defaults), @@ -304,10 +316,9 @@ def test_transcribe_tensor(self, asr_model, test_data_dir): audio, sr = sf.read(audio_file, dtype='float32') # Numpy array test - with pytest.raises(NotImplementedError): - outputs = asr_model.transcribe(audio, batch_size=1) - # assert len(outputs) == 1 - # assert isinstance(outputs[0], str) + outputs = asr_model.transcribe(audio, batch_size=1) + assert len(outputs) == 1 + assert isinstance(outputs[0], str) @pytest.mark.unit def test_build_tokenizer(self, asr_model, test_data_dir): diff --git a/tests/collections/asr/test_custom_tokenizer.py b/tests/collections/asr/test_custom_tokenizer.py index 5a033045b709..61692061661f 100644 --- a/tests/collections/asr/test_custom_tokenizer.py +++ b/tests/collections/asr/test_custom_tokenizer.py @@ -67,7 +67,9 @@ class DummyModel(ASRBPEMixin, Serialization): "spl_tokens": {"dir": special_tokenizer_path, "type": "bpe"}, "en": {"dir": 
lang_tokenizer_path, "type": "bpe"}, }, - "custom_tokenizer": {"_target_": "nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer",}, + "custom_tokenizer": { + "_target_": "nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer", + }, } ) model._setup_aggregate_tokenizer(config) @@ -83,5 +85,11 @@ class DummyModel(ASRBPEMixin, Serialization): assert isinstance(tokenizer.tokenizers_dict["en"], SentencePieceTokenizer) assert tokenizer.tokenizers_dict["en"].vocab_size == 6 - assert tokenizer.text_to_ids("<|startoftranscript|>", lang_id="spl_tokens") == [13, 4] # "_" comes first + assert tokenizer.text_to_ids("<|startoftranscript|><|en|><|asr|><|en|><|pnc|>", lang_id="spl_tokens") == [ + 4, + 9, + 7, + 9, + 5, + ] assert tokenizer.text_to_ids("a", lang_id="en") == [14 + 1, 14 + 2] diff --git a/tests/collections/common/prompt_formatters/conftest.py b/tests/collections/common/prompt_formatters/conftest.py new file mode 100644 index 000000000000..e18f1072af24 --- /dev/null +++ b/tests/collections/common/prompt_formatters/conftest.py @@ -0,0 +1,51 @@ +import pytest + +from nemo.collections.common.tokenizers import CanaryTokenizer, SentencePieceTokenizer +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model + +# Note: We don't really define special tokens for this test so every 'special token' +# will be represented as a number of regular tokens. +TOKENIZER_TRAIN_TEXT = """ +Example system message. +Example user message. +Example assistant message. +TEST +[INST] +[/INST] + + +<> +<> +User: Assistant: +user model +Instruct Output +\n\n + +<| +|> +<|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|> +Feel free to add new tokens for your own tests!? +But know that if you do so, you may need to update the token IDs in the existing tests! +So, it might be a good idea to create a new tokenizer instead when adding new prompt formats. 
+""" + + +@pytest.fixture(scope="session") +def bpe_tokenizer(tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("bpe_tokenizer") + text_path = tmpdir / "text.txt" + text_path.write_text(TOKENIZER_TRAIN_TEXT) + create_spt_model(str(text_path), vocab_size=512, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir)) + return SentencePieceTokenizer(str(tmpdir / "tokenizer.model")) + + +@pytest.fixture(scope="session") +def canary_tokenizer(bpe_tokenizer, tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("spl_tokens") + spl_tokens = CanaryTokenizer.build_special_tokenizer(["transcribe", "en"], tmpdir) + return CanaryTokenizer( + tokenizers={ + "spl_tokens": spl_tokens, + "en": bpe_tokenizer, + } + ) diff --git a/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py new file mode 100644 index 000000000000..ff786766b246 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py @@ -0,0 +1,50 @@ +from nemo.collections.common.prompts.canary import CanaryPromptFormatter + + +def test_canary_prompt_formatter_training(canary_tokenizer): + formatter = CanaryPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "source_lang": "<|en|>", + "target_lang": "<|en|>", + "task": "<|transcribe|>", + "pnc": "<|pnc|>", + "prompt_language": "spl_tokens", + }, + }, + {"role": "assistant", "slots": {"text": "TEST", "prompt_language": "en"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [4, 8, 7, 8, 5, 11, 91, 30, 40, 3] + assert ans["context_ids"].tolist() == [4, 8, 7, 8, 5] + assert ans["answer_ids"].tolist() == [11, 91, 30, 40, 3] + assert ans["mask"].tolist() == [False] * 5 + [True] * 5 + # fmt: on + + +def test_canary_prompt_formatter_inference(canary_tokenizer): + formatter = CanaryPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "source_lang": "<|en|>", + "target_lang": "<|en|>", + "task": "<|transcribe|>", + "pnc": "<|pnc|>", + "prompt_language": "spl_tokens", + }, + }, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [4, 8, 7, 8, 5] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py new file mode 100644 index 000000000000..be1f6de1a873 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py @@ -0,0 +1,40 @@ +from nemo.collections.common.prompts.gemma import GemmaPromptFormatter + + +def test_gemma_prompt_formatter_training(bpe_tokenizer): + formatter = GemmaPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 26, + 18, 6, 60, 9, 7, 73, 61, 69, 1, 81, 20, 30, 104, 59, + 18, 26, 18, 6, 60, 9, 7] + assert ans["context_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 
26, + 18, 6, 60, 9, 7, 73, 61, 69] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, 104, 59, + 18, 26, 18, 6, 60, 9, 7] + assert ans["mask"].tolist() == [False] * 36 + [True] * 13 + # fmt: on + + +def test_gemma_prompt_formatter_inference(bpe_tokenizer): + formatter = GemmaPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 26, + 18, 6, 60, 9, 7, 73, 61, 69] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py new file mode 100644 index 000000000000..9636dd31c768 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py @@ -0,0 +1,63 @@ +from nemo.collections.common.prompts.llama import Llama2PromptFormatter + + +def test_llama2_prompt_formatter_training(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, -1] + assert ans["context_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, -1] + assert ans["mask"].tolist() == [False] * 16 + [True] * 5 + # fmt: on + + +def test_llama2_prompt_formatter_inference(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on + + +def test_llama2_prompt_formatter_training_with_system(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "system_and_user", "slots": {"system": "TEST", "message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, -1] + assert ans["context_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, -1] + assert ans["mask"].tolist() == [False] * 33 + [True] * 5 + # fmt: on + + +def test_llama2_prompt_formatter_inference_with_system(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "system_and_user", "slots": {"system": "TEST", "message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert 
ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py new file mode 100644 index 000000000000..edc00d426952 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py @@ -0,0 +1,32 @@ +from nemo.collections.common.prompts.mistral import MistralPromptFormatter + + +def test_mistral_prompt_formatter_training(bpe_tokenizer): + formatter = MistralPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, 66, 8, 7] + assert ans["context_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, 66, 8, 7] + assert ans["mask"].tolist() == [False] * 18 + [True] * 7 + # fmt: on + + +def test_mistral_prompt_formatter_inference(bpe_tokenizer): + formatter = MistralPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py b/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py new file mode 100644 index 000000000000..26ade7da1415 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py @@ -0,0 +1,147 @@ +import pytest + +from nemo.collections.common.prompts.canary import PromptFormatter +from nemo.collections.common.prompts.formatter import Modality + + +class _DummyPromptFormatter(PromptFormatter): + NAME = "_dummy_test_formatter" + TEMPLATE = { + "user": {"template": "|text|", "slots": {"text": Modality.Text}}, + "assistant": {"template": "|text|", "slots": {"text": Modality.Text}}, + } + OUTPUT_ROLE = "assistant" + + +def test_prompt_formatter_empty_dialog_exception(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError): + formatter.encode_dialog([]) + + +def test_prompt_formatter_inference(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "hi" + + +def test_prompt_formatter_training(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "assistant", "slots": {"text": "hello"}}, + ] + ) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "hi hello", recovered + + +def test_prompt_formatter_missing_role(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError, match="A turn must have have a 'role' key"): + formatter.encode_dialog([{"slots": 
{"text": "hi"}}]) + + +def test_prompt_formatter_missing_slots(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises( + AssertionError, match="A turn for role user must have have a non-empty value under 'slots' key" + ): + formatter.encode_dialog([{"role": "user"}]) + with pytest.raises( + AssertionError, match="A turn for role user must have have a non-empty value under 'slots' key" + ): + formatter.encode_dialog([{"role": "user", "slots": {}}]) + + +def test_prompt_formatter_aggregate_tokenizer(canary_tokenizer): + # Note the 'canary_tokenizer' arg which is an aggregate tokenizer fixture. + formatter = _DummyPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "text": "hi", + "prompt_language": "en", + }, + } + ] + ) + recovered = canary_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == " hi" + + +def test_prompt_formatter_aggregate_tokenizer_missing_prompt_language(canary_tokenizer): + # Note the 'canary_tokenizer' arg which is an aggregate tokenizer fixture. + formatter = _DummyPromptFormatter(canary_tokenizer) + + with pytest.raises(AssertionError, match="Missing key 'prompt_language' in slot_values"): + formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + + +class _DummyPreamblePromptFormatter(PromptFormatter): + NAME = "_dummy_preamble_test_formatter" + TEMPLATE = { + "preamble": {"template": "TEST"}, + "user": {"template": "|text|", "slots": {"text": Modality.Text}}, + "assistant": {"template": "|text|", "slots": {"text": Modality.Text}}, + } + OUTPUT_ROLE = "assistant" + + +def test_prompt_formatter_preamble_inference(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi", recovered + + +def test_prompt_formatter_premble_training(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "assistant", "slots": {"text": "hello"}}, + ] + ) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi hello" + + +def test_prompt_formatter_explicit_preamble(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "preamble"}, {"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi" + + +def test_prompt_formatter_wrong_preamble_excpetions(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError): + # Error: 2 preambles + formatter.encode_dialog( + [ + {"role": "preamble"}, + {"role": "preamble"}, + {"role": "user", "slots": {"text": "hi"}}, + ] + ) + with pytest.raises(AssertionError): + # Error: preamble not at the beginning + formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "preamble"}, + ] + ) + with pytest.raises(AssertionError): + # Error: preamble with slots + formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "preamble", "slots": {"abc": "abc"}}, + ] + ) From b0f3138a6be7fab3175deb8935f8492aeb1445bd Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Sat, 1 Jun 2024 09:41:45 -0700 Subject: [PATCH 43/47] support null/None truncation field (#9355) * support null/None truncation field 
Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * Fix truncation when truncation field is empty Signed-off-by: Cheng-Ping Hsieh * Fix final truncation if truncation_field is not enough Signed-off-by: Cheng-Ping Hsieh * Apply isort and black reformatting Signed-off-by: hsiehjackson * Fix Signed-off-by: Cheng-Ping Hsieh --------- Signed-off-by: arendu Signed-off-by: arendu Signed-off-by: Cheng-Ping Hsieh Signed-off-by: hsiehjackson Co-authored-by: arendu Co-authored-by: Cheng-Ping Hsieh Co-authored-by: hsiehjackson --- .../megatron/gpt_sft_dataset.py | 113 +++++++++++------- 1 file changed, 72 insertions(+), 41 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index faaa10606aa0..e16543a7568d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -101,7 +101,7 @@ def __init__( self.seed = seed self.label_key = label_key self.answer_only_loss = answer_only_loss - self.truncation_fields = truncation_field.split(',') + self.truncation_fields = truncation_field.split(',') if truncation_field is not None else [] self.pad_to_max_length = pad_to_max_length self.index_mapping_dir = index_mapping_dir self.prompt_template = prompt_template @@ -166,8 +166,9 @@ def _maybe_validate_prompt_template(self): ), f'{label_placeholder} must be at the end of prompt_template.' # Legacy checkpoints has self.truncation_fields = ['context'] and self.prompt_template_keys = ['input', 'output'] - if self.prompt_template_keys[0] == 'input' and self.truncation_fields[0] == 'context': - self.truncation_fields[0] = self.prompt_template_keys[0] + if len(self.truncation_fields) > 0: + if self.prompt_template_keys[0] == 'input' and self.truncation_fields[0] == 'context': + self.truncation_fields[0] = self.prompt_template_keys[0] assert set(self.truncation_fields).issubset( self.prompt_template_keys @@ -305,32 +306,61 @@ def _multiple_truncation(self, template_ids: List[List[int]], template_ids_keys: if total_ids > self.max_seq_length: truncation_length_total = total_ids - self.max_seq_length num_fields = len(self.truncation_fields) - # sorted equal divide length to each field - # examples: - # truncation_length_total = 3 - # num_fields = 11 - # truncation_length_list = [3,4,4] - truncation_length_list = [ - truncation_length_total // num_fields + (1 if i < truncation_length_total % num_fields else 0) - for i in range(num_fields)[::-1] - ] - - for i, (ids, key) in enumerate(zip(template_ids, template_ids_keys)): - if key in self.truncation_fields: - truncation_length = truncation_length_list.pop() - if len(ids) < truncation_length: - logging.warning(f'{key} is not long enough to truncate.') - truncation_length = len(ids) - - if self.truncation_method == 'left': - window_offset = truncation_length - elif self.truncation_method == 'right': - window_offset = 0 - else: - raise ValueError(f'{self.truncation_method} is not supported') + if num_fields > 0: + # sorted equal divide length to each field + # examples: + # truncation_length_total = 3 + # num_fields = 11 + # truncation_length_list = [3,4,4] + truncation_length_list = [ + truncation_length_total // num_fields + (1 if i < truncation_length_total % num_fields else 0) + for i in range(num_fields)[::-1] + ] - window_length = len(ids) - truncation_length - template_ids[i] = ids[window_offset : window_offset + 
window_length] + for i, (ids, key) in enumerate(zip(template_ids, template_ids_keys)): + if key in self.truncation_fields: + truncation_length = truncation_length_list.pop() + if len(ids) < truncation_length: + logging.warning(f'{key} is not long enough to truncate.') + truncation_length = len(ids) + + if self.truncation_method == 'left': + window_offset = truncation_length + elif self.truncation_method == 'right': + window_offset = 0 + else: + raise ValueError(f'{self.truncation_method} is not supported') + + window_length = len(ids) - truncation_length + template_ids[i] = ids[window_offset : window_offset + window_length] + else: + # If truncation_field is empty, we truncate template_ids (List[List[int]]) to make total ids < self.max_seq_length. + logging.warning( + f'`truncation_field` is empty, we truncate input from {self.truncation_method} based on truncation_method.' + ) + template_ids_lengths = [len(ids) for ids in template_ids] + if self.truncation_method == 'left': + iters = range(0, len(template_ids_lengths), 1) + elif self.truncation_method == 'right': + iters = range(len(template_ids_lengths) - 1, -1, -1) + else: + raise ValueError(f'{self.truncation_method} is not supported') + + # Iterate all lengths of template_ids. + for i in iters: + if template_ids_lengths[i] >= truncation_length_total: + template_ids_lengths[i] -= truncation_length_total + if self.truncation_method == 'left': + template_ids[i] = template_ids[i][-template_ids_lengths[i] :] + elif self.truncation_method == 'right': + template_ids[i] = template_ids[i][: template_ids_lengths[i]] + else: + raise ValueError(f'{self.truncation_method} is not supported') + break + else: + truncation_length_total -= template_ids_lengths[i] + template_ids_lengths[i] = 0 + template_ids[i] = [] context_ids = [i for ids in template_ids[:-1] for i in ids] label_ids = template_ids[-1] @@ -362,31 +392,30 @@ def _process_example(self, example): # these pad/eos tokens are placeholders for virtual tokens context_ids = [self.tokenizer.eos_id] * self.virtual_tokens + context_ids - input_ids = context_ids - answer_start_idx = len(input_ids) - # Adds bos token in the start if self.add_bos: context_ids = [self.tokenizer.bos_id] + context_ids - input_ids = [self.tokenizer.bos_id] + input_ids - answer_start_idx += 1 # Adds sep token between text/prompt and answer if self.add_sep: context_ids = context_ids + [self.sep_id] - input_ids = input_ids + [self.sep_id] - answer_start_idx += 1 - input_ids = input_ids + answer_ids + input_ids = context_ids + answer_ids # Only training need to consider eos token if self.add_eos: input_ids = input_ids + [self.tokenizer.eos_id] if len(input_ids) > self.max_seq_length: - logging.warning(f'Input ids length {len(input_ids)} exceed max sequence length {self.max_seq_length}') + # this only happens if tuncation_field is not enough to truncate. + # context_ids can be empty if we truncate contexts. + # answer_ids can be empty if we truncate answers. 
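+            # Illustrative example (numbers assumed): with max_seq_length=8 and 10 context tokens
+            # remaining after field truncation, context_ids and input_ids are both clamped to
+            # 8 tokens below, and answer_ids = input_ids[len(context_ids):] becomes empty.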
+ logging.warning( + f'After truncation, input ids length {len(input_ids)} still exceeds max sequence length {self.max_seq_length}' + ) + context_ids = context_ids[: self.max_seq_length] input_ids = input_ids[: self.max_seq_length] - answer_ids = input_ids[answer_start_idx:] + answer_ids = input_ids[len(context_ids) :] # store metadata in dataset, in case user may have keys required in the prediction json files metadata = {k: v for k, v in example.items() if k not in self.prompt_template_keys} @@ -396,7 +425,7 @@ def _process_example(self, example): processed_example = { 'input_ids': input_ids, - 'answer_start_idx': answer_start_idx, + 'answer_start_idx': len(context_ids), 'context_ids': context_ids, 'context_length': len(context_ids), 'answer_ids': answer_ids, @@ -426,7 +455,7 @@ def _collate_item(self, item, max_length, pad_id): return item def _build_loss_mask(self, processed_example): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" input_ids = processed_example['input_ids'] answer_start_idx = processed_example['answer_start_idx'] if self.answer_only_loss: @@ -641,7 +670,9 @@ def collate_fn(self, batch): else: attention_mask = [self._create_attention_mask(max_length) for _ in batch] processed_batch.update( - {'attention_mask': torch.stack(attention_mask),} + { + 'attention_mask': torch.stack(attention_mask), + } ) return processed_batch From 9218c3aab7af7c2d7f3d6e45c0b027bafe25eba8 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 3 Jun 2024 09:53:39 -0700 Subject: [PATCH 44/47] cicd_remove_commented_code (#9364) --- .github/workflows/cicd-main.yml | 40 +-------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b924cf975b18..29e84b933f14 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -43,33 +43,11 @@ jobs: docker container prune --filter "until=24h" --force docker image prune -a --filter "until=24h" --force -# checkout-repository: -# runs-on: self-hosted-azure -# container: -# image: nvcr.io/nvidia/pytorch:24.02-py3 -# volumes: -# - ${{ github.workspace }}:/workspace -# steps: -# - name: Checkout repository -# uses: actions/checkout@v4 -# with: -# path: ${{ github.run_id }} - cicd-test-container-setup: needs: [cicd-cluster-clean] runs-on: self-hosted-azure-builder if: ${{ github.event.label.name == 'Run CICD' }} - # uses: actions/cache@v2 - #container: -# image: nvcr.io/nvidia/pytorch:24.02-py3 -# options: -# # --user 0:128 -# --device=/dev/nvidia0 -# --gpus all -# --shm-size=8g -# --env TRANSFORMERS_OFFLINE=0 -# --env HYDRA_FULL_ERROR=1 steps: - name: Checkout repository uses: actions/checkout@v4 @@ -114,23 +92,7 @@ jobs: # These checks are not crucial exit 0 ' - - # - name: Build and push to local registry - # uses: docker/build-push-action@v5 - # with: - # context: . - # push: true - # tags: nemoci.azurecr.io/name/app:latest - - # - name: Inspect - # run: | - # docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest - - #- name: Post-workflow execution - # uses: gacts/run-and-post-run@v1 - # with: - # post: | - # chmod -R 777 . 
+ ### \'\' OPTIONAL_L0_Unit_Tests_GPU: From 48a2668821e86b4e514c9b04f16d5a7c7e51fd70 Mon Sep 17 00:00:00 2001 From: paul-gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:22:51 -0700 Subject: [PATCH 45/47] NeVa token fusion (#9245) * token fusion via mlp downsampling + media_type default fix Signed-off-by: paul-gibbons * inference update Signed-off-by: paul-gibbons * adapter fix Signed-off-by: paul-gibbons * config refactor, remove image_token_len dependency, transpose mlp_downsample height and weight Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * removing image_token_len in text generation strategy Signed-off-by: paul-gibbons * fix patch_dim text generation Signed-off-by: paul-gibbons * crop-size fix Signed-off-by: paul-gibbons * fixing RGB reversal bug Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * crop_size default -> None in text_generation_strategy Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * patch_dim padding for mlp_downsample Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * patch_dim padding update Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * updating h/w patch_dim naming convention Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons --------- Signed-off-by: paul-gibbons Signed-off-by: paul-gibbons Co-authored-by: paul-gibbons --- .../neva/conf/llava_config.yaml | 1 + .../multimodal_llm/neva/conf/neva_config.yaml | 2 +- .../multimodal_llm/neva/conf/neva_peft.yaml | 1 + .../neva/conf/video_neva_config.yaml | 2 +- .../multimodal/data/neva/neva_dataset.py | 59 ++++++++++++++----- nemo/collections/multimodal/parts/utils.py | 15 ++--- .../megatron/adapters/parallel_adapters.py | 53 ++++++++++++++--- .../common/text_generation_strategy.py | 30 ++++++++-- 8 files changed, 127 insertions(+), 36 deletions(-) diff --git a/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml index 68d554efb806..b47c719fef1d 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "openai/clip-vit-large-patch14" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index b9904981a5db..9ec6e51bb004 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 @@ -189,7 +190,6 @@ model: is_multimodal: True media_type: image # currently supported: image sep_image_conv_front: False - image_token_len: 256 conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py` image_folder: null image_aspect_ratio: 'square' diff --git 
a/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml index bde6718faf1a..5dfcec776b69 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 diff --git a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml index e2ba8494f2cd..8341ff857202 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml @@ -75,6 +75,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [336, 336] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 @@ -194,7 +195,6 @@ model: num_frames: 8 # selects the number of frames to use from the video sep_token_between_frames: False # TODO: allow usage of separator tokens between frames sep_image_conv_front: False - image_token_len: 256 conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py` image_folder: null video_folder: null diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 70afc5b4a19a..07b5ad1a32df 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -145,25 +145,26 @@ def open_video(self, file_name): cap = decord.VideoReader(f) return self.flatten_frames(cap) else: + decord.bridge.set_bridge("torch") cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) return self.flatten_frames(cap) return None def flatten_frames(self, cap): if self.data_cfg['splice_single_frame'] == 'first': - frame = cap[0].asnumpy()[:, :, ::-1] + frame = cap[0].asnumpy() return Image.fromarray(frame).convert('RGB') elif self.data_cfg['splice_single_frame'] == 'middle': - frame = cap[len(cap) // 2].asnumpy()[:, :, ::-1] + frame = cap[len(cap) // 2].asnumpy() return Image.fromarray(frame).convert('RGB') elif self.data_cfg['splice_single_frame'] == 'last': - frame = cap[-1].asnumpy()[:, :, ::-1] + frame = cap[-1].asnumpy() return Image.fromarray(frame).convert('RGB') else: if self.data_cfg['num_frames'] == -1: frames = [] for frame in cap: - rgb_frame = frame.asnumpy()[:, :, ::-1] + rgb_frame = frame.asnumpy() img = Image.fromarray(rgb_frame).convert('RGB') frames.append(img) return frames @@ -171,10 +172,7 @@ def flatten_frames(self, cap): num_frames = min(len(cap), self.data_cfg['num_frames']) indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) frames = [] - for i in indices: - rgb_frame = cap[i].asnumpy()[:, :, ::-1] - img = Image.fromarray(rgb_frame).convert('RGB') - frames.append(img) + frames = cap.get_batch(indices) while len(frames) < self.data_cfg['num_frames']: frames.append(frames[-1]) @@ -262,9 +260,13 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in return sources num_patches = image_token_len + if media_type == 'video': num_patches *= multimodal_cfg['num_frames'] + if multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': 
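+        # DownSampleBlock fuses each 2x2 grid of neighboring patches into a single token,
+        # so the number of media placeholder tokens fed to the LLM shrinks by a factor of 4.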
+ num_patches //= 4 + if multimodal_cfg['use_im_start_end']: replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * num_patches else: @@ -922,9 +924,19 @@ def expand2square(pil_img, background_color): media_tensors = torch.tensor([]) if images: media_tensors = torch.stack(images) - cur_token_len = (media_tensors[0].shape[1] // 14) * ( - media_tensors[0].shape[2] // 14 - ) # FIXME: 14 is hardcoded patch size + patch_dim = self.multimodal_cfg['patch_dim'] + + height_num_patches = media_tensors[0].shape[1] // patch_dim + width_num_patches = media_tensors[0].shape[2] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + cur_token_len = height_num_patches * width_num_patches + sources = preprocess_multimodal( copy.deepcopy(sources), self.multimodal_cfg, @@ -978,9 +990,19 @@ def expand2square(pil_img, background_color): media_tensors = frames if videos: media_tensors = torch.stack(videos) - cur_token_len = (media_tensors[0].shape[-1] // 14) * ( - media_tensors[0].shape[-2] // 14 - ) # FIXME: 14 is hardcoded patch size + patch_dim = self.multimodal_cfg['patch_dim'] + + height_num_patches = media_tensors[0].shape[-2] // patch_dim + width_num_patches = media_tensors[0].shape[-1] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + cur_token_len = height_num_patches * width_num_patches + sources = preprocess_multimodal( copy.deepcopy(sources), self.multimodal_cfg, @@ -1190,11 +1212,15 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: add_extra_token = 1 if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False): add_extra_token = 0 - crop_size = data_cfg.get("crop_size", (224, 224)) + crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224)) if mm_cfg.vision_encoder.from_hf: image_processor = CLIPImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) + assert crop_size == ( + image_processor.crop_size['height'], + image_processor.crop_size['width'], + ), f"Crop size {crop_size} does not match the HuggingFace CLIP model's crop size {(image_processor.crop_size['height'], image_processor.crop_size['width'])}" else: # TODO(yuya): Fix this hard-code for our own CLIP image_processor = image_transform( @@ -1212,8 +1238,8 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: sep_image_conv_front=data_cfg.sep_image_conv_front, model_type=mm_cfg.llm.get("model_type", "nvgpt"), conv_template=data_cfg.get("conv_template", "nvgpt"), + patch_dim=model_cfg.mm_cfg.vision_encoder.patch_dim, crop_size=crop_size, - image_token_len=data_cfg.image_token_len, image_folder=data_cfg.get('image_folder', None), video_folder=data_cfg.get('video_folder', None), image_aspect_ratio=data_cfg.image_aspect_ratio, @@ -1223,6 +1249,7 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: context_length=model_cfg.encoder_seq_length, media_type=data_cfg.get('media_type', 'image'), num_frames=data_cfg.get('num_frames', -1), + mm_mlp_adapter_type=model_cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), ), data_cfg=dict( splice_single_frame=data_cfg.get('splice_single_frame', None), diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 70dd2174a2b7..8f2549b8fcd0 100644 --- 
a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -15,6 +15,7 @@ import tempfile from typing import Any, Callable, Tuple +import decord import numpy as np import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -469,23 +470,23 @@ def expand2square(pil_img, background_color): # add video processor for video neva def video_processor(maybe_video_path): - from decord import VideoReader if isinstance(maybe_video_path, str): - vr = VideoReader(maybe_video_path) + decord.bridge.set_bridge("torch") + vr = decord.VideoReader(maybe_video_path) if neva_cfg.data.splice_single_frame == 'first': - frames = [Image.fromarray(vr[0].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[0].asnumpy()).convert('RGB')] elif neva_cfg.data.splice_single_frame == 'middle': - frames = [Image.fromarray(vr[len(vr) // 2].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[len(vr) // 2].asnumpy()).convert('RGB')] elif neva_cfg.data.splice_single_frame == 'last': - frames = [Image.fromarray(vr[-1].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[-1].asnumpy()).convert('RGB')] else: if neva_cfg.data.num_frames == -1: - frames = [Image.fromarray(frame.asnumpy()[:, :, ::-1]).convert('RGB') for frame in vr] + frames = [Image.fromarray(frame.asnumpy()).convert('RGB') for frame in vr] else: num_frames = min(len(vr), neva_cfg.data.num_frames) indices = np.linspace(0, len(vr) - 1, num_frames, dtype=int) - frames = [Image.fromarray(vr[i].asnumpy()[:, :, ::-1]).convert('RGB') for i in indices] + frames = vr.get_batch(indices) while len(frames) < neva_cfg.data.num_frames: frames.append(frames[-1]) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 51510f1b881e..541ca9c28f3d 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -275,7 +275,9 @@ def _get_init_fn(self, init_method: str): raise NotImplementedError("out_init_method should be zero, normal, kaiming or xavier") return init_fn - def adapter_unfreeze(self,): + def adapter_unfreeze( + self, + ): """ Can be customized to allow for selective training of only some params in the PEFT. 
""" @@ -402,7 +404,7 @@ class LoraQAdapter(ParallelLinearAdapter): class LoraDenseAttentionAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -411,7 +413,7 @@ class LoraDenseAttentionAdapter(ParallelLinearAdapter): class LoraHto4HAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -420,7 +422,7 @@ class LoraHto4HAdapter(ParallelLinearAdapter): class Lora4HtoHAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -688,14 +690,20 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table(self,): + def clear_inference_table( + self, + ): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table(self,): + def get_inference_table( + self, + ): return self.inference_table.data - def inner_forward(self,): + def inner_forward( + self, + ): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) @@ -890,6 +898,29 @@ class LoraKQVAdapterWeightTyingConfig(ParallelLinearAdapterWeightTyingConfig): _target_: str = "{0}.{1}".format(LoraKQVAdapterWeightTying.__module__, LoraKQVAdapterWeightTying.__name__) +class DownSampleBlock(nn.Module): + def forward(self, x): + vit_embeds = x + h = w = int(vit_embeds.shape[3] ** 0.5) + vit_embeds = vit_embeds.reshape(*vit_embeds.shape[:3], h, w, -1) + vit_embeds = self.flat_square(vit_embeds) + vit_embeds = vit_embeds.reshape(*vit_embeds.shape[:3], -1, vit_embeds.shape[-1]) + return vit_embeds + + def flat_square(self, x): + b, T, F, h, w, c = x.size() + if w % 2 == 1: + x = torch.cat([x, torch.zeros((b, T, F, h, 1, c), dtype=x.dtype).to(x.device)], dim=4) + b, T, F, h, w, c = x.size() + if h % 2 == 1: + x = torch.cat([x, torch.zeros((b, T, F, 1, w, c), dtype=x.dtype).to(x.device)], dim=3) + b, T, F, h, w, c = x.size() + x = x.view(b, T, F, h, int(w / 2), int(c * 2)) + x = x.permute(0, 1, 2, 4, 3, 5).contiguous() + x = x.view(b, T, F, int(h / 2), int(w / 2), int(c * 4)) + return x + + class MultimodalProjectorAdapter(nn.Module, AdapterModuleUtil): def __init__(self, adapter_type: str, in_features: int, out_features: int, bias: bool, **kwargs) -> None: super().__init__() @@ -898,6 +929,14 @@ def __init__(self, adapter_type: str, in_features: int, out_features: int, bias: self.mm_projector = torch.nn.Linear(in_features, out_features, bias) elif adapter_type == 'identity': self.mm_projector = lambda x: x + elif adapter_type == 'mlp_downsample': + self.mm_projector = torch.nn.Sequential( + DownSampleBlock(), + torch.nn.LayerNorm(in_features * 4), + torch.nn.Linear(in_features * 4, out_features, bias), + 
torch.nn.GELU(), + torch.nn.Linear(out_features, out_features, bias), + ) else: mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', adapter_type) if mlp_gelu_match: diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 44a80465c34b..e8e2859e439f 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -20,7 +20,7 @@ from typing import List, Set, Tuple import torch - +from transformers import CLIPImageProcessor from nemo.collections.nlp.modules.common.lm_utils import pad_batch from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids @@ -533,7 +533,6 @@ class NevaModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) self.forward_model = self.model.model - self.num_media_latents = model.cfg.data.get("image_token_len", 576) self.tokenizer = self.model.tokenizer self.image_paths = [] self.cfg = self.model.cfg @@ -545,8 +544,10 @@ def __init__(self, model): sep_image_conv_front=self.data_cfg.sep_image_conv_front, conv_template=self.data_cfg.get("conv_template", "nvgpt"), model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), - image_token_len=self.data_cfg.image_token_len, - image_folder=self.data_cfg.image_folder, + patch_dim=self.cfg.mm_cfg.vision_encoder.patch_dim, + crop_size=self.cfg.mm_cfg.vision_encoder.get("crop_size", None), + image_folder=self.data_cfg.get('image_folder', None), + video_folder=self.data_cfg.get('video_folder', None), image_aspect_ratio=self.data_cfg.image_aspect_ratio, use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), image_processor=None, @@ -554,7 +555,28 @@ def __init__(self, model): context_length=self.cfg.encoder_seq_length, media_type=getattr(self.data_cfg, 'media_type', 'image'), num_frames=getattr(self.data_cfg, 'num_frames', 1), + mm_mlp_adapter_type=getattr(self.cfg.mm_cfg, 'mm_mlp_adapter_type', 'linear'), ) + if self.multimodal_cfg['crop_size'] is None: + image_processor = CLIPImageProcessor.from_pretrained( + self.cfg.mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 + ) + self.multimodal_cfg['crop_size'] = ( + image_processor.crop_size['height'], + image_processor.crop_size['width'], + ) + + patch_dim = self.multimodal_cfg['patch_dim'] + height_num_patches = self.multimodal_cfg['crop_size'][0] // patch_dim + width_num_patches = self.multimodal_cfg['crop_size'][1] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + self.num_media_latents = height_num_patches * width_num_patches def clip_max_len(self, maxlen: int) -> int: """clip the max len based on the LM model max sequence length""" From bd014d9d71a258da6c69c80df8244a9598c752f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Mon, 3 Jun 2024 14:40:16 -0400 Subject: [PATCH 46/47] Fix prompt formatter's defaults=None case in multi-task model (#9366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Żelasko --- nemo/collections/asr/models/aed_multitask_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/models/aed_multitask_models.py 
b/nemo/collections/asr/models/aed_multitask_models.py index 880f8bb3a004..edb591921782 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -133,7 +133,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): prompt_cls = PromptFormatter.resolve(self.prompt_format) self.prompt = prompt_cls( tokenizer=self.tokenizer, - defaults=OmegaConf.to_container(cfg.get("prompt_defaults")), + defaults=OmegaConf.to_container(pd) if (pd := cfg.get("prompt_defaults")) is not None else None, ) # Setup audio preprocessor From a0488f63fbfb555f05461dcf235f9a58559a99eb Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:28:09 -0700 Subject: [PATCH 47/47] Update Gemma conversion script (#9365) * Update Gemma conversion script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../checkpoint_converters/convert_gemma_jax_to_nemo.py | 3 ++- .../checkpoint_converters/convert_gemma_pyt_to_nemo.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py index c35906dc78c1..1cbeeb41c66d 100644 --- a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py @@ -14,6 +14,7 @@ """ Requires to install: `pip install orbax jax flax jaxlib` +Requires to clone: https://github.com/google-deepmind/gemma.git Required to set: `export PYTHONPATH=/path/to/google/gemma_jax:$PYTHONPATH` python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_jax_to_nemo.py \ --input_name_or_path /path/to/gemma/checkpoints/jax/7b \ @@ -27,8 +28,8 @@ import jax import torch +from gemma.params import load_params, nest_params, param_remapper from omegaconf import OmegaConf -from params import load_params, nest_params, param_remapper from transformer import TransformerConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py index 583ee7893c0f..d14e5f7de551 100644 --- a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py @@ -14,6 +14,7 @@ """ Requires to install: `pip install fairscale==0.4.13 immutabledict==4.1.0 tensorstore==0.1.45` +Requires to clone: https://github.com/google/gemma_pytorch.git Required to set: `export PYTHONPATH=/path/to/google/gemma_pytorchh:$PYTHONPATH` python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_pyt_to_nemo.py \ --input_name_or_path /path/to/gemma/checkpoints/pyt/7b.ckpt \ @@ -26,9 +27,9 @@ from argparse import ArgumentParser import torch -from model.config import get_config_for_2b, get_config_for_7b -from model.model import CausalLM -from model.tokenizer import Tokenizer +from gemma.config import get_config_for_2b, get_config_for_7b +from gemma.model import CausalLM +from gemma.tokenizer import Tokenizer from omegaconf import OmegaConf from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -152,7 +153,8 @@ def adjust_tensor_shapes(model, nemo_state_dict): # [(head_num + 2 * num_query_groups) * head_size, hidden_size] # -> [head_num, head_size, 
hidden_size], 2 * [num_query_groups, head_size, hidden_size]
         q_weight, k_weight, v_weight = qkv_weight.split(
-            [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0,
+            [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size],
+            dim=0,
         )
         q_weight = q_weight.reshape(head_num, head_size, hidden_size)
         k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size)
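         # Illustrative shapes (assuming the 2B checkpoint config: head_num=8, num_query_groups=1, head_size=256):
         # q_weight -> [8, 256, hidden_size]; k_weight and v_weight -> [1, 256, hidden_size] each.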