From f89bca0ed5186597a7bc58944a8deb9efdbcc520 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 4 Jul 2024 21:30:16 -0400 Subject: [PATCH 01/26] [NeMo-UX] Add PEFT (#9490) * initial commit for PEFT in nemo2 * Apply isort and black reformatting Signed-off-by: cuichenx * address comments Signed-off-by: Chen Cui * make import easier Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comments Signed-off-by: Chen Cui * Update nemo/collections/llm/peft/lora.py Signed-off-by: Marc Romeyn * Some small fixes + adding more doc-strings * Apply isort and black reformatting Signed-off-by: marcromeyn * Adding ModelTransform callback * Apply isort and black reformatting Signed-off-by: marcromeyn * Fixing type-hint for model_transform * Apply isort and black reformatting Signed-off-by: marcromeyn * fix import Signed-off-by: Chen Cui * model transform for gemma llama Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix model transform Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * change lora target default to all linear modules Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Small fix in mixtral * Apply isort and black reformatting Signed-off-by: marcromeyn * Integrating PEFT to the public-API + some fixes * Big refactor to allow to load adapter-states * Some fixes to support adapter_path * Apply isort and black reformatting Signed-off-by: marcromeyn * Disabling ckpt reloading when adapter_path is passed * Fix CLI * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove commented-out code * Remove commented-out code * Remove un-used import * Fix callback imports * Apply isort and black reformatting Signed-off-by: marcromeyn * Fixing llm.pretrain * Some small fixes * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix missing import + type-hint in finetune * Adding PreemptionCallback + some more tests * Apply isort and black reformatting Signed-off-by: marcromeyn * Clean up imports & clean up llm.api * Apply isort and black reformatting Signed-off-by: marcromeyn * Trying to fix failing tests * Remove __init__.py 2 * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix failing test * Trying to fix last failing test --------- Signed-off-by: cuichenx Signed-off-by: Chen Cui Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: cuichenx Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn --- nemo/collections/llm/__init__.py | 6 +- nemo/collections/llm/api.py | 285 ++++++++++++++---- nemo/collections/llm/gpt/model/base.py | 3 + nemo/collections/llm/gpt/model/gemma.py | 4 +- nemo/collections/llm/gpt/model/llama.py | 4 +- nemo/collections/llm/gpt/model/mistral.py | 6 +- nemo/collections/llm/gpt/model/mixtral.py | 9 +- nemo/collections/llm/peft/__init__.py | 4 + nemo/collections/llm/peft/api.py | 11 + nemo/collections/llm/peft/lora.py | 123 ++++++++ .../megatron/adapters/parallel_adapters.py | 11 + nemo/lightning/__init__.py | 2 +- nemo/lightning/_strategy_lib.py | 41 ++- nemo/lightning/fabric/strategies.py | 43 +-- nemo/lightning/io/pl.py | 2 +- nemo/lightning/megatron_parallel.py | 3 +- nemo/lightning/nemo_logger.py | 6 +- nemo/lightning/pytorch/callbacks/__init__.py | 12 +- ...odel_checkpoint.py => model_checkpoint.py} | 7 +- .../pytorch/callbacks/model_transform.py | 98 ++++++ nemo/lightning/pytorch/callbacks/nsys.py | 31 +- nemo/lightning/pytorch/callbacks/peft.py | 261 ++++++++++++++++ 
.../lightning/pytorch/callbacks/preemption.py | 115 +++++++ nemo/lightning/pytorch/optim/base.py | 3 +- nemo/lightning/pytorch/strategies.py | 62 ++-- nemo/lightning/resume.py | 30 +- setup.py | 5 + tests/lightning/pytorch/callbacks/__init__.py | 0 .../pytorch/callbacks/test_model_transform.py | 48 +++ .../lightning/pytorch/callbacks/test_nsys.py | 195 ++++++++++++ .../lightning/pytorch/callbacks/test_peft.py | 68 +++++ .../pytorch/callbacks/test_preemption.py | 114 +++++++ tests/lightning/test_megatron_parallel.py | 8 +- 33 files changed, 1434 insertions(+), 186 deletions(-) create mode 100644 nemo/collections/llm/peft/__init__.py create mode 100644 nemo/collections/llm/peft/api.py create mode 100644 nemo/collections/llm/peft/lora.py rename nemo/lightning/pytorch/callbacks/{megatron_model_checkpoint.py => model_checkpoint.py} (98%) create mode 100644 nemo/lightning/pytorch/callbacks/model_transform.py create mode 100644 nemo/lightning/pytorch/callbacks/peft.py create mode 100644 nemo/lightning/pytorch/callbacks/preemption.py create mode 100644 tests/lightning/pytorch/callbacks/__init__.py create mode 100644 tests/lightning/pytorch/callbacks/test_model_transform.py create mode 100644 tests/lightning/pytorch/callbacks/test_nsys.py create mode 100644 tests/lightning/pytorch/callbacks/test_peft.py create mode 100644 tests/lightning/pytorch/callbacks/test_preemption.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 50c5c53f6533..83c0a3af48c0 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -4,8 +4,8 @@ except ImportError: pass -from nemo.collections.llm import tokenizer -from nemo.collections.llm.api import export_ckpt, import_ckpt, pretrain, train, validate +from nemo.collections.llm import peft, tokenizer +from nemo.collections.llm.api import export_ckpt, finetune, import_ckpt, pretrain, train, validate from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, @@ -98,6 +98,7 @@ "export_ckpt", "pretrain", "validate", + "finetune", "tokenizer", "mock", "squad", @@ -118,4 +119,5 @@ "gemma_7b", "code_gemma_2b", "code_gemma_7b", + "peft", ] diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 081b0f01b4c7..5c9703497597 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -1,11 +1,17 @@ +from copy import deepcopy from pathlib import Path -from typing import Callable, Optional +from typing import Any, Callable, Optional, Union import pytorch_lightning as pl from typing_extensions import Annotated from nemo.collections.llm.utils import Config, task -from nemo.lightning import AutoResume, MegatronStrategy, NeMoLogger, OptimizerModule, Trainer, io, teardown +from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io +from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform +from nemo.utils import logging + + +TokenizerType = Any @task(namespace="llm") @@ -16,7 +22,8 @@ def train( log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, - tokenizer: Optional[str] = None, + tokenizer: Optional[TokenizerType] = None, + model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, # TODO: Fix export export: Optional[str] = None, ) -> Path: """ @@ -30,42 +37,38 @@ def train( resume (Optional[Union[AutoResume, Resume]]): Resume training from a checkpoint. 
optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer from the model will be used. - tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'. + tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' or an instance of TokenizerSpec. export (Optional[str]): Filename to save the exported checkpoint after training. + model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied. Returns ------- Path: The directory path where training artifacts are saved. - Raises - ------ - ValueError: If the trainer's strategy is not MegatronStrategy. - Examples -------- - >>> model = MyModel() - >>> data = MyDataModule() - >>> trainer = Trainer(strategy=MegatronStrategy()) - >>> train(model, data, trainer, tokenizer='data', source='path/to/ckpt.ckpt', export='final.ckpt') + >>> from nemo.collections import llm + >>> from nemo import lightning as nl + >>> model = llm.MistralModel() + >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") + >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) + >>> train(model, data, trainer, tokenizer="data") PosixPath('/path/to/log_dir') """ - _log = log or NeMoLogger() - app_state = _log.setup( - trainer, - resume_if_exists=getattr(resume, "resume_if_exists", False), - task_config=getattr(train, "__io__", None), + app_state = _setup( + model=model, + data=data, + trainer=trainer, + log=log, + resume=resume, + optim=optim, + tokenizer=tokenizer, + model_transform=model_transform, ) - if resume is not None: - resume.setup(model, trainer) - if optim: - optim.connect(model) - if tokenizer: # TODO: Improve this - _use_tokenizer(model, data, tokenizer) trainer.fit(model, data) - _log.teardown() - return app_state.exp_dir @@ -74,41 +77,152 @@ def pretrain( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - source: Optional[str] = None, - # export: Optional[str] = None + log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + optim: Optional[OptimizerModule] = None, ) -> Path: - return train(model=model, data=data, trainer=trainer, tokenizer="data", source=source) + """ + Pretrains a model using the specified data and trainer, with optional logging, resuming, and optimization. + + This function is a wrapper around the `train` function, specifically configured for pretraining tasks. + Note, by default it will use the tokenizer from the model. + + Args: + model (pl.LightningModule): The model to be pretrained. + data (pl.LightningDataModule): The data module containing pretraining data. + trainer (Trainer): The trainer instance configured with a MegatronStrategy. + log (NeMoLogger): A nemologger instance. + resume (Optional[AutoResume]): Resume training from a checkpoint. + optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default + optimizer from the model will be used. + + Returns: + Path: The directory path where pretraining artifacts are saved. 
+ + Examples: + >>> from nemo.collections import llm + >>> from nemo import lightning as nl + >>> model = llm.MistralModel() + >>> data = llm.PretrainingDataModule(paths=[...], seq_length=4096, global_batch_size=16, micro_batch_size=2) + >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") + >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) + >>> llm.pretrain(model, data, trainer) + PosixPath('/path/to/log_dir') + """ + return train( + model=model, + data=data, + trainer=trainer, + log=log, + resume=resume, + optim=optim, + tokenizer="data", + ) @task(namespace="llm") -def validate( +def finetune( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - tokenizer: Optional[str] = None, - source: Optional[str] = None, - export: Optional[str] = None, + log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + optim: Optional[OptimizerModule] = None, + peft: Optional[Union[PEFT, ModelTransform, Callable]] = None, ) -> Path: - if not isinstance(trainer.strategy, MegatronStrategy): - raise ValueError("Only MegatronStrategy is supported") + """ + Finetunes a model using the specified data and trainer, with optional logging, resuming, and PEFT. - validate_kwargs = {} - run_dir = Path(trainer.logger.log_dir) - export_dir = run_dir / "export" + Note, by default it will use the tokenizer from the model. - if tokenizer: # TODO: Improve this - _use_tokenizer(model, data, tokenizer) - if source: - _add_ckpt_path(source, model, validate_kwargs) + Args: + model (pl.LightningModule): The model to be finetuned. + data (pl.LightningDataModule): The data module containing finetuning data. + trainer (Trainer): The trainer instance configured with a MegatronStrategy. + log (NeMoLogger): A nemologger instance. + resume (Optional[AutoResume]): Resume training from a checkpoint. + optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default + optimizer from the model will be used. + peft (Optional[PEFT]): A PEFT (Parameter-Efficient Fine-Tuning) configuration to be applied. + + Returns: + Path: The directory path where finetuning artifacts are saved. 
+ + Examples: + >>> from nemo.collections import llm + >>> from nemo import lightning as nl + >>> model = llm.MistralModel() + >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") + >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) + >>> finetune(model, data, trainer, peft=llm.peft.LoRA()]) + PosixPath('/path/to/log_dir') + """ - trainer.validate(model, data, **validate_kwargs) - trainer.save_checkpoint(export_dir) - if export: - teardown(trainer) - del trainer, model, data - export_ckpt(export_dir, export) + return train( + model=model, + data=data, + trainer=trainer, + log=log, + resume=resume, + optim=optim, + tokenizer="model", + model_transform=peft, + ) - return run_dir + +@task(namespace="llm") +def validate( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional[TokenizerType] = None, + model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, +) -> Path: + """ + Validates a model using the specified data and trainer, with optional logging, resuming, and model transformations. + + Args: + model (pl.LightningModule): The model to be validated. + data (pl.LightningDataModule): The data module containing validation data. + trainer (Trainer): The trainer instance configured with a MegatronStrategy. + log (NeMoLogger): A nemologger instance. + resume (Optional[AutoResume]): Resume from a checkpoint for validation. + optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer + from the model will be used. + tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model' or an instance of TokenizerSpec. + model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied. + + Returns: + Path: The directory path where validation artifacts are saved. 
+ + Examples: + >>> from nemo.collections import llm + >>> from nemo import lightning as nl + >>> model = llm.MistralModel() + >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed") + >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision) + >>> validate(model, data, trainer, tokenizer="data") + PosixPath('/path/to/log_dir') + """ + app_state = _setup( + model=model, + data=data, + trainer=trainer, + log=log, + resume=resume, + optim=optim, + tokenizer=tokenizer, + model_transform=model_transform, + ) + + trainer.validate(model, data) + + return app_state.exp_dir @task(name="import", namespace="llm") @@ -136,28 +250,67 @@ def export_ckpt( return io.export_ckpt(path, target, output_path, overwrite, load_connector) -def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: str) -> None: +def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: TokenizerType) -> None: if tokenizer == "data": - model.tokenizer = data.tokenizer - if hasattr(model, "__io__"): - model.__io__.tokenizer = data.tokenizer + _set_with_io(model, "tokenizer", data.tokenizer) elif tokenizer == "model": - data.tokenizer = model.tokenizer - if hasattr(data, "__io__"): - data.__io__.tokenizer = model.tokenizer + _set_with_io(data, "tokenizer", model.tokenizer) + else: + try: + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + if isinstance(tokenizer, TokenizerSpec): + _set_with_io(model, "tokenizer", tokenizer) + _set_with_io(data, "tokenizer", tokenizer) + else: + raise ValueError(f"Expected TokenizerSpec or 'data' or 'model', got: {tokenizer}") + except ImportError: + raise ValueError("TokenizerSpec is not available") -def _add_ckpt_path(source, model, kwargs) -> None: - if io.is_distributed_ckpt(source): - kwargs["ckpt_path"] = source - else: - kwargs["ckpt_path"] = model.import_ckpt(source) +def _setup( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + log: Optional[NeMoLogger], + resume: Optional[AutoResume], + optim: Optional[OptimizerModule], + tokenizer: Optional[TokenizerType], + model_transform: Optional[Union[PEFT, ModelTransform, Callable]], +) -> Any: # Return type is Any because app_state's type is not specified + _log = log or NeMoLogger() + if resume and resume.adapter_path and _log.ckpt: + logging.info("Disabling try_restore_best_ckpt restoration for adapters") + _log.ckpt.try_restore_best_ckpt = False + + app_state = _log.setup( + trainer, + resume_if_exists=getattr(resume, "resume_if_exists", False), + task_config=getattr(train, "__io__", None), + ) + if resume is not None: + resume.setup(model, trainer) + + if optim: + optim.connect(model) + if tokenizer: # TODO: Improve this + _use_tokenizer(model, data, tokenizer) + + if model_transform: + _set_with_io(model, "model_transform", model_transform) + + # Add ModelTransform callback to Trainer if needed + if getattr(model, "model_transform", None): + if not any(isinstance(cb, ModelTransform) for cb in trainer.callbacks): + if isinstance(model_transform, ModelTransform): + trainer.callbacks.append(model_transform) + else: + trainer.callbacks.append(ModelTransform()) + + return app_state -def _save_config_img(*args, **kwargs): - try: - from nemo_sdk.utils import save_config_img - save_config_img(*args, **kwargs) - except ImportError: - pass +def _set_with_io(obj, attr, value): + setattr(obj, 
attr, value) + if hasattr(obj, "__io__") and hasattr(value, "__io__"): + setattr(obj.__io__, attr, deepcopy(value.__io__)) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 9b7f4e4ab0c8..28a0eed52a5f 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -6,6 +6,7 @@ import torch.distributed from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig +from torch import nn from nemo.collections.llm import fn from nemo.lightning import get_vocab_size, io @@ -117,12 +118,14 @@ def __init__( # TODO: Add transformer_layer_spec when we update mcore optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): super().__init__() self.config = config self.tokenizer = tokenizer self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) self.optim.connect(self) # This will bind the `configure_optimizers` method + self.model_transform = model_transform def configure_model(self) -> None: if not hasattr(self, "module"): diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 348cad255876..6493bb0dfad7 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Annotated, Callable, Optional import torch +from torch import nn from nemo.collections.llm.fn.activation import openai_gelu from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel @@ -68,8 +69,9 @@ def __init__( config: Annotated[Optional[GemmaConfig], Config[GemmaConfig]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): - super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) @io.model_importer(GemmaModel, "hf") diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 94cbd99acf90..c7add828b7f4 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -4,6 +4,7 @@ import torch import torch.nn.functional as F +from torch import nn from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config @@ -103,8 +104,9 @@ def __init__( config: Annotated[Optional[LlamaConfig], Config[LlamaConfig]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): - super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer) + super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) @io.model_importer(LlamaModel, "hf") diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 274a761fe5b6..d1049cfe77ce 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -5,6 +5,7 @@ import pytorch_lightning as pl import torch import torch.nn.functional as F +from torch import nn from typing_extensions import Annotated from nemo.collections.llm.gpt.model.base import GPTConfig, 
GPTModel @@ -46,8 +47,11 @@ def __init__( config: Annotated[Optional[MistralConfig7B], Config[MistralConfig7B]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): - super().__init__(config or MistralConfig7B(), optim=optim, tokenizer=tokenizer) + super().__init__( + config or MistralConfig7B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform + ) @io.model_importer(MistralModel, "hf") diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 7d757479d27a..af1b73dd9109 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -4,15 +4,17 @@ import torch import torch.nn.functional as F +from torch import nn from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.lightning import io, teardown from nemo.lightning.pytorch.optim import OptimizerModule if TYPE_CHECKING: - from transformers import MistralConfig, MistralForCausalLM + from transformers import MixtralForCausalLM from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @dataclass @@ -53,8 +55,11 @@ def __init__( config: Optional[MixtralConfig8x7B] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, ): - super().__init__(config or MixtralConfig8x7B(), optim=optim, tokenizer=tokenizer) + super().__init__( + config or MixtralConfig8x7B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform + ) @io.model_importer(MixtralModel, ext="hf") diff --git a/nemo/collections/llm/peft/__init__.py b/nemo/collections/llm/peft/__init__.py new file mode 100644 index 000000000000..69855f6f9c53 --- /dev/null +++ b/nemo/collections/llm/peft/__init__.py @@ -0,0 +1,4 @@ +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.peft.lora import LoRA + +__all__ = ["LoRA", "gpt_lora"] diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py new file mode 100644 index 000000000000..dc8fc76c752e --- /dev/null +++ b/nemo/collections/llm/peft/api.py @@ -0,0 +1,11 @@ +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.utils import factory +from nemo.lightning.pytorch.callbacks.peft import PEFT + + +@factory +def gpt_lora() -> PEFT: + return LoRA() + + +__all__ = ["gpt_lora"] diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py new file mode 100644 index 000000000000..913144d1bf5f --- /dev/null +++ b/nemo/collections/llm/peft/lora.py @@ -0,0 +1,123 @@ +from dataclasses import dataclass, field +from typing import List, Literal + +from megatron.core import parallel_state +from torch import nn + +from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper +from nemo.utils import logging + + +class AdapterParallelAdd(AdapterWrapper): + """An adapter wrapper that adds the output of the adapter to the output of the wrapped module. + + This class is designed to be used with LoRA (Low-Rank Adaptation) and similar techniques + where the adapter's output is added to the main module's output. It extends the AdapterWrapper + class to provide a specific implementation of the forward method. 
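[Editorial note, not part of the patch] For intuition, the quantity such an adapter contributes is the usual LoRA low-rank update, y = W x + (alpha / r) * B A x, where A gets a small random init and B starts at zero so training begins from the frozen model's behaviour. Below is a minimal single-GPU sketch of that update; the class and its names (rank, alpha) are illustrative assumptions and are not the tensor-parallel ParallelLinearAdapter that this patch actually plugs in.

    import torch
    from torch import nn

    class LowRankAdapterSketch(nn.Module):
        """Illustrative LoRA adapter: returns (alpha / rank) * B(A(x))."""

        def __init__(self, in_features: int, out_features: int, rank: int = 32, alpha: int = 32):
            super().__init__()
            self.linear_in = nn.Linear(in_features, rank, bias=False)    # "A", small random init
            self.linear_out = nn.Linear(rank, out_features, bias=False)  # "B", zero init
            nn.init.zeros_(self.linear_out.weight)                       # adapter starts as a no-op
            self.scale = alpha / rank

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.linear_out(self.linear_in(x)) * self.scale

    # Usage sketch of the parallel-add pattern the wrapper below implements:
    #   base = nn.Linear(1024, 1024); adapter = LowRankAdapterSketch(1024, 1024)
    #   y = base(x) + adapter(x)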
+ """ + + def forward(self, x): + linear_output, bias = self.to_wrap(x) + if isinstance(linear_output, tuple) and len(linear_output) == 2: + linear_output, layernorm_output = linear_output + adapter_output = self.adapter(layernorm_output) + else: + adapter_output = self.adapter(x) + return linear_output + adapter_output, bias + + +@dataclass +class LoRA(PEFT): + """ + Implements the LoRA (Low-Rank Adaptation) module for parameter-efficient fine-tuning. + + LoRA uses a low-rank projection to adapt the weights of a pre-trained model to a new downstream task. + This class facilitates the application of LoRA to specific modules within the model architecture. + + Args: + target_modules (List[str], optional): A list of module names to apply LoRA to. + Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. + - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections + in self-attention modules. + - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. + - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. + - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. + dim (int): Dimension of the low-rank projection space. Defaults to 32. + alpha (int): Weighting factor for the low-rank projection. Defaults to 32. + dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. + dropout_position (Literal['pre', 'post'], optional): Position for applying dropout. + Can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'post'. + + Example: + -------- + >>> from nemo.collections import llm + >>> lora = llm.peft.LoRA(target_modules=['linear_qkv', 'linear_proj'], dim=32) + >>> model = llm.Mistral7BModel(model_transform=lora) + >>> # (set up trainer and data) + >>> trainer.fit(model, data) + + References: + ----------- + Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021). + LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685. + https://arxiv.org/abs/2106.09685 + + ) + """ + + target_modules: List[str] = field( + default_factory=lambda: ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'] + ) + dim: int = 32 + alpha: int = 32 + dropout: float = 0.0 + dropout_position: Literal['pre', 'post'] = 'post' + + def transform(self, m: nn.Module, name=None, prefix=None): + """ + Applies LoRA to a specific module within the model architecture. + + Args: + m (nn.Module): The module to apply LoRA to. + name (str, optional): Name of the module (if applicable). Defaults to None. + prefix (str, optional): Prefix for the module name (if applicable). Defaults to None. + + Returns: + nn.Module: The modified module with LoRA applied, or the original module if not a target. + """ + from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter + + tp_size = parallel_state.get_tensor_model_parallel_world_size() + if name in self.target_modules: + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. 
+ if name in ['linear_qkv', 'linear_fc1']: + # Column Parallel Linear + input_is_parallel = False + in_features = m.in_features + out_features = m.out_features * tp_size + else: # name in ['linear_proj', 'linear_fc2'] + # Row Parallel Linear + input_is_parallel = True + in_features = m.in_features * tp_size + out_features = m.out_features + + logging.info(f"Adding lora to: {prefix}.{name}") + adapter = ParallelLinearAdapter( + in_features, + out_features, + self.dim, + activation='identity', + norm_position=None, + norm_type=None, + column_init_method="normal", + row_init_method="zero", + gather_output=False, + input_is_parallel=input_is_parallel, + dropout=self.dropout, + dropout_position=self.dropout_position, + model_parallel_config=getattr(m, "config", None), + alpha=self.alpha, + ) + return AdapterParallelAdd(m, adapter) + return m diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 21dace008877..9ab1da7136a1 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -24,6 +24,7 @@ import torch.nn as nn import torch.nn.init as init +from megatron.core.dist_checkpointing.mapping import ShardedStateDict from nemo.collections.common.parts.adapter_modules import AdapterModuleUtil from nemo.collections.common.parts.utils import activation_registry from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu @@ -322,6 +323,16 @@ def forward(self, x): return x + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + sharded_state_dict = {} + sharded_state_dict.update(self.linear_in.sharded_state_dict(f"{prefix}linear_in.", sharded_offsets, metadata)) + sharded_state_dict.update( + self.linear_out.sharded_state_dict(f"{prefix}linear_out.", sharded_offsets, metadata) + ) + return sharded_state_dict + class _All2AllHp2Sp(torch.autograd.Function): """ diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index d414376d8168..e9674ed1e212 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -14,7 +14,7 @@ from nemo.lightning.fabric.plugins import FabricMegatronMixedPrecision from nemo.lightning.fabric.strategies import FabricMegatronStrategy from nemo.lightning.nemo_logger import NeMoLogger -from nemo.lightning.pytorch.callbacks.megatron_model_checkpoint import ModelCheckpoint +from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.optim import LRSchedulerModule, MegatronOptimizerModule, OptimizerModule, lr_scheduler from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index cb74b42a74c8..11e89a468c76 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -2,7 +2,7 @@ import os from collections import defaultdict from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Dict, Generator, Optional, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, Dict, Generator, Mapping, Optional, Protocol, TypeVar import torch from torch import nn @@ -472,3 +472,42 @@ def get_safe(param_id): 
optim_state_to_sharding_state(optimizer_state_dict["optimizer"], id_to_sharded_param_map) return optimizer_state_dict + + +def load_model_state_dict(megatron_parallel, checkpoint: Mapping[str, Any], strict: bool = True) -> None: + from megatron.core import parallel_state + + for index, module in enumerate(megatron_parallel): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + if "state_dict" in checkpoint: + checkpoint_state_dict = checkpoint["state_dict"][f"model_{index}"] + else: + checkpoint_state_dict = checkpoint[f"model_{index}"] + else: + if "state_dict" in checkpoint: + checkpoint_state_dict = checkpoint["state_dict"] + else: + checkpoint_state_dict = checkpoint + + n_nesting = 0 + mcore_model = megatron_parallel.module + while hasattr(mcore_model, "module"): + mcore_model = mcore_model.module + n_nesting += 1 + + _state_dict = {} + for key, value in checkpoint_state_dict.items(): + # Count the number of "module." at the start of the key + count, _key = 0, key + while _key.startswith("module."): + _key = _key[len("module.") :] + count += 1 + + # Adjust the number of "module." prefixes + if count < n_nesting: + to_add = "module." * (n_nesting - count) + _state_dict[f"{to_add}{key}"] = value + elif count > n_nesting: + to_remove = "module." * (count - n_nesting) + _state_dict[key[len(to_remove) :]] = value + module.load_state_dict(_state_dict, strict=strict) diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index a53cee1c75e8..a662386a9119 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -296,48 +296,7 @@ def load_checkpoint( def load_module_state_dict( self, module: Module, state_dict: Dict[str, Union[Any, Tensor]], strict: bool = True ) -> None: - from megatron.core import parallel_state - - for index, p_module in enumerate(module): - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: - if "state_dict" in state_dict: - checkpoint_state_dict = state_dict["state_dict"][f"model_{index}"] - else: - checkpoint_state_dict = state_dict[f"model_{index}"] - else: - if "state_dict" in state_dict: - checkpoint_state_dict = state_dict["state_dict"] - else: - checkpoint_state_dict = state_dict - - mcore_model = p_module.module - while hasattr(mcore_model, "module"): - mcore_model = mcore_model.module - - current = module[0] - n_nesting = 0 - while current != mcore_model: - current = current.module - n_nesting += 1 - - _state_dict = {} - for key, value in checkpoint_state_dict.items(): - # Count the number of "module." at the start of the key - count, _key = 0, key - while _key.startswith("module."): - _key = _key[len("module.") :] - count += 1 - - # Adjust the number of "module." prefixes - if count < n_nesting: - to_add = "module." * (n_nesting - count) - _state_dict[f"{to_add}{key}"] = value - elif count > n_nesting: - to_remove = "module." 
* (count - n_nesting) - _state_dict[key[len(to_remove) :]] = value - checkpoint_state_dict = _state_dict - - p_module.load_state_dict(checkpoint_state_dict, strict=strict) + _strategy_lib.load_model_state_dict(module, state_dict, strict=strict) @contextmanager def megatron_context(self) -> Generator[None, None, None]: diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index b582e4a6b7dd..51cd639f4dc3 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -46,7 +46,7 @@ def construct_extra(cls, trainer: pl.Trainer) -> Dict[str, Any]: return extra -class MegatronCheckpointIO(CheckpointIO): +class MegatronCheckpointIO(CheckpointIO, IOMixin): """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints respectively, common for most use cases. diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 919224d5b9f6..386b9d5070f9 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -12,6 +12,7 @@ Iterable, Iterator, List, + Mapping, Optional, Protocol, Sequence, @@ -525,7 +526,7 @@ def sharded_state_dict(self, prefix: str = "") -> Dict[str, Any]: # virtual pipline rank must be set so that GPTModel returns the correct sharded state dict parallel_state.set_virtual_pipeline_model_parallel_rank(index) module_sharded_state_dict = self._module_sharded_state_dict(module) - sharded_state_dict[f"megatron_module_{index}"] = module_sharded_state_dict + sharded_state_dict[f"model_{index}"] = module_sharded_state_dict else: module_sharded_state_dict = self._module_sharded_state_dict(module) sharded_state_dict.update(module_sharded_state_dict) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index efed77663876..5ed783fdbefe 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -11,13 +11,14 @@ from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint from pytorch_lightning.loggers import Logger, TensorBoardLogger, WandbLogger +from nemo.lightning.io.mixin import IOMixin from nemo.lightning.pytorch.callbacks import ModelCheckpoint from nemo.utils import logging from nemo.utils.app_state import AppState @dataclass -class NeMoLogger: +class NeMoLogger(IOMixin): """Logger for NeMo runs. 
Args: @@ -219,6 +220,3 @@ def _setup_files_to_move(self, log_dir, app_state): app_state.files_to_move = files_to_move app_state.files_to_copy = self.files_to_copy - - def teardown(self): - pass diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index 1525ab21b835..ee0e777d739e 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,7 +1,9 @@ -from nemo.lightning.pytorch.callbacks.megatron_model_checkpoint import ModelCheckpoint +from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint +from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform +from nemo.lightning.pytorch.callbacks.nsys import NsysCallback +from nemo.lightning.pytorch.callbacks.peft import PEFT +from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback from nemo.lightning.pytorch.callbacks.progress import MegatronProgressBar -__all__ = [ - "MegatronProgressBar", - "ModelCheckpoint", -] + +__all__ = ["ModelCheckpoint", "ModelTransform", "PEFT", "NsysCallback", "MegatronProgressBar", "PreemptionCallback"] diff --git a/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py similarity index 98% rename from nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py rename to nemo/lightning/pytorch/callbacks/model_checkpoint.py index 4c0da66828a7..d0a1585f6293 100644 --- a/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -51,11 +51,13 @@ def __init__( save_best_model: bool = False, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation enable_nemo_ckpt_io: bool = True, + try_restore_best_ckpt: bool = True, **kwargs, ): self.save_best_model = save_best_model self.previous_best_path = "" self.enable_nemo_ckpt_io = enable_nemo_ckpt_io + self.try_restore_best_ckpt = try_restore_best_ckpt # Call the parent class constructor with the remaining kwargs. super().__init__( @@ -266,8 +268,9 @@ def on_train_end(self, trainer, pl_module): else: if os.path.isdir(self.best_model_path.split('.ckpt')[0]): self.best_model_path = self.best_model_path.split('.ckpt')[0] - self.best_model_path = trainer.strategy.broadcast(self.best_model_path) - trainer._checkpoint_connector.restore(self.best_model_path) + if self.try_restore_best_ckpt: + self.best_model_path = trainer.strategy.broadcast(self.best_model_path) + trainer._checkpoint_connector.restore(self.best_model_path) def _del_model_without_trainer(self, filepath: str) -> None: from nemo.utils.get_rank import is_global_rank_zero diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py new file mode 100644 index 000000000000..68b3db16f473 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -0,0 +1,98 @@ +from functools import wraps +from typing import Any, Callable, Optional, TypeVar + +import pytorch_lightning as pl +from torch import nn + +from nemo.lightning.io.mixin import IOMixin +from nemo.utils import logging + + +class ModelTransform(pl.Callback, IOMixin): + """ + A PyTorch Lightning callback that applies a model transformation function at the start of fitting or validation. + + This callback is designed to apply a transformation to the model when fitting or validation begins. 
+ This design allows for loading the original checkpoint first and then applying the transformation, + which is particularly useful for techniques like Parameter-Efficient Fine-Tuning (PEFT). + + The transformation function is expected to be defined on the LightningModule + as an attribute called 'model_transform'. + + Key Features: + - Applies transformation at the start of fit or validation, not during initialization. + - Allows loading of original checkpoints before transformation. + - Supports PEFT and similar techniques that modify model structure. + + Example: + >>> class MyLightningModule(pl.LightningModule): + ... def __init__(self): + ... super().__init__() + ... self.model = SomeModel() + ... self.model_transform = lambda m: SomePEFTMethod()(m) + ... + >>> model = MyLightningModule() + >>> # Load original checkpoint here if needed + >>> model.load_state_dict(torch.load('original_checkpoint.pth')) + >>> trainer = pl.Trainer(callbacks=[ModelTransform()]) + >>> # The model will be transformed when trainer.fit() or trainer.validate() is called + >>> trainer.fit(model) + + Note: + The transformation is applied only once, at the start of fitting or validation, + whichever comes first. This ensures that the model structure is modified before + any forward passes or parameter updates occur, but after the original weights + have been loaded. + """ + + def __init__(self): + super().__init__() + self.model_transform: Optional[Callable[[nn.Module], nn.Module]] = None + + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: + logging.info(f"Setting up ModelTransform for stage: {stage}") + + if hasattr(pl_module, 'model_transform'): + logging.info("Found model_transform attribute on pl_module") + self.model_transform = _call_counter(pl_module.model_transform) + pl_module.model_transform = self.model_transform + logging.info(f"Set model_transform to: {self.model_transform}") + else: + logging.info("No model_transform attribute found on pl_module") + + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + self._maybe_apply_transform(trainer) + + def _maybe_apply_transform(self, trainer): + if self._needs_to_call: + self.model_transform(trainer.model) + + @property + def _needs_to_call(self) -> bool: + return self.model_transform and self.model_transform.__num_calls__ == 0 + + +T = TypeVar('T', bound=Callable[..., Any]) + + +def _call_counter(func: T) -> T: + """ + A decorator that counts the number of times a function is called. + + This decorator wraps a function and adds a '__num_calls__' attribute to it, + which is incremented each time the function is called. + + Args: + func (Callable): The function to be wrapped. + + Returns: + Callable: The wrapped function with a call counter. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + wrapper.__num_calls__ += 1 + return func(*args, **kwargs) + + wrapper.__num_calls__ = 0 + return wrapper # type: ignore diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index c18722a607b4..d24d7fd974be 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -9,6 +9,26 @@ class NsysCallback(Callback, IOMixin): + """ + A PyTorch Lightning callback for NVIDIA Nsight Systems (Nsys) profiling. + + This callback enables profiling of specific steps during training using NVIDIA Nsys. 
+ It allows for precise control over when profiling starts and ends, which ranks are profiled, + and whether to generate detailed shape information. + + More info about nsys can be found [here](https://developer.nvidia.com/nsight-systems). + + Args: + start_step (int): Global batch to start profiling + end_step (int): Global batch to end profiling + ranks (List[int]): Global rank IDs to profile + gen_shape (bool): Generate model and kernel details including input shapes + + Example: + >>> callback = NsysCallback(start_step=100, end_step=200, ranks=[0, 1], gen_shape=True) + >>> trainer = Trainer(callbacks=[callback]) + """ + def __init__( self, start_step: int, @@ -16,13 +36,6 @@ def __init__( ranks: List[int] = [0], gen_shape: bool = False, ): - """ - Args: - start_step (int): Global batch to start profiling - end_step (int): Global batch to end profiling - ranks (List[int]): Global rank IDs to profile - gen_shape (bool): Generate model and kernel details including input shapes - """ assert type(start_step) == int, f'Nsys start_step must be of type int. Found: {type(start_step)}' self._nsys_profile_start_step = start_step @@ -54,6 +67,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + else: + torch.autograd.profiler.emit_nvtx().__enter__() def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) -> None: """PyTorch Lightning hook: @@ -63,7 +78,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) device = trainer.strategy.root_device if device.type == 'cuda': - print(f'batch idx: {batch_idx}') if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() + torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py new file mode 100644 index 000000000000..26325bf549d0 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -0,0 +1,261 @@ +import json +from abc import ABC, abstractmethod +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple + +import pytorch_lightning as pl +import torch.nn as nn +from lightning_fabric.utilities.types import _PATH +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO +from typing_extensions import override + +from nemo.lightning.io.pl import ckpt_to_dir +from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform +from nemo.utils import logging + +if TYPE_CHECKING: + from megatron.core.dist_checkpointing.mapping import ShardedStateDict + + +_ADAPTER_META_FILENAME = "adapter_metadata.json" + + +class PEFT(ABC, ModelTransform): + """Abstract base class for Parameter-Efficient Fine-Tuning (PEFT) methods. + + This class defines the interface for PEFT methods, which are used to fine-tune + large language models efficiently by modifying only a small subset of the model's + parameters. + + Example: + class MyPEFT(PEFT): + def transform(self, module, name=None, prefix=None): + # Implement the transform logic + pass + + + peft = MyPEFT() + peft_model = LargeLanguageModel(model_transform=peft) + """ + + @abstractmethod + def transform(self, module, name=None, prefix=None): + """Transform a single module according to the PEFT method. 
+ + This method is called for each module in the model during the PEFT application process. + It should be implemented by subclasses to define how individual modules are transformed + for the specific PEFT technique. + + Args: + module (nn.Module): The individual module to be transformed. + name (Optional[str]): The name of the module within the model structure. Defaults to None. + prefix (Optional[str]): A prefix to be added to the module name, typically used for + nested modules. Defaults to None. + + Returns: + nn.Module: The transformed module. This can be the original module with modifications, + a new module replacing the original, or the original module if no + transformation is needed for this specific module. + + Note: + This method is automatically called for each module in the model when the PEFT + instance is applied to the model using the __call__ method. + """ + raise NotImplementedError("The transform method should be implemented by subclasses.") + + def __call__(self, model: nn.Module) -> nn.Module: + """Apply the PEFT method to the entire model. + + This method freezes the model parameters and walks through the model + structure, applying the transform method to each module. + + Args: + model (nn.Module): The model to be fine-tuned. + + Returns: + nn.Module: The transformed model with PEFT applied. + """ + + model.freeze() + model.walk(self.transform) + + return model + + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + super().setup(trainer, pl_module, stage=stage) + + self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io) + trainer.strategy._checkpoint_io = self.wrapped_io + + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + needs_to_call = self._needs_to_call + self._maybe_apply_transform(trainer) + + # Check if we need to load the adapters + if needs_to_call and self.wrapped_io.adapter_ckpt_path is not None: + logging.info(f"Loading adapters from {self.wrapped_io.adapter_ckpt_path}") + adapter_state = self.wrapped_io.load_checkpoint(self.wrapped_io.adapter_ckpt_path) + trainer.strategy.load_model_state_dict(adapter_state, strict=False) + + def on_load_checkpoint( + self, trainer: pl.Trainer, pl_module: pl.LightningModule, checkpoint: Dict[str, Any] + ) -> None: + pl_module.strict_loading = False + + +class AdapterWrapper(nn.Module): + """Abstract base class for wrapping modules with adapters in Parameter-Efficient Fine-Tuning (PEFT). + + This class wraps a module and its associated adapter, providing methods for + managing the state dictionaries of both the main module and the adapter. It does not + implement the forward method, which must be implemented by concrete subclasses. + + Attributes: + to_wrap (nn.Module): The main module to be wrapped. + adapter (nn.Module): The adapter module to be applied. + + Note: + This class is abstract and cannot be instantiated directly. Subclasses must + implement the forward method. 
+ + Example: + class AdapterParallelAdd(AdapterWrapper): + def __init__(self, to_wrap, adapter): + super().__init__(to_wrap, adapter) + + def forward(self, x): + return self.to_wrap(x) + self.adapter(x) + + main_module = nn.Linear(100, 100) + adapter = nn.Linear(100, 100) + parallel_adapter = AdapterParallelAdd(main_module, adapter) + """ + + def __init__(self, to_wrap: nn.Module, adapter: nn.Module): + super(AdapterWrapper, self).__init__() + self.to_wrap = to_wrap + self.adapter = adapter + + def state_dict(self, destination=None, prefix='', keep_vars=False): + """Retrieve the state dictionary of the wrapped module and adapter. + + This method overrides the default state_dict behavior to include both + the main module's state and the adapter's state under a special 'adapters' key. + + Args: + destination (Optional[dict]): A dictionary to store the state. If None, a new + dictionary is created. Defaults to None. + prefix (str): A prefix added to parameter and buffer names. Defaults to ''. + keep_vars (bool): If True, returns variables instead of tensor values. + Defaults to False. + + Returns: + dict: The state dictionary containing both the main module and adapter states. + """ + + if destination is None: + destination = {} + + # Get state dict of the main module + main_state_dict = self.to_wrap.state_dict(destination, prefix, keep_vars) + + # Store adapter state dict under the special "adapters" key in the destination dict + adapter_state_dict = self.adapter.state_dict(None, prefix, keep_vars) + destination[f'{prefix}adapters'] = adapter_state_dict + return main_state_dict + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> "ShardedStateDict": + """Retrieve the sharded state dictionary of the wrapped module and adapter. + + This method is used for distributed checkpointing, combining the sharded states + of both the main module and the adapter. + + Args: + prefix (str): A prefix added to parameter and buffer names. Defaults to ''. + sharded_offsets (Tuple[Tuple[int, int, int]]): Offsets for sharded parameters. + Defaults to an empty tuple. + metadata (Optional[dict]): Additional metadata for the sharded state. + Defaults to None. + + Returns: + ShardedStateDict: The combined sharded state dictionary. + """ + sharded_state_dict = {} + sharded_state_dict.update(self.to_wrap.sharded_state_dict(prefix, sharded_offsets, metadata)) + sharded_state_dict.update(self.adapter.sharded_state_dict(f"{prefix}adapter.", sharded_offsets, metadata)) + return sharded_state_dict + + def load_state_dict(self, state_dict, strict=True): + """Load a state dictionary into the wrapped module and adapter. + + This method overrides the default load_state_dict behavior to handle + loading states for both the main module and the adapter. + + Args: + state_dict (dict): The state dictionary to load. + strict (bool): Whether to strictly enforce that the keys in state_dict + match the keys returned by this module's state_dict() + function. Defaults to True. 
+ """ + # Check if the 'adapters' key is present in the state_dict + if 'adapters' in state_dict: + adapter_state_dict = state_dict.pop('adapters') + else: + adapter_state_dict = {} + + # Load the main module state dict + self.to_wrap.load_state_dict(state_dict, strict) + + # Load the adapter module state dict if present + if adapter_state_dict: + self.adapter.load_state_dict(adapter_state_dict, strict) + + +class WrappedAdapterIO(_WrappingCheckpointIO): + model_ckpt_path: Optional[Path] = None + adapter_ckpt_path: Optional[Path] = None + + @override + def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: + assert self.checkpoint_io is not None + + key = "sharded_state_dict" if "sharded_state_dict" in checkpoint else "state_dict" + checkpoint[key] = dict(filter(lambda x: ".adapter." in x[0], checkpoint[key].items())) + + self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) + + from nemo.utils.get_rank import is_global_rank_zero + + if is_global_rank_zero(): + metadata = {"model_ckpt_path": str(self.model_ckpt_path)} + adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME + with open(adapter_meta_path, "w") as f: + json.dump(metadata, f) + + @override + def load_checkpoint( + self, path: _PATH, sharded_state_dict=None, map_location: Optional[Callable] = None + ) -> Dict[str, Any]: + assert self.checkpoint_io is not None + + adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME + if getattr(path, "adapter_path", None): + self.model_ckpt_path = path + self.adapter_ckpt_path = path.adapter_path + elif adapter_meta_path.exists(): + with open(adapter_meta_path, "r") as f: + metadata = json.load(f) + self.model_ckpt_path = Path(metadata['model_ckpt_path']) + self.adapter_ckpt_path = path + else: + self.model_ckpt_path = path + + # Note: this will include the Trainer-state of the model-checkpoint + model_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict, map_location) + + return model_ckpt diff --git a/nemo/lightning/pytorch/callbacks/preemption.py b/nemo/lightning/pytorch/callbacks/preemption.py new file mode 100644 index 000000000000..7f1dd94256d2 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/preemption.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import signal +from typing import Optional + +import torch +from pytorch_lightning.callbacks import Callback +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.utils import logging + + +class PreemptionCallback(Callback): + """ + PreemptionCallback checks for preemption during training at the end of every step. + Upon preemption, it signals the trainer to stop gracefully. + + Args: + sig (int, optional): The signal to listen for. Defaults to signal.SIGTERM. 
+ + Example: + >>> from nemo.lightning.pytorch.callbacks import PreemptionCallback + >>> callback = PreemptionCallback() + >>> trainer = Trainer(callbacks=[callback]) + """ + + def __init__(self, sig: Optional[int] = None): + self.sig = sig if sig is not None else signal.SIGTERM + self._interrupted = False + self._handler_context = None + self._preemption_supported = None + + def on_train_start(self, trainer: Trainer, pl_module) -> None: + if self.preemption_supported: + self._handler_context = self._preemption_handler() + self._handler_context.__enter__() + + def on_train_batch_start(self, trainer: Trainer, pl_module, batch, batch_idx: int) -> None: + if not self.preemption_supported: + self._preemption_supported = self._check_preemption_support() + if self.preemption_supported: + self._handler_context = self._preemption_handler() + self._handler_context.__enter__() + + def on_train_end(self, trainer: Trainer, pl_module) -> None: + if self._handler_context: + self._handler_context.__exit__(None, None, None) + + def on_train_batch_end(self, trainer: Trainer, pl_module, outputs, batch, batch_idx: int) -> None: + if self.interrupted: + logging.info("Preemption detected, signaling trainer to stop") + trainer.should_stop = True + + def on_exception(self, trainer: Trainer, pl_module, exception: BaseException) -> None: + if isinstance(exception, PreemptionException): + logging.info("Handling PreemptionException") + trainer.should_stop = True + + @contextlib.contextmanager + def _preemption_handler(self): + if not self.preemption_supported: + logging.warning("Preemption requires torch distributed to be initialized, preemption may be disabled") + yield + return + + original_handler = signal.getsignal(self.sig) + + def master_handler(signum, frame): + logging.info(f"Received signal {signum}, initiating graceful stop") + self._interrupted = True + raise PreemptionException("Preemption signal received") + + def ignoring_handler(signum, frame): + logging.debug(f"Received signal {signum} on non-master rank, ignoring") + + try: + private_rank = torch.distributed.get_rank() + signal.signal(self.sig, master_handler if private_rank == 0 else ignoring_handler) + yield + finally: + signal.signal(self.sig, original_handler) + + @property + def preemption_supported(self) -> bool: + if self._preemption_supported is None: + self._preemption_supported = self._check_preemption_support() + return self._preemption_supported + + def _check_preemption_support(self) -> bool: + return torch.distributed.is_available() and torch.distributed.is_initialized() + + @property + def interrupted(self) -> bool: + if not self.preemption_supported: + return False + interrupted = torch.tensor(self._interrupted, device=torch.cuda.current_device(), dtype=torch.int32) + torch.distributed.broadcast(interrupted, 0) + return bool(interrupted.item()) + + +class PreemptionException(Exception): + """Custom exception for preemption events.""" diff --git a/nemo/lightning/pytorch/optim/base.py b/nemo/lightning/pytorch/optim/base.py index 88a77328ef9b..8e857a156649 100644 --- a/nemo/lightning/pytorch/optim/base.py +++ b/nemo/lightning/pytorch/optim/base.py @@ -1,5 +1,6 @@ import types from abc import ABC, abstractmethod +from copy import deepcopy from typing import List, Optional import pytorch_lightning as L @@ -134,7 +135,7 @@ def custom_configure_optimizers(lightning_module_self, megatron_parallel=None): if hasattr(self, "__io__") and hasattr(model, "__io__"): if hasattr(model.__io__, "optim"): - model.__io__.optim = self.__io__ + 
model.__io__.optim = deepcopy(self.__io__)
 
     @abstractmethod
     def optimizers(self, model) -> List[Optimizer]:
diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py
index 99e7245d60dd..0f6dc89a7076 100644
--- a/nemo/lightning/pytorch/strategies.py
+++ b/nemo/lightning/pytorch/strategies.py
@@ -33,7 +33,7 @@
 from nemo.lightning import _strategy_lib, io
 from nemo.lightning.io.pl import MegatronCheckpointIO
 from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction
-from nemo.lightning.pytorch.callbacks import MegatronProgressBar
+from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform
 
 if TYPE_CHECKING:
     from nemo.lightning.pytorch.plugins.data_sampler import DataSampler
@@ -106,9 +106,9 @@ def __init__(
         **kwargs,
     ) -> None:
         super().__init__(
-            parallel_devices,
-            cluster_environment,
-            checkpoint_io,
+            parallel_devices=parallel_devices,
+            cluster_environment=cluster_environment,
+            checkpoint_io=checkpoint_io,
             find_unused_parameters=find_unused_parameters,
             **kwargs,
         )
@@ -193,6 +193,18 @@ def setup(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None:
         self.setup_megatron_parallel(trainer, setup_optimizers=setup_optimizers)
         self.setup_precision_plugin()
 
+        if getattr(self.lightning_module, "model_transform", None):
+            # Ensure the ModelTransform callback is passed to the trainer.
+            # Callback.setup() is called before the current Strategy.setup(), so we can
+            # only perform a check here; adding the callback here would not be sufficient
+            if not any(isinstance(cb, ModelTransform) for cb in trainer.callbacks):
+                raise ValueError(
+                    "You specified a model_transform function in the model, but no "
+                    "ModelTransform callback was found in the trainer. "
+                    "Please initialize the trainer with "
+                    "`trainer = Trainer(..., callbacks=[ModelTransform()])`"
+                )
+
         if trainer.num_sanity_val_steps > 1 and self.pipeline_model_parallel_size > 1:
             # TODO: log here
             trainer.num_sanity_val_steps = 0
@@ -522,53 +534,21 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
 
     def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None:
         assert self.megatron_parallel is not None
-        from megatron.core import parallel_state
 
-        for index, module in enumerate(self.megatron_parallel):
-            if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
-                checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}']
-            else:
-                checkpoint_state_dict = checkpoint['state_dict']
-
-            mcore_model = self.lightning_module.module
-            while hasattr(mcore_model, "module"):
-                mcore_model = mcore_model.module
-
-            current = self.model[0]
-            n_nesting = 0
-            while current != mcore_model:
-                current = current.module
-                n_nesting += 1
-
-            _state_dict = {}
-            for key, value in checkpoint_state_dict.items():
-                # Count the number of "module." at the start of the key
-                count, _key = 0, key
-                while _key.startswith("module."):
-                    _key = _key[len("module.") :]
-                    count += 1
-
-                # Adjust the number of "module." prefixes
-                if count < n_nesting:
-                    to_add = "module." * (n_nesting - count)
-                    _state_dict[f"{to_add}{key}"] = value
-                elif count > n_nesting:
-                    to_remove = "module."
* (count - n_nesting) - _state_dict[key[len(to_remove) :]] = value - checkpoint_state_dict = _state_dict - - module.load_state_dict(checkpoint_state_dict, strict=strict) + _strategy_lib.load_model_state_dict(self.megatron_parallel, checkpoint, strict=strict) @property @override def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = MegatronCheckpointIO() - elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): - self._checkpoint_io.checkpoint_io = MegatronCheckpointIO() return self._checkpoint_io + @checkpoint_io.setter + def checkpoint_io(self, io: CheckpointIO) -> None: + self._checkpoint_io = io + def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: for fn_name in [f"{step_type}_data_step", "data_step"]: if hasattr(self.lightning_module, fn_name): diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index f762d345ed3b..fc2e21eb37fd 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -1,16 +1,24 @@ -from pathlib import Path +import os +from pathlib import Path, PosixPath, WindowsPath from typing import Optional, Union import lightning_fabric as fl import pytorch_lightning as pl from nemo.lightning import io +from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging from nemo.utils.app_state import AppState from nemo.utils.model_utils import uninject_model_parallel_rank +# Dynamically inherit from the correct Path subclass based on the operating system. +if os.name == 'nt': + BasePath = WindowsPath +else: + BasePath = PosixPath -class Resume: + +class Resume(IOMixin): def nemo_path(self, model) -> Optional[Path]: raise NotImplementedError @@ -34,6 +42,7 @@ def __init__( path: Optional[str] = None, ## old resume_from_checkpoint dirpath: Optional[str] = None, ## optional path to checkpoint directory import_path: Optional[str] = None, ## for importing from hf or other checkpoint formats + adapter_path: Optional[str] = None, resume_if_exists: bool = False, resume_past_end: bool = False, resume_ignore_no_checkpoint: bool = False, @@ -66,6 +75,7 @@ def __init__( self.path = path self.dirpath = dirpath self.import_path = import_path + self.adapter_path = adapter_path self.resume_if_exists = resume_if_exists self.resume_past_end = resume_past_end self.resume_ignore_no_checkpoint = resume_ignore_no_checkpoint @@ -76,7 +86,10 @@ def nemo_path(self, model=None) -> Optional[Path]: if self.import_path: if model is None: raise ValueError("Model is needed to import checkpoint from HF or other non-NeMo checkpoint format.") - return model.import_ckpt(self.import_path) + output = model.import_ckpt(self.import_path) + if self.adapter_path: + return AdapterPath(output, adapter_path=Path(self.adapter_path)) + return output ### refactored from exp_manager checkpoint = None @@ -131,6 +144,17 @@ def nemo_path(self, model=None) -> Optional[Path]: checkpoint = last_checkpoints[0] if checkpoint: + if self.adapter_path: + return AdapterPath(checkpoint, adapter_path=Path(self.adapter_path)) return Path(checkpoint) return None + + +class AdapterPath(BasePath): + adapter_path: Optional[Path] + + def __new__(cls, *args, adapter_path: Optional[Path] = None, **kwargs): + output = super().__new__(cls, *args, **kwargs) + output.adapter_path = adapter_path + return output diff --git a/setup.py b/setup.py index 6c82ef803174..292be13e65df 100644 --- a/setup.py +++ b/setup.py @@ -286,4 +286,9 @@ def finalize_options(self): keywords=__keywords__, # Custom commands. 
cmdclass={'style': StyleCommand}, + entry_points={ + "sdk.factories": [ + "llm = nemo.collections.llm", + ], + }, ) diff --git a/tests/lightning/pytorch/callbacks/__init__.py b/tests/lightning/pytorch/callbacks/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/lightning/pytorch/callbacks/test_model_transform.py b/tests/lightning/pytorch/callbacks/test_model_transform.py new file mode 100644 index 000000000000..9894f7d7bc58 --- /dev/null +++ b/tests/lightning/pytorch/callbacks/test_model_transform.py @@ -0,0 +1,48 @@ +import pytest +import pytorch_lightning as pl +from torch import nn + +from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform + + +class TestModelTransformCallback: + @pytest.fixture + def callback(self): + return ModelTransform() + + @pytest.fixture + def pl_module(self): + return MockLightningModule() + + @pytest.fixture + def trainer(self): + return pl.Trainer() + + def test_setup_stores_transform(self, callback, pl_module, trainer, caplog): + callback.setup(trainer, pl_module, 'fit') + + assert callback.model_transform is not None, "callback.model_transform should be set after setup" + assert hasattr( + callback.model_transform, '__num_calls__' + ), "callback.model_transform should have __num_calls__ attribute" + assert callback.model_transform.__num_calls__ == 0, "callback.model_transform should not have been called yet" + assert pl_module.model_transform == callback.model_transform, "pl_module.model_transform should be updated" + + +class MockModel(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + + +class MockLightningModule(pl.LightningModule): + def __init__(self): + super().__init__() + self.model = MockModel() + self.model_transform = lambda m: nn.Sequential(m, nn.ReLU()) + + def forward(self, x): + return self.model(x) diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py new file mode 100644 index 000000000000..e8734ad1c1ac --- /dev/null +++ b/tests/lightning/pytorch/callbacks/test_nsys.py @@ -0,0 +1,195 @@ +from unittest.mock import MagicMock, patch + +import pytest +import torch +from nemo.lightning.pytorch.callbacks.nsys import NsysCallback + + +class TestNsysCallback: + @pytest.fixture(autouse=True) + def setup_mocks(self): + self.cuda_mock = patch('torch.cuda') + self.cudart_mock = patch('torch.cuda.cudart') + self.emit_nvtx_mock = patch('torch.autograd.profiler.emit_nvtx') + self.get_rank_mock = patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + + self.cuda_mock.start() + self.cudart_mock.start() + self.emit_nvtx_mock.start() + self.get_rank_mock.start() + + # Mock CUDA availability + torch.cuda.is_available = MagicMock(return_value=True) + torch.cuda.current_device = MagicMock(return_value=0) + + yield + + self.cuda_mock.stop() + self.cudart_mock.stop() + self.emit_nvtx_mock.stop() + self.get_rank_mock.stop() + + @pytest.fixture + def mock_trainer(self): + trainer = MagicMock() + trainer.strategy.root_device.type = 'cuda' + return trainer + + @pytest.fixture + def mock_pl_module(self): + return MagicMock() + + def test_init_valid_params(self): + """Test initialization with valid parameters.""" + callback = NsysCallback(start_step=10, end_step=20, ranks=[0, 1], gen_shape=True) + assert callback._nsys_profile_start_step == 10 + assert callback._nsys_profile_end_step == 20 + assert callback._nsys_profile_ranks == [0, 1] + assert 
callback._nsys_profile_gen_shape == True + + def test_init_invalid_params(self): + """Test initialization with invalid parameters.""" + with pytest.raises(AssertionError): + NsysCallback(start_step='10', end_step=20) + + with pytest.raises(AssertionError): + NsysCallback(start_step=10, end_step='20') + + with pytest.raises(AssertionError): + NsysCallback(start_step=20, end_step=10) + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + @patch('torch.autograd.profiler.emit_nvtx') + def test_on_train_batch_start_profiling( + self, mock_emit_nvtx, mock_cudart, mock_get_rank, mock_trainer, mock_pl_module + ): + """Test on_train_batch_start when profiling should start.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=10, end_step=20, ranks=[0], gen_shape=True) + + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) + + mock_cudart().cudaProfilerStart.assert_called_once() + mock_emit_nvtx.assert_called_once_with(record_shapes=True) + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + def test_on_train_batch_start_no_profiling(self, mock_cudart, mock_get_rank, mock_trainer, mock_pl_module): + """Test on_train_batch_start when profiling should not start.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 9) + + mock_cudart().cudaProfilerStart.assert_not_called() + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + @patch('torch.autograd.profiler.emit_nvtx') + def test_on_train_batch_end_profiling( + self, mock_emit_nvtx, mock_cudart, mock_get_rank, mock_trainer, mock_pl_module + ): + """Test on_train_batch_end when profiling should end.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + + callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) + + mock_cudart().cudaProfilerStop.assert_called_once() + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + @patch('torch.autograd.profiler.emit_nvtx') + def test_on_train_batch_end_no_profiling( + self, mock_emit_nvtx, mock_cudart, mock_get_rank, mock_trainer, mock_pl_module + ): + """Test on_train_batch_end when profiling should not end.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + + callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 19) + + mock_cudart().cudaProfilerStop.assert_not_called() + + def test_non_cuda_device(self, mock_trainer, mock_pl_module): + """Test behavior when the device is not CUDA.""" + mock_trainer.strategy.root_device.type = 'cpu' + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) + callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) + + # No exceptions should be raised, and no profiling calls should be made + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + def test_rank_not_in_profile_ranks(self, mock_get_rank, mock_trainer, mock_pl_module): + """Test behavior when the current rank is not in the profile ranks.""" + mock_get_rank.return_value = 1 + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) + 
callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) + + # No profiling calls should be made + + @pytest.mark.parametrize( + "start_step,end_step,batch_idx,expected_call", + [ + (10, 20, 9, False), + (10, 20, 10, True), + (10, 20, 15, False), + (10, 20, 20, False), + (10, 20, 21, False), + ], + ) + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + @patch('torch.autograd.profiler.emit_nvtx') + def test_profiling_range( + self, + mock_emit_nvtx, + mock_cudart, + mock_get_rank, + start_step, + end_step, + batch_idx, + expected_call, + mock_trainer, + mock_pl_module, + ): + """Test profiling behavior across different batch indices.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=start_step, end_step=end_step, ranks=[0]) + + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, batch_idx) + + if expected_call: + mock_cudart().cudaProfilerStart.assert_called_once() + mock_emit_nvtx.assert_called_once() + else: + mock_cudart().cudaProfilerStart.assert_not_called() + mock_emit_nvtx.assert_not_called() + + @patch('nemo.lightning.pytorch.callbacks.nsys.get_rank') + @patch('torch.cuda.cudart') + def test_single_profile_range(self, mock_cudart, mock_get_rank, mock_trainer, mock_pl_module): + """Test behavior with a single profile range.""" + mock_get_rank.return_value = 0 + callback = NsysCallback(start_step=10, end_step=40, ranks=[0]) + + # Ensure the device type is 'cuda' + mock_trainer.strategy.root_device.type = 'cuda' + + # Start of range + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) + assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was not called" + + # Middle of range + callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 25) + assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was called again" + + # End of range + callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 40) + assert mock_cudart().cudaProfilerStop.call_count == 1, "cudaProfilerStop was not called" diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py new file mode 100644 index 000000000000..81dc7f85bc08 --- /dev/null +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -0,0 +1,68 @@ +from unittest.mock import MagicMock, patch + +import torch.nn as nn +from nemo.collections.llm import fn +from nemo.lightning.pytorch.callbacks.peft import PEFT, WrappedAdapterIO + + +class TestPEFT: + class DummyPEFT(PEFT): + def transform(self, module, name=None, prefix=None): + return module # No-op transform for testing + + class DummyModel(nn.Module, fn.FNMixin): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 10) + self.conv = nn.Conv2d(3, 3, 3) + + def test_peft_call(self): + model = self.DummyModel() + peft = self.DummyPEFT() + + transformed_model = peft(model) + + assert transformed_model.linear.weight.requires_grad == False + assert transformed_model.conv.weight.requires_grad == False + + def test_peft_setup(self): + peft = self.DummyPEFT() + trainer = MagicMock() + pl_module = MagicMock() + + pl_module.model_transform = peft + peft.setup(trainer, pl_module, "fit") + + assert isinstance(trainer.strategy._checkpoint_io, WrappedAdapterIO) + assert peft.model_transform is not None + assert peft._needs_to_call is True + + @patch('nemo.lightning.pytorch.callbacks.peft.logging') + def test_peft_on_train_epoch_start_with_adapter(self, mock_logging): + peft = 
self.DummyPEFT() + trainer = MagicMock() + pl_module = MagicMock() + pl_module.model_transform = peft + + peft.setup(trainer, pl_module, "fit") + + assert peft.model_transform is not None + assert peft._needs_to_call is True + + peft.wrapped_io = MagicMock() + peft.wrapped_io.adapter_ckpt_path = "dummy_path" + peft.wrapped_io.load_checkpoint.return_value = {"dummy_state": "dummy_value"} + peft.on_train_epoch_start(trainer, pl_module) + + mock_logging.info.assert_called_once_with("Loading adapters from dummy_path") + trainer.strategy.load_model_state_dict.assert_called_once_with({"dummy_state": "dummy_value"}, strict=False) + + def test_peft_on_load_checkpoint(self): + peft = self.DummyPEFT() + trainer = MagicMock() + pl_module = MagicMock() + checkpoint = {} + + peft.on_load_checkpoint(trainer, pl_module, checkpoint) + + assert pl_module.strict_loading == False diff --git a/tests/lightning/pytorch/callbacks/test_preemption.py b/tests/lightning/pytorch/callbacks/test_preemption.py new file mode 100644 index 000000000000..5fcb4a1458ee --- /dev/null +++ b/tests/lightning/pytorch/callbacks/test_preemption.py @@ -0,0 +1,114 @@ +import logging +import signal +from unittest.mock import MagicMock, PropertyMock, patch + +import pytest +import torch +from pytorch_lightning import Trainer + +from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback, PreemptionException + + +class TestPreemptionCallback: + + @pytest.fixture + def callback(self): + return PreemptionCallback() + + @pytest.fixture + def mock_trainer(self): + trainer = MagicMock(spec=Trainer) + trainer.should_stop = False + return trainer + + def test_init(self, callback): + assert callback.sig == signal.SIGTERM + assert not callback._interrupted + assert callback._handler_context is None + + def test_custom_signal(self): + custom_callback = PreemptionCallback(sig=signal.SIGUSR1) + assert custom_callback.sig == signal.SIGUSR1 + + @pytest.mark.parametrize("initially_supported,becomes_supported", [(False, True), (False, False), (True, True)]) + def test_on_train_batch_start_distributed_init( + self, callback, mock_trainer, initially_supported, becomes_supported + ): + with ( + patch.object(PreemptionCallback, '_check_preemption_support') as mock_check, + patch.object(callback, '_preemption_handler') as mock_handler, + ): + + mock_check.side_effect = [initially_supported, becomes_supported] + + callback.on_train_start(mock_trainer, None) + callback.on_train_batch_start(mock_trainer, None, None, 0) + + expected_call_count = 1 if initially_supported else (1 if becomes_supported else 0) + assert mock_handler.call_count == expected_call_count + + if initially_supported: + mock_handler.assert_called_once_with() + elif becomes_supported: + mock_handler.assert_called_once_with() + else: + mock_handler.assert_not_called() + + @pytest.mark.parametrize( + "is_supported,interrupted,expected", + [ + (True, True, True), + (True, False, False), + (False, True, False), + (False, False, False), + ], + ) + def test_interrupted_property(self, callback, is_supported, interrupted, expected): + with ( + patch.object(PreemptionCallback, '_check_preemption_support', return_value=is_supported), + patch('torch.distributed.broadcast'), + patch('torch.tensor', return_value=torch.tensor(interrupted)), + patch('torch.cuda.is_available', return_value=True), + patch('torch.cuda.current_device', return_value=0), + ): + callback._interrupted = interrupted + assert callback.interrupted == expected + + def test_on_train_start(self, callback, mock_trainer): + 
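+        """The preemption handler should be installed on train start only when preemption is supported."""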
with ( + patch.object(PreemptionCallback, 'preemption_supported', new_callable=PropertyMock) as mock_supported, + patch.object(callback, '_preemption_handler') as mock_handler, + ): + + # Test when preemption is supported + mock_supported.return_value = True + callback.on_train_start(mock_trainer, None) + mock_handler.assert_called_once() + mock_handler.reset_mock() + + # Test when preemption is not supported + mock_supported.return_value = False + callback.on_train_start(mock_trainer, None) + mock_handler.assert_not_called() + + def test_on_train_end(self, callback, mock_trainer): + mock_context = MagicMock() + callback._handler_context = mock_context + callback.on_train_end(mock_trainer, None) + mock_context.__exit__.assert_called_once_with(None, None, None) + + @pytest.mark.parametrize("interrupted", [True, False]) + def test_on_train_batch_end(self, callback, mock_trainer, interrupted): + with patch.object(PreemptionCallback, 'interrupted', new_callable=lambda: property(lambda self: interrupted)): + callback.on_train_batch_end(mock_trainer, None, None, None, 0) + assert mock_trainer.should_stop == interrupted + + def test_on_exception_preemption(self, callback, mock_trainer): + exception = PreemptionException("Test preemption") + callback.on_exception(mock_trainer, None, exception) + assert mock_trainer.should_stop + + def test_on_exception_other(self, callback, mock_trainer): + exception = ValueError("Some other exception") + callback.on_exception(mock_trainer, None, exception) + assert not mock_trainer.should_stop diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index fafd25e49f5a..e504c7eb5c7c 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -1,4 +1,5 @@ from collections import defaultdict +from unittest.mock import MagicMock import pytest from megatron.core import parallel_state @@ -123,13 +124,14 @@ def test_add_callbacks(self) -> None: assert callback in callback_connector.callbacks["on_megatron_step_start"] assert callback in callback_connector.callbacks["on_megatron_microbatch_start"] - def test_event(self, mocker) -> None: + def test_event(self) -> None: callback_connector = mp.CallbackConnector() callback = TestCallback() callback_connector.add(callback) - mocker.spy(callback, "on_megatron_step_start") - mocker.spy(callback, "on_megatron_microbatch_start") + # Replace mocker.spy with manual mocking + callback.on_megatron_step_start = MagicMock() + callback.on_megatron_microbatch_start = MagicMock() callback_connector.event("on_megatron_step_start") callback_connector.event("on_megatron_microbatch_start") From 35ce666bbf10eff47fc05e08fafb5fac4a56585a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:04:32 -0700 Subject: [PATCH 02/26] Akoumparouli/mistral import instruct chat template fix (#9567) * use bf16 by defualt mistral conv Signed-off-by: Alexandros Koumparoulis * add chat template Signed-off-by: Alexandros Koumparoulis * use capitalized role names Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Marc Romeyn --- .../convert_mistral_7b_hf_to_nemo.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index cb11bb5da564..3a72661499bf 100644 --- 
a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -54,7 +54,7 @@ def get_args(): help="Path to Huggingface Mistral-7b checkpoints", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") - parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--precision", type=str, default="bf16", help="Model precision") args = parser.parse_args() return args @@ -167,7 +167,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) @@ -329,6 +329,22 @@ def convert(args): model = model.to(dtype=dtype) model.cfg.use_cpu_initialization = False + if getattr(tokenizer, 'chat_template', None) is not None: + import hashlib + + assert ( + hashlib.md5(tokenizer.chat_template.encode('utf-8')).hexdigest() == "0b629f783db54e02509999196956ff40" + ), "Got unkown chat template" + from omegaconf import OmegaConf, open_dict + + with open_dict(model.cfg): + model.cfg.tokenizer.chat_template = OmegaConf.create( + { + 'prefix': "{_bos_}", + 'roles': {'User': "[INST] {_content_} [/INST]", 'Assistant': "{_content_}{_eos_}"}, + } + ) + model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') From d481674c988fa089c6b4d8c0133e6a3e79cc2261 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:05:04 -0700 Subject: [PATCH 03/26] Remove .cuda calls, use device isntead (#9602) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/megatron_parallel.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 386b9d5070f9..71d9c87f2fe0 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -49,7 +49,7 @@ def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT: batch = batch[0] if isinstance(batch, dict): - batch = {k: v.cuda() for k, v in batch.items()} + batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()} return batch @@ -182,7 +182,7 @@ def __init__( for i, model_module in enumerate(_pipeline): if not cpu: - model_module.cuda(torch.cuda.current_device()) + model_module.cuda(torch.cuda.current_device(), non_blocking=True) for param in model_module.parameters(): set_defaults_if_not_set_tensor_model_parallel_attributes(param) @@ -300,7 +300,7 @@ def forward( if forward_only: loss_mean = cast(torch.Tensor, []) else: - loss_mean = torch.tensor(0.0).cuda() + loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) self.callbacks.event("on_megatron_log_step_end", **context) self.callbacks.event("on_megatron_step_end", **context) @@ -1018,7 +1018,7 @@ def forward( loss_sum_and_ub_size_all_gpu = torch.cat( [ loss_sum_for_ub.clone().detach().view(1), - torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(), + torch.tensor([num_valid_tokens_in_ub], device=torch.cuda.current_device()).clone().detach(), ] ) torch.distributed.all_reduce(loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group()) @@ -1045,11 +1045,11 @@ def reduce(self, losses_reduced_per_micro_batch) -> 
torch.Tensor: loss_sum = ( torch.vstack(loss_sum_tensors_list).sum(dim=0) if len(loss_sum_tensors_list) > 0 - else torch.tensor([0.0, 0.0]).cuda() + else torch.tensor([0.0, 0.0], device=torch.cuda.current_device()) ) return loss_sum - return torch.tensor(0.0).cuda() + return torch.tensor(0.0, device=torch.cuda.current_device()) def masked_token_loss(tensor: Tensor, mask: Tensor): From 10768ae18dc10499479a532e7ca0a6733b2ce9d3 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 5 Jul 2024 00:35:26 -0700 Subject: [PATCH 04/26] fix converter defautl args (#9565) * fix converter defautl args Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../convert_mixtral_hf_to_nemo.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index 8183b0d142c1..1bf23224357f 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -50,11 +50,17 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to Huggingface Mixtral checkpoints", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface Mixtral checkpoints", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") - valid_precision_values = [16, '16', 'bf16', '16-mixed', 'bf16-mixed', 32, '32'] - parser.add_argument("--precision", type=str, default="32", choices=valid_precision_values, help="Model precision") + valid_precision_values = [16, '16', 'bf16', '16-mixed', 'bf16-mixed'] + parser.add_argument( + "--precision", type=str, default="bf16", choices=valid_precision_values, help="Model precision" + ) parser.add_argument('--low-ram', action='store_true') parser.add_argument('--tmp-dir', default='/tmp/mixtral_ckpt_parts/') args = parser.parse_args() @@ -185,7 +191,7 @@ def make_trainer(args, nemo_config): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) From d4a32d0dea3d7201defdad09967b4536fa56e672 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 5 Jul 2024 01:43:26 -0700 Subject: [PATCH 05/26] mixtral export (#9603) Signed-off-by: Alexandros Koumparoulis --- nemo/collections/llm/gpt/model/mixtral.py | 119 ++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index af1b73dd9109..6256b67515ee 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -186,3 +186,122 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): ) def _import_moe_w1_w3(gate_proj, up_proj): return torch.cat((gate_proj, up_proj), axis=0) + + +@io.model_exporter(MixtralModel, "hf") +class HFMixtralExporter(io.ModelConnector[MixtralModel, "MixtralForCausalLM"]): + def init(self) -> "MixtralForCausalLM": + 
from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + # TODO: Make it work with lazy init + # with torch.device("meta"): + # target = self.init() + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + # TODO: Make sure we don't need to do this + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight", + # MoE + "decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight": "model.layers.*.block_sparse_moe.experts.*.w2.weight", + "decoder.layers.*.mlp.router.weight": "model.layers.*.block_sparse_moe.gate.weight", + # lm-head + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_moe_w1_w3]) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "MixtralConfig": + source: MixtralConfig7B = io.load_ckpt(str(self)).model.config + + from transformers import MixtralConfig as HfMixtralConfig + + return HfMixtralConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + max_position_embeddings=source.max_position_embeddings, + seq_length=source.max_position_embeddings, + # RoPe + rope_theta=source.rotary_base, + # transformer config + num_attention_heads=source.num_attention_heads, + num_key_value_heads=source.num_query_groups, + num_local_experts=config.num_moe_experts, + num_experts_per_tok=config.moe_router_topk, + # norm + rms_norm_eps=source.layernorm_epsilon, + # init + initializer_range=source.init_method_std, + # vocab + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = 
linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight", + target_key=( + "model.layers.*.block_sparse_moe.experts.*.w1.weight", + "model.layers.*.block_sparse_moe.experts.*.w3.weight", + ), +) +def _export_moe_w1_w3(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj From bdb4e89d9ac33d733f8ea7b21552628dda798825 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 5 Jul 2024 08:11:14 -0700 Subject: [PATCH 06/26] fix: remove non_blocking from PTL's .cuda call (#9618) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/megatron_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 71d9c87f2fe0..2f2308717004 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -182,7 +182,7 @@ def __init__( for i, model_module in enumerate(_pipeline): if not cpu: - model_module.cuda(torch.cuda.current_device(), non_blocking=True) + model_module.cuda(torch.cuda.current_device()) for param in model_module.parameters(): set_defaults_if_not_set_tensor_model_parallel_attributes(param) From 19b1d75b1819108d58684bcb9996867763684561 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:00:01 -0500 Subject: [PATCH 07/26] Alit/mamba tmp (#9612) * adding mamba support * fix import mixins * rm convert jamba * Apply isort and black reformatting Signed-off-by: JRD971000 * more cleanups * use GPT text gen * Apply isort and black reformatting Signed-off-by: JRD971000 * fixing gbs in TP convetor * Apply isort and black reformatting Signed-off-by: JRD971000 * add reqs * add tutorial * minor fix to tutorial * moving finetuning files Signed-off-by: arendu * moving finetuning files Signed-off-by: arendu * address comments * Apply isort and black reformatting Signed-off-by: JRD971000 * address comments * Apply isort and black reformatting Signed-off-by: JRD971000 * add mamba_tmp * remove mamba import * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Signed-off-by: arendu Co-authored-by: Ali Taghibakhshi Co-authored-by: JRD971000 Co-authored-by: arendu --- .../conf/megatron_mamba_config.yaml | 191 +++++ .../mamba_change_num_partition.py | 696 ++++++++++++++++++ .../megatron_mamba_finetuning_config.yaml | 315 ++++++++ .../conf/megatron_mamba_generate_config.yaml | 298 ++++++++ .../tuning/megatron_mamba_finetuning.py | 60 ++ .../tuning/megatron_mamba_generate.py | 69 ++ .../language_modeling/megatron_mamba_model.py | 91 +++ .../megatron_mamba_sft_model.py | 47 ++ .../common/text_generation_strategy.py | 3 + .../nlp/parts/mixins/nlp_adapter_mixins.py | 8 +- requirements/requirements_nlp.txt | 1 + .../convert_mamba2_pyt_to_nemo.py | 159 ++++ tutorials/llm/mamba/mamba.rst | 301 ++++++++ 13 files changed, 2236 insertions(+), 3 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_mamba_config.yaml create mode 100644 examples/nlp/language_modeling/mamba_change_num_partition.py create mode 100644 examples/nlp/language_modeling/tuning/conf/megatron_mamba_finetuning_config.yaml create mode 100644 examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml create mode 100644 
examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py create mode 100644 examples/nlp/language_modeling/tuning/megatron_mamba_generate.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py create mode 100644 scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py create mode 100644 tutorials/llm/mamba/mamba.rst diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml new file mode 100644 index 000000000000..f4f37d7c4ce0 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml @@ -0,0 +1,191 @@ +name: megatron_mamba +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 256000 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 56 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + mamba_ssm_ngroups: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. 
+ pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. 
Increase to save more memory.
+  # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
+  num_micro_batches_with_partial_activation_checkpoints: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+  # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
+  # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is
+  # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
+  # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'.
+  # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage.
+  activations_checkpoint_layers_per_pipeline: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+  # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later
+  # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than
+  # stage 0 and stage 2 to checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stage
+  # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints',
+  # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path.
+  sequence_parallel: False
+
+  data:
+    # Path to data must be specified by the user.
+    # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
+    # Or see example below:
+    # data_prefix:
+    #   - .5
+    #   - /raid/data/pile/my-gpt3_00_text_document
+    #   - .5
+    #   - /raid/data/pile/my-gpt3_01_text_document
+    data_prefix: [1.0, /path/to/data]
+    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
+    data_impl: mmap
+    splits_string: 900,50,50
+    seq_length: ${model.encoder_seq_length}
+    skip_warmup: True
+    num_workers: 0
+    dataloader_type: single # cyclic, LDDL
+    reset_position_ids: False # Reset position ids after end-of-document token
+    reset_attention_mask: False # Reset attention mask after end-of-document token
+    eod_mask_loss: False # Mask loss for the end of document tokens
+    masked_lm_prob: 0.15 # Probability of replacing a token with mask.
+    short_seq_prob: 0.1 # Probability of producing a short sequence.
+    ceil_to_power_2: True
+    get_attention_mask_from_fusion: True
+    pad_to_max_length: True
+
+  optim:
+    name: distributed_fused_adam
+    lr: 2e-4
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 50000
+      min_lr: 2e-5
diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py
new file mode 100644
index 000000000000..bc76b3215a74
--- /dev/null
+++ b/examples/nlp/language_modeling/mamba_change_num_partition.py
@@ -0,0 +1,696 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import tarfile +import tempfile +from argparse import ArgumentParser + +import torch +from omegaconf import open_dict +from pytorch_lightning import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel +from nemo.collections.nlp.parts.nlp_overrides import ( + NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.utils import logging +from nemo.utils.app_state import AppState + +""" +Usage: + +### Tensor Parallelism conversion ### + +# Megatron Mamba +python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ + --model_file= \ + --target_file= \ + --tensor_model_parallel_size=1 \ + --target_tensor_model_parallel_size=4 \ + --precision=bf16 \ + --d-model=4096 \ + --mamba-version=2 \ + --mamba2-n-groups=8 \ + --mamba2-head-dim=64 +""" + +tp_split_dim = { + 'word_embeddings.weight': 0, + 'norm.weight': -1, + 'final_norm.weight': -1, + 'output_layer.weight': 0, + # mamba1/2 + 'A_log': 0, + 'D': 0, + 'dt_bias': 0, + 'in_proj.weight': 0, + 'conv1d.weight': 0, + 'conv1d.bias': 0, + 'x_proj.weight': 1, + 'dt_proj.weight': 0, + 'dt_proj.bias': 0, + 'out_proj.weight': 1, + 'mixer.norm.weight': 0, + # mlp + 'linear_fc1.layer_norm_weight': -1, + 'linear_fc1.weight': 0, + 'linear_fc2.weight': 1, + # attention + 'self_attention.linear_proj.weight': 1, + 'self_attention.linear_qkv.layer_norm_weight': -1, + 'self_attention.linear_qkv.weight': 0, +} + + +def get_split_dim(tensor_name): + # norm.weight will match tensor_name of mixer.norm.weight and norm.weight, need to distinguish + if 'norm.weight' in tensor_name: + if 'mixer.norm.weight' in tensor_name: + return tp_split_dim['mixer.norm.weight'] + else: + return tp_split_dim['norm.weight'] + + for key in tp_split_dim.keys(): + if key in tensor_name: + return tp_split_dim[key] + raise Exception("Unknown tensor name {}".format(tensor_name)) + + +def split_tensor_for_tp(params, key, dim, tensor): + + tp_size = params.target_tensor_model_parallel_size + tensor_sliced = [] + if dim == -1: + tensor_sliced = [tensor for i in range(tp_size)] + else: + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + for x, z in zip(x_sliced, z_sliced): + tensor_sliced.append(torch.cat((x, z), dim=dim)) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + x, z, B, C, dt = torch.split( + tensor, + [ + params.mamba_d_inner, + params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_heads, + ], + dim=dim, + ) + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, 
C.shape[-1])) + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + dt_sliced = torch.chunk(dt, tp_size, dim=dim) + + tensor_sliced = [] + for x, z, B, C, dt in zip(x_sliced, z_sliced, B_sliced, C_sliced, dt_sliced): + tensor_sliced.append(torch.cat((x, z, B.flatten(0, 1), C.flatten(0, 1), dt), dim=dim)) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + x, B, C = torch.split( + tensor, + [ + params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + ], + dim=dim, + ) + if 'weight' in key: + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1])) + elif 'bias' in key: + B = torch.reshape(B, (-1, params.mamba_d_state)) + C = torch.reshape(C, (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + + tensor_sliced = [] + for x, B, C in zip(x_sliced, B_sliced, C_sliced): + tensor_sliced.append(torch.cat((x, B.flatten(0, 1), C.flatten(0, 1)), dim=dim)) + elif '_extra_state' in key: + pass + else: + tensor_sliced = torch.chunk(tensor, tp_size, dim=dim) + + return tensor_sliced + + +################# +### Utilities ### +################# + + +def force_cpu_model(cfg): + with open_dict(cfg): + # temporarily set to cpu + original_cpu_init = cfg.get('use_cpu_initialization', False) + if 'megatron_amp_O2' in cfg: + amp_o2_key = 'megatron_amp_O2' + original_amp_o2 = cfg.megatron_amp_O2 + elif 'megatron_amp_02' in cfg: + amp_o2_key = 'megatron_amp_02' + original_amp_o2 = cfg.megatron_amp_02 + else: + amp_o2_key, original_amp_o2 = None, None + + # Set new values + cfg.use_cpu_initialization = True + if amp_o2_key is not None: + cfg[amp_o2_key] = False + + # Disable sequence parallelism - Not disabling this gives error when converting the the model to TP=1 + original_sequence_parallel = cfg.get('sequence_parallel', None) + cfg.sequence_parallel = False + + # Setup restore dict + restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2 + if amp_o2_key is not None: + restore_dict[amp_o2_key] = original_amp_o2 + if original_sequence_parallel is not None: + restore_dict['sequence_parallel'] = original_sequence_parallel + + return cfg, restore_dict + + +def restore_model_config(cfg, original_dict): + with open_dict(cfg): + for key, val in original_dict.items(): + logging.info(f"Restoring model config key ({key}) from {cfg[key]} to original value of {val}") + cfg[key] = val + return cfg + + +def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path): + """ + Function to write the given TP PP split to NeMo File. + + Save each of the TP ranks in reverse order + This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved + The final rank will then save a new NeMo file with all other ranks inside. + + Args: + model: The model corresponding to the current TP PP split. Contains partial parameters. + splits: Nested List of tensors containing the TP splits of the current model given current PP rank. + Indexed as splits[idx][tp_rank]. + app_state: AppState object. + tp_size: The global tensor-parallel size of the final model. 
+        pp_rank: The local pipeline parallel rank of the final model.
+        write_path: The path to save the NeMo file.
+    """
+    for tp_rank in range(tp_size - 1, -1, -1):
+        app_state.pipeline_model_parallel_rank = pp_rank
+        app_state.tensor_model_parallel_rank = tp_rank
+
+        idx = 0
+        for name, param in model.named_parameters():
+            split_val = splits[idx][tp_rank].clone()
+
+            if param.shape != split_val.shape:
+                raise RuntimeError(
+                    f"Cannot handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
+                )
+
+            param.data = split_val
+            idx += 1
+
+        if write_path is not None:
+            logging.info(f"Writing pp rank {pp_rank} tp rank {tp_rank} to file {write_path}")
+            model.save_to(write_path)
+
+
+##################
+### Converters ###
+##################
+
+
+def split_tp_partition_only(args, model, original_model, tp_size, write_path=None, megatron_legacy=False):
+
+    if tp_size < 1:
+        raise ValueError("TP size must be >= 1.")
+
+    app_state = AppState()
+    app_state.data_parallel_rank = 0
+    app_state.pipeline_model_parallel_size = 1
+    app_state.tensor_model_parallel_size = tp_size
+    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
+
+    app_state.pipeline_model_parallel_rank = 0
+    app_state.tensor_model_parallel_rank = tp_size - 1
+
+    idx = 0
+    splits = []
+
+    for ii, (key, original_tensor) in enumerate(original_model.model.state_dict().items()):
+        try:
+            layer_num = int(re.findall(r'\d+', key)[0])
+            # The layer number is not remapped in TP-only conversion, so new_key == key.
+            new_key = key.replace(str(layer_num), str(layer_num), 1)
+        except IndexError:
+            # Key contains no layer number (e.g. embeddings, final norm); use it as-is.
+            new_key = key
+
+        if '_extra_state' not in new_key:
+            split_dim = get_split_dim(new_key)
+            split = split_tensor_for_tp(args, new_key, split_dim, original_tensor)
+
+            splits.append(split)
+            idx += 1
+
+    # Save each of the TP ranks in reverse order
+    # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved
+    # The final rank will then save a new NeMo file with all other ranks inside.
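Before the write call that follows, it may help to see the core idea of `split_tensor_for_tp` in isolation: a fused tensor (x, z, B, C, dt for the Mamba2 `in_proj`) is first split into its logical groups, each group is chunked across TP ranks, and one chunk of every group is re-concatenated per rank, so each rank gets a contiguous slice of every group rather than a naive slice of the fused tensor. A minimal, self-contained sketch of that pattern with made-up sizes (plain PyTorch, no NeMo dependencies):

```python
import torch

def split_fused_rows(tensor, group_sizes, tp_size, dim=0):
    """Split a fused tensor into per-group chunks and rebuild one shard per TP rank."""
    groups = torch.split(tensor, group_sizes, dim=dim)                 # e.g. (x, z, B, C, dt)
    per_group_chunks = [torch.chunk(g, tp_size, dim=dim) for g in groups]
    # Rank r receives chunk r of every group, concatenated back in the original order.
    return [torch.cat([chunks[r] for chunks in per_group_chunks], dim=dim) for r in range(tp_size)]

# Toy example: a fused "in_proj"-like weight with d_inner=8, n_groups*d_state=4, n_heads=2.
fused = torch.randn(8 + 8 + 4 + 4 + 2, 16)
shards = split_fused_rows(fused, [8, 8, 4, 4, 2], tp_size=2)
assert shards[0].shape == (13, 16)   # (8 + 8 + 4 + 4 + 2) / 2 rows per rank
```

The actual converter additionally reshapes B and C by `mamba_d_state` before chunking so each rank keeps whole state groups, then flattens them back; the sketch omits that detail.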
+ write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path) + + with tarfile.open(write_path, 'r') as tar: + # Extract all contents to the specified path + tar.extractall(path=os.path.dirname(write_path)) + + +def main(): + parser = ArgumentParser() + parser.add_argument("--model_file", type=str, default=None, required=False, help="Path to source .nemo file") + parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file") + parser.add_argument( + "--tensor_model_parallel_size", type=int, default=-1, required=False, help="TP size of source model" + ) + parser.add_argument("--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model") + parser.add_argument( + '--pipeline_model_parallel_size', type=int, default=1, required=False, help='PP size of source model' + ) + parser.add_argument( + '--target_pipeline_model_parallel_size', type=int, required=False, default=1, help='PP size of target model' + ) + parser.add_argument( + '--target_pipeline_model_parallel_split_rank', type=int, default=0, help='PP rank to split for Enc-Dec models' + ) + parser.add_argument( + '--virtual_pipeline_model_parallel_size', type=int, default=None, help='Virtual Pipeline parallelism size' + ) + parser.add_argument( + '--ckpt_name', type=str, default=None, help='Checkpoint name to load from for Virtual Parallel' + ) + parser.add_argument( + "--model_class", + type=str, + default="nemo.collections.nlp.models.language_modeling.megatron_mamba_model.MegatronMambaModel", + help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel", + ) + parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag") + parser.add_argument('--num_gpu_per_node', default=8, type=int, help='Number of GPUs per node') + parser.add_argument( + "--megatron_legacy", + action="store_true", + help="Converter for legacy megatron modles that have different q,k,v weight splits", + ) + parser.add_argument( + "--tokenizer_model_path", + type=str, + required=False, + default=None, + help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.", + ) + parser.add_argument( + "--tokenizer_vocab_file", + type=str, + required=False, + default=None, + help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. 
This is needed if your model uses a sentencepiece tokenizer.", + ) + parser.add_argument('--hparams_file', type=str, default=None, help='Path to hparams file from PTL training') + parser.add_argument( + '--tp_conversion_only', default=True, action='store_true', help='Only convert TP model to TP model' + ) + parser.add_argument('--model_extracted_dir', type=str, default=None, help='Path to pre-extracted model directory') + + parser.add_argument('--d-model', type=int, default=4096) + parser.add_argument('--mamba-version', type=int, default=2) + parser.add_argument('--mamba-d-state', type=int, default=128) + parser.add_argument('--mamba2-n-groups', type=int, default=8) + parser.add_argument('--mamba2-head-dim', type=int, default=64) + + args = parser.parse_args() + + args.mamba_d_inner = args.d_model * 2 + args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim + + precision = args.precision + num_gpu_per_node = int(args.num_gpu_per_node) + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + + if precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + pass + else: + logging.warning("BF16 is not supported on this device. Using FP16 instead.") + precision = precision[2:] + + if precision == 32: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + dtype = torch.float32 # fallback + + # Built target directory if it does not exist + target_dir = os.path.split(args.target_file)[0] + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + + tp_size = args.tensor_model_parallel_size + tgt_tp_size = args.target_tensor_model_parallel_size + pp_size = args.pipeline_model_parallel_size + tgt_pp_size = args.target_pipeline_model_parallel_size + pipeline_model_parallel_split_rank = args.target_pipeline_model_parallel_split_rank + vp_size = args.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + convert_vp = vp_size > 1 + if convert_vp: + from megatron.core import parallel_state + + parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size) + + hparams_filepath = args.hparams_file + if hparams_filepath is None: + logging.warning( + '\n\n\n!!!!!!!!!\n' + 'You are converting a model with virtual pipeline parallelism enabled, \n' + 'but have not passed `hparams_file` argument. 
\n' + 'This will cause each ckpt file to be temporarily laoded onto GPU memory!\n\n' + 'It is highly recommended to pass `hparams_file` argument to avoid this.\n' + ) + + # Import the class of the model + + if args.model_file is None and args.model_extracted_dir is None: + raise ValueError("Cannot pass model_file and model_extracted_dir as None at the same time.") + + tmp_cfg = MegatronMambaModel.restore_from( + restore_path=args.model_file, + trainer=Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision), + map_location=torch.device("cpu"), + return_config=True, + ) + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=tmp_cfg.get('native_amp_init_scale', 2**32), + growth_interval=tmp_cfg.get('native_amp_growth_interval', 1000), + hysteresis=tmp_cfg.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if tmp_cfg.get('megatron_amp_O2', False): + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") + + if tp_size < 0 or pp_size < 0: + logging.info(f"Loading model config from {args.model_file} to get TP and PP size") + model_config_internal = MegatronMambaModel.restore_from( + restore_path=args.model_file, + trainer=trainer, + map_location=torch.device("cpu"), + return_config=True, + ) + + tp_size = model_config_internal.get('tensor_model_parallel_size', 1) + pp_size = model_config_internal.get('pipeline_model_parallel_size', 1) + + # Check if TP conversion only + tp_conversion_only = args.tp_conversion_only + if tp_conversion_only: + logging.info("Converting TP model to TP model only") + + if pp_size > 1: + raise ValueError("Provided `--tp_conversion_only` but `--pipeline_model_parallel_size` > 1") + + if tgt_pp_size > 1: + raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_size` > 1") + + if pipeline_model_parallel_split_rank > 0: + raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_split_rank` > 0") + + # Force PP size to 1 + pp_size = 1 + tgt_pp_size = 1 + pipeline_model_parallel_split_rank = 0 + + if vp_size is None or vp_size < 0: + vp_size = 1 + + app_state = AppState() + app_state.data_parallel_rank = 0 + app_state.pipeline_model_parallel_size = pp_size + app_state.tensor_model_parallel_size = tp_size + + if vp_size > 1: + app_state.virtual_pipeline_model_parallel_size = vp_size + app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size + + world_size = pp_size * tp_size # pseudo world size for simulating load of a specific rank on a single gpu + + app_state.tensor_model_parallel_rank = 0 + app_state.pipeline_model_parallel_rank = 0 + + # Extract tokenizer artifact from the model to temp directory + logging.info("Extracting tokenizer artifact from NeMo file...") + temp_dir = tempfile.mkdtemp() + tokenizer_model_path = None + with tarfile.open(args.model_file, "r") as tar: + for member in 
tar.getmembers(): + if '.model' in member.name: + extracted_file = tar.extractfile(member) + extracted_file_path = os.path.join(temp_dir, member.name) + + if tokenizer_model_path is None: + logging.info(f"Found tokenizer. Extracting {member.name} to {extracted_file_path}") + + tokenizer_model_path = extracted_file_path + with open(extracted_file_path, "wb") as f: + f.write(extracted_file.read()) + else: + if args.tokenizer_model_path is None: + logging.warning( + f"\n\nFound multiple tokenizer artifacts in the model file.\n" + f"Using only {tokenizer_model_path}.\n" + f"If this is incorrect, manually pass the correct tokenizer using " + f"`--tokenizer_model_path`.\n\n" + ) + + # If input model has TP > 1 or PP > 1 + # Reconstruct the model to have TP = 1 and PP = 1 + # Note that this is a forward loop that will process PP [0..N] TP [0..M] in sequential order. + + # If input model has TP = 1 and PP = 1 + app_state.model_parallel_size = 1 + + save_restore_connector = NLPSaveRestoreConnector() + + if args.model_extracted_dir is not None: + logging.info(f"Using extracted model directory: {args.model_extracted_dir}") + save_restore_connector.model_extracted_dir = args.model_extracted_dir + + if args.model_file is not None: + model_filepath = args.model_file + else: + model_filepath = args.model_extracted_dir + + tmp_cfg = MegatronMambaModel.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + return_config=True, + ) + + tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) + + model = MegatronMambaModel.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + override_config_path=tmp_cfg, + ) + + original_model = MegatronMambaModel.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + override_config_path=tmp_cfg, + ) + original_model = original_model.to('cpu') + original_model._save_restore_connector = NLPSaveRestoreConnector() + original_model.freeze() + original_model.to(dtype=dtype) + + model.to(dtype=dtype) + + restore_model_config(model.cfg, restore_dict) + + # If target model has TP > 1 or PP > 1 + if tgt_pp_size > 1 or tgt_tp_size > 1: + + # Preserve the TP 1 PP 1 model parameters and names + global_params = [] + global_params.append([p for n, p in model.named_parameters()]) # params + global_params.append([n for n, p in model.named_parameters()]) # names + + logging.debug("Global parameters:") + for idx, (name, p) in enumerate(zip(global_params[1], global_params[0])): + logging.debug(f"{name} - {p.shape}") + + logging.info(f"TP 1 PP 1 Number of Parameters : {len(global_params[0])}") + + world_size = ( + tgt_pp_size * tgt_tp_size + ) # pseudo world size for simulating load of a specific rank on a single gpu + new_global_batch_size = model.cfg.micro_batch_size * world_size + old_global_batch_size = model.cfg.get('global_batch_size', model.cfg.micro_batch_size) + + global_offset = len(global_params[0]) - 1 # -1 cause this indexes the array, range [0, L-1] + logging.info(f"Final layer offset for parameters: {global_offset}") + + for pp_rank in range(tgt_pp_size - 1, -1, -1): # reverse order + + with open_dict(model.cfg): + model.cfg.pipeline_model_parallel_size = tgt_pp_size + model.cfg.tensor_model_parallel_size = tgt_tp_size + + if 'pipeline_model_parallel_split_rank' in model.cfg: + if 
pipeline_model_parallel_split_rank > 0: + model.cfg.pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank + elif pp_size > 1: + logging.warning( + f"Model config has `pipeline_model_parallel_split_rank` set to " + f"{model.cfg.pipeline_model_parallel_split_rank} and target PP " + f"size is {tgt_pp_size}. " + f"Provided `pipeline_model_parallel_split_rank` is " + f"{pipeline_model_parallel_split_rank}. " + f"Be careful that the model config is correct " + f"if encoder-decoder models are being converted." + ) + + model.cfg.global_batch_size = old_global_batch_size # Used for restoration + + # Override flag that forces Model to use AppState instead of Trainer + # to determine the world size, global and local rank + # Used for simulating load of a specific rank on a single gpu + os.environ[NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE] = "true" + + # Compute the global rank + global_rank = ( + pp_rank * tgt_tp_size + 0 + ) # tp_rank = 0 needed just for modules, all TP will be merged to this PP rank + + # Update AppState + app_state.world_size = world_size + app_state.global_rank = global_rank + app_state.local_rank = global_rank % num_gpu_per_node + app_state.pipeline_model_parallel_size = tgt_pp_size + app_state.tensor_model_parallel_size = tgt_tp_size + app_state.model_parallel_size = ( + app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size + ) + + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") + if args.tokenizer_model_path is not None: + with open_dict(model.cfg): + model.cfg.tokenizer.model = args.tokenizer_model_path + + else: + if tokenizer_model_path is None: + logging.warning("Could not extract tokenizer model file from checkpoint.") + + else: + # Extract tokenizer info + with open_dict(model.cfg): + model.cfg.tokenizer.model = tokenizer_model_path + + model.cfg, restore_dict = force_cpu_model(model.cfg) + + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size = 1 + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_micro_batch_size = 1 + model.cfg.global_batch_size = 1 + model.cfg.micro_batch_size = 1 + + model = MegatronMambaModel(model.cfg, trainer) + model = model.to('cpu') + model._save_restore_connector = NLPSaveRestoreConnector() + model.freeze() + model.to(dtype=dtype) + + restore_model_config(model.cfg, restore_dict) + + # Update global batch size + if old_global_batch_size % new_global_batch_size != 0 or old_global_batch_size < new_global_batch_size: + logging.info( + f"Global batch size {old_global_batch_size} is not divisible by new global batch size {new_global_batch_size}." + f" The model config will be updated with new global batch size {new_global_batch_size}." 
+ ) + with open_dict(model.cfg): + model.cfg.global_batch_size = new_global_batch_size + + logging.info(f"Global rank: {global_rank} Local rank: {app_state.local_rank} World size: {world_size}") + logging.info(f"PP rank: {pp_rank} TP rank: {0}") + logging.info(f"TP 1 PP 1 Number of Layers : {len(global_params[0])}") + logging.info(f"Remaining layer offset for parameters: {global_offset}") + logging.info("\n") + + # Special case for TP conversion only mode + if tp_conversion_only: + logging.info(f"Skipping PP split due to flag `--tp_conversion_only`") + split_tp_partition_only( + args, model, original_model, tgt_tp_size, args.target_file, args.megatron_legacy + ) + break + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_mamba_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_finetuning_config.yaml new file mode 100644 index 000000000000..3684b61bb186 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_finetuning_config.yaml @@ -0,0 +1,315 @@ +name: megatron_mamba +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: griffin + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + hybrid_override_pattern: null + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. 
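A quick sanity check on the batch-size relationships encoded in the trainer and model blocks above, before the rest of the model settings. The data-parallel size is an assumption for illustration only (NeMo derives it from world size, TP and PP):

```python
# Assumed launch for illustration: 8 GPUs, TP=1, PP=1 -> data_parallel_size = 8.
micro_batch_size = 1      # model.micro_batch_size
global_batch_size = 8     # model.global_batch_size
data_parallel_size = 8    # assumption, not set explicitly in the config
max_steps = 10000         # trainer.max_steps

# Each optimizer step consumes one global batch, built from
# global_batch_size / (micro_batch_size * data_parallel_size) micro-batches per rank.
micro_batches_per_step = global_batch_size // (micro_batch_size * data_parallel_size)  # = 1
consumed_samples = max_steps * global_batch_size                                       # = 80,000
```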
+ num_layers: 64 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + + # mixed-precision + attention_softmax_in_fp32: False + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). 
+ # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: null # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + validation_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
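A brief aside on the `lora_tuning` block above before the remaining `validation_ds` settings: `adapter_dim` is the LoRA rank r and `alpha` the scaling numerator, so in the standard LoRA formulation the frozen weight is augmented as W x + (alpha / r) * B(A x), with A of shape r x d_in and B of shape d_out x r. A minimal sketch of that idea in plain PyTorch (this is the generic LoRA math, not NeMo's adapter classes):

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base linear plus a low-rank update, scaled by alpha / r."""

    def __init__(self, base: nn.Linear, r: int = 32, alpha: int = 32, dropout: float = 0.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                 # base weights stay frozen
        self.lora_a = nn.Linear(base.in_features, r, bias=False)
        self.lora_b = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)          # start as a no-op, analogous to row_init_method: zero
        self.dropout = nn.Dropout(dropout)
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))

layer = LoRALinear(nn.Linear(4096, 4096), r=32, alpha=32)   # adapter_dim=32, alpha=32 as above
```

With alpha equal to the rank, the scaling factor is 1.0, so increasing `adapter_dim` while keeping `alpha` fixed shrinks the effective step size of the adapter.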
+ global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml new file mode 100644 index 000000000000..2d34aefffc7e --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_mamba_generate_config.yaml @@ -0,0 +1,298 @@ +name: megatron_mamba +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 64 + gated_linear_unit: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. 
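On `make_vocab_size_divisible_by` just above: Megatron-style models pad the tokenizer vocabulary so the TP-sharded embedding matrix has convenient dimensions. The usual rule, assumed here, is to round up to a multiple of make_vocab_size_divisible_by times the tensor-parallel size:

```python
def padded_vocab_size(vocab_size: int, divisible_by: int, tp_size: int) -> int:
    """Round vocab_size up to a multiple of divisible_by * tp_size (assumed Megatron-style rule)."""
    multiple = divisible_by * tp_size
    return ((vocab_size + multiple - 1) // multiple) * multiple

assert padded_vocab_size(65536, 128, 1) == 65536   # this config: already a multiple of 128
assert padded_vocab_size(65537, 128, 4) == 66048   # example where padding is actually applied
```

The 65536 vocabulary in these configs is already aligned, so no padding tokens are added at TP=1; the model block continues below.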
+ pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + add_bias_linear: False + + answer_only_loss: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + use_fast: True + + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. 
+ # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: null # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. 
[1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + test_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ??? # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "input" # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether launch the web inference server +share: True # whether create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server 1058 +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + - name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py new file mode 100644 index 000000000000..0613ef486ec3 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
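The `inference` block of the generate config above exposes the usual sampling knobs (greedy, top_k, top_p, temperature, repetition_penalty). As a reminder of what top-k/top-p filtering means, independent of NeMo's text-generation code, here is a minimal logits-filtering sketch; the finetuning entry point itself follows below:

```python
import torch

def filter_logits(logits: torch.Tensor, top_k: int = 0, top_p: float = 0.9) -> torch.Tensor:
    """Mask logits outside the top-k set and outside the smallest set whose probability mass reaches top_p."""
    logits = logits.clone()
    if top_k > 0:
        kth_value = torch.topk(logits, top_k).values[-1]
        logits[logits < kth_value] = float("-inf")
    if 0.0 < top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        probs = torch.softmax(sorted_logits, dim=-1)
        mass_before = probs.cumsum(dim=-1) - probs         # cumulative mass of strictly better tokens
        logits[sorted_idx[mass_before > top_p]] = float("-inf")
    return logits

logits = torch.randn(50)                                   # toy vocabulary of 50 tokens
temperature = 1.0                                          # inference.temperature
probs = torch.softmax(filter_logits(logits, top_k=0, top_p=0.9) / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)       # sampling path; greedy: True takes argmax instead
```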
+ +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf + +from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_mamba_finetuning_config") +def main(cfg) -> None: + + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + precision = cfg.trainer.precision + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + # Restore the precision value after Trainer is built. + cfg.trainer.precision = precision + exp_manager(trainer, cfg.exp_manager) + + model_cfg = MegatronMambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + + if cfg.model.peft.restore_from_path is not None: + # initialize peft weights from a check`point instead of randomly + # This is not the same as resume training because optimizer states are not restored. + logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) + model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) + elif peft_cfg_cls is not None: + logging.info("Adding adapter weights to the model for PEFT") + model.add_adapter(peft_cfg_cls(model_cfg)) + else: + logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py b/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py new file mode 100644 index 000000000000..6f660d552fc6 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
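To make the control flow of the finetuning entry point above easier to scan, here is a stripped-down paraphrase of its PEFT branching (names such as `apply_peft` and `build_peft_cfg` are placeholders, not NeMo APIs): load adapter weights when `peft.restore_from_path` is set, otherwise attach freshly initialized adapters for the chosen scheme, and fall back to full finetuning when no scheme is given. Restoring adapter weights is not a full training resume, since optimizer state is not reloaded.

```python
def apply_peft(model, restore_from_path, build_peft_cfg):
    """Paraphrase of the script's branching; build_peft_cfg stands in for PEFT_CONFIG_MAP[scheme](model_cfg)."""
    if restore_from_path is not None:
        # Adapter weights come from an earlier PEFT run; optimizer state starts fresh.
        model.load_adapters(restore_from_path, build_peft_cfg())
    elif build_peft_cfg is not None:
        model.add_adapter(build_peft_cfg())    # e.g. LoRA configured by the lora_tuning block above
    # else: no PEFT scheme given -> plain full finetuning of the base model
    return model
```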
+ + +import os +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_mamba_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank( + os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) + else: + raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + trainer.test(model) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py new file mode 100644 index 000000000000..fb8a04b947b0 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
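The generate script above restores adapters either from a distributed checkpoint directory or from a legacy single-file checkpoint whose path needs the model-parallel rank injected. A small sketch of that decision, with `inject_rank` standing in for `nemo.utils.model_utils.inject_model_parallel_rank` (the real helper rewrites the path based on the current TP/PP rank):

```python
import os

def resolve_adapter_ckpt(checkpoint_dir: str, checkpoint_name: str, inject_rank) -> str:
    """Return the adapter checkpoint path, injecting the MP rank only for legacy file checkpoints."""
    path = os.path.join(checkpoint_dir, checkpoint_name)
    if os.path.isdir(path):
        return path              # distributed checkpoint: a directory, no rank suffix needed
    return inject_rank(path)     # legacy checkpoint: one file per model-parallel rank

# Hypothetical usage with a fake injector that mimics a per-rank subdirectory:
resolve_adapter_ckpt(
    "/ckpts", "adapters.ckpt",
    lambda p: os.path.join(os.path.dirname(p), "mp_rank_00", os.path.basename(p)),
)
```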
+ +import torch + +# from megatron.core.models.mamba import MambaModel +# from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.utils import logging + + +class MegatronMambaModel(MegatronGPTModel): + """ + Megatron Mamba pretraining. + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + + self.vocab_size = cfg.get('vocab_size', 65536) + self.cfg = cfg + super().__init__(cfg=cfg, trainer=trainer) + logging.warning("Overriding mcore_gpt=True") + self.mcore_gpt = True + + def model_provider_func(self, pre_process, post_process): + + self.hybrid_override_pattern = self.cfg.get( + 'hybrid_override_pattern', "M" * self.transformer_config.num_layers + ) + self.transformer_config.add_bias_linear = self.cfg.get('add_bias_linear', False) + self.transformer_config.gated_linear_unit = self.cfg.get('gated_linear_unit', False) + self.transformer_config.layernorm_epsilon = self.cfg.get('layernorm_epsilon', 1e-5) + + # TODO @ataghibakhsh: add mamba_ssm_ngroups=self.cfg.get('mamba_ssm_ngroups', 8) once MLM MR merged + # TODO @ataghibakhsh: add the following + '''MambaModel( + config=self.transformer_config, + max_sequence_length=self.cfg.get('encoder_seq_length', 4096), + vocab_size=self.cfg.get('vocab_size', 65536), + mamba_stack_spec=mamba_stack_spec, + hybrid_override_pattern=self.hybrid_override_pattern, + )''' + # after package mismatch is resovled + model = None + + return model + + def forward(self, input_ids, position_ids=None, attention_mask=None, labels=None): + + output_tensor = self.model( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask, labels=labels + ) + return output_tensor + + def build_transformer_config(self): + transformer_config = super().build_transformer_config() + return transformer_config + + def on_validation_epoch_end(self): + + averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() + return averaged_loss + + def sharded_state_dict(self, prefix: str = ''): + return None + + def _reset_activation_checkpointing_args(self): + return + + def _restore_activation_checkpointing_args(self): + return + + def _reset_sequence_parallelism_args(self): + return + + def _restore_sequence_parallelism_args(self): + return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py new file mode 100644 index 000000000000..ebcc47004711 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
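`MegatronMambaModel.model_provider_func` above defaults `hybrid_override_pattern` to `'M' * num_layers`, i.e. one symbol per layer with every layer a Mamba mixer. A hybrid pattern mixes in other symbols for attention or MLP layers; the exact symbol set is defined by the Megatron hybrid Mamba stack, and `'*'` for attention and `'-'` for MLP is assumed here purely for illustration:

```python
from collections import Counter

num_layers = 64                                        # model.num_layers in the configs above
default_pattern = "M" * num_layers                     # what model_provider_func falls back to
hybrid_pattern = ("M" * 7 + "*" + "M" * 7 + "-") * 4   # hypothetical 64-layer hybrid layout

assert len(hybrid_pattern) == num_layers
print(Counter(hybrid_pattern))                         # Counter({'M': 56, '*': 4, '-': 4})
```

The pattern length must match `num_layers`, which is why the model code derives the default from the transformer config rather than hard-coding it.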
+ +from omegaconf import DictConfig +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel + + +__all__ = ['MegatronMambaSFTModel'] + + +class MegatronMambaSFTModel(MegatronGPTSFTModel, MegatronMambaModel): +    """ +    Megatron Mamba Supervised Fine-Tuning +    """ + +    def __init__(self, cfg: DictConfig, trainer: Trainer): + +        super().__init__(cfg, trainer=trainer) +        self.mcore_gpt = True +        self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) + +    def _reset_activation_checkpointing_args(self): +        pass + +    def on_validation_model_zero_grad(self) -> None: +        """ +        Skip gradient zeroing at the beginning of validation routine. +        This is needed when overlapping the AllGather of the updated parameters with the following validation step. +        """ +        if not self.validation_param_sync_overlap: +            MegatronBaseModel.on_validation_model_zero_grad(self) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 238c01695f42..f51d53ba5944 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -988,6 +988,7 @@ def model_inference_strategy_dispatcher(model, **args): MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel +    from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.retro_inference_strategies import ( @@ -998,6 +999,8 @@ def model_inference_strategy_dispatcher(model, **args): if isinstance(model, MegatronGriffinModel): return GriffinModelTextGenerationStrategy(model) +    if isinstance(model, MegatronMambaModel): +        return GPTModelTextGenerationStrategy(model) if isinstance(model, MegatronNevaModel): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 7d294f6085bb..34ca175470ab 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -17,6 +17,7 @@ from typing import List, Optional, Union import torch +from megatron.core.transformer.identity_op import IdentityOp from omegaconf import DictConfig, OmegaConf, open_dict from nemo.utils.model_utils import inject_model_parallel_rank @@ -178,9 +179,10 @@ def _check_and_add_peft_cfg(self, peft_cfg): for layer in layers: if layer.layer_number in (layer_selection or list(range(1, self.cfg.num_layers + 1))): for name, module in layer.named_modules(): -                    self._check_and_add_adapter( -                        name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins -                    ) +                    if not isinstance(module, IdentityOp): +                        self._check_and_add_adapter( +                            name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins +                        ) + else: # Non
GPT models, as well as GPT+PTuning do not support layer selection if layer_selection is not None: diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 494a9ab6d672..d006ccb7ad65 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -10,6 +10,7 @@ gdown h5py ijson jieba +mamba-ssm==1.2.0.post1 markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py new file mode 100644 index 000000000000..9a44f9c2c5c4 --- /dev/null +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser +from collections import defaultdict +import torch +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + +''' +Example + +CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ + --input_name_or_path \ + --output_path \ + --ngroups_mamba 8 \ + --precision bf16 +''' + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_mamba_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--input_name_or_path", + type=str, + required=True, + ) + parser.add_argument("--ngroups_mamba", type=int, default=8, help="ngroups for Mamba model") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model'] + new_state_dict = {} + + if 'backbone' in list(checkpoint_weights.keys())[0]: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + direct_mappings = { + 'model.embedding.word_embeddings.weight': 'backbone.embedding.weight', + 'model.decoder.final_norm.weight': 'backbone.norm_f.weight', + 'model.output_layer.weight': 'lm_head.weight', + } + + for new_key, old_key in direct_mappings.items(): + new_state_dict[new_key] = checkpoint_weights[old_key] + + layer_attributes = [ + 'mixer.A_log', + 'mixer.D', + 'mixer.conv1d.weight', + 'mixer.conv1d.bias', + 'mixer.in_proj.weight', + 'mixer.dt_bias', + 'mixer.out_proj.weight', + 'mixer.norm.weight', + 'norm.weight', + ] + + for i in range(num_layers): + for attr in layer_attributes: + new_key = f'model.decoder.layers.{i}.{attr}' + old_key = f'backbone.layers.{i}.{attr}' + new_state_dict[new_key] = checkpoint_weights[old_key] + + else: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'decoder\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + new_state_dict = {"model." + key: value for key, value in checkpoint_weights.items()} + + layers = defaultdict(list) + + for key in new_state_dict.keys(): + match = re.match(r'model\.decoder\.layers\.(\d+)\.(\w+)', key) + if match: + index, layer_type = match.groups() + layers[index].append(layer_type) + + layer_pattern = '' + for i in range(max(map(int, layers.keys())) + 1): + index_str = str(i) + layer_types = layers.get(index_str, []) + if 'mixer' in layer_types: + layer_pattern += 'M' + elif 'self_attention' in layer_types: + layer_pattern += '*' + elif 'mlp' in layer_types: + layer_pattern += '-' + else: + raise AssertionError("Layer not found. 
Each layer must be either MLP, Mamba, or Attention") + +    nemo_config = OmegaConf.load(args.hparams_file) +    nemo_config.trainer["precision"] = args.precision +    nemo_config.model.vocab_size, nemo_config.model.hidden_size = new_state_dict[ +        'model.embedding.word_embeddings.weight' +    ].shape +    nemo_config.model.num_layers = num_layers +    nemo_config.model.hybrid_override_pattern = layer_pattern +    nemo_config.model.ngroups_mamba = args.ngroups_mamba + +    if "-" in layer_pattern: +        nemo_config.model.ffn_hidden_size = new_state_dict[ +            f'model.decoder.layers.{layer_pattern.index("-")}.mlp.linear_fc1.weight' +        ].shape[0] +    else: +        nemo_config.model.ffn_hidden_size = nemo_config.model.hidden_size + +    nemo_config.model.use_cpu_initialization = True + +    logging.info(f"Loading Mamba2 Pytorch checkpoint : `{args.input_name_or_path}`") + +    trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() +    nemo_model_from_pyt = MegatronMambaModel(nemo_config.model, trainer) + +    nemo_model_from_pyt.load_state_dict(new_state_dict, strict=True) +    dtype = torch_dtype_from_precision(args.precision) +    nemo_model_from_pyt = nemo_model_from_pyt.to(dtype=dtype) +    nemo_model_from_pyt.save_to(args.output_path) +    logging.info(f'Mamba2 NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': +    args = get_args() +    convert(args) diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst new file mode 100644 index 000000000000..c09a6ae03087 --- /dev/null +++ b/tutorials/llm/mamba/mamba.rst @@ -0,0 +1,301 @@ +Mamba2 and Mamba2-Transformer Hybrid Models Fine-Tuning +======================================================= + +`State Space Models (SSMs) `__ have recently emerged as a promising alternative to transformers. SSMs offer advantages such as linear time complexity relative to sequence length and a constant cache size for inference. These features enable the processing of longer sequences and higher throughput. Despite these benefits, SSMs alone may fall short compared to transformers on tasks that demand strong copying or in-context learning capabilities. + +To harness the strengths of both approaches, SSM-Hybrid models incorporate MLP, Transformer, and SSM blocks in their architecture. As highlighted in `a study by NVIDIA `__, these hybrid models outperform traditional transformers of the same size and achieve faster inference times due to the inclusion of SSM blocks. Based on experimental results, Mamba2-Hybrid models not only surpass transformer baselines in performance but also benefit from increased computational efficiency. + +The Mamba2 models discussed in the `Transformers are SSMs `__ paper are available in five different sizes: 130 million, 370 million, 780 million, 1.3 billion, and 2.7 billion parameters. The Mamba2-Hybrid models, along with their Mamba2 baseline as released by `NVIDIA `__, are provided in an 8 billion parameter size. + +`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters compared to full fine-tuning, thereby reducing the compute required. LoRA tuning can be applied to the linear layers in the Transformer and MLP blocks of the Mamba2-Hybrid models. + +`NVIDIA NeMo +Framework `__ provides tools to perform fine-tuning of Mamba2 and Mamba2-Hybrid models to fit your use case.
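+
+To make the LoRA idea concrete before moving on to the NeMo commands, the snippet below is a minimal, self-contained sketch of how a LoRA adapter augments a linear layer: the original weight stays frozen and only a small low-rank update is trained. It is illustrative only; the class name, rank, and scaling choices are arbitrary assumptions for this example, and it is not the implementation that NeMo Framework applies under the hood.
+
+.. code:: python
+
+   # Illustrative LoRA sketch for this tutorial; not the NeMo implementation.
+   import torch
+   import torch.nn as nn
+
+   class LoRALinearSketch(nn.Module):
+       """A frozen linear layer plus a trainable low-rank (LoRA) update."""
+
+       def __init__(self, in_features, out_features, rank=8, alpha=16):
+           super().__init__()
+           # Frozen base projection, standing in for a Transformer/MLP linear layer.
+           self.base = nn.Linear(in_features, out_features, bias=False)
+           self.base.weight.requires_grad = False
+           # Trainable low-rank factors: these are the only parameters that get updated.
+           self.lora_a = nn.Parameter(torch.randn(rank, in_features) * 0.01)
+           self.lora_b = nn.Parameter(torch.zeros(out_features, rank))
+           self.scale = alpha / rank
+
+       def forward(self, x):
+           # y = W x + scale * B(A x); the update has rank at most `rank`.
+           return self.base(x) + self.scale * (x @ self.lora_a.T @ self.lora_b.T)
+
+   layer = LoRALinearSketch(4096, 4096, rank=8)
+   trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+   total = sum(p.numel() for p in layer.parameters())
+   print(f"trainable: {trainable} / total: {total}")  # ~65K trainable vs. ~16.8M total
+
+Because only the low-rank factors are trained, gradient and optimizer-state memory stay small, which is why the LoRA configuration listed in the requirements below fits on a single GPU while full fine-tuning of the 8b models does not.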
+ +Requirements +------------- + +In order to proceed, ensure that you have met the following requirements: + +* Full Fine-Tuning System Configuration +  * Small models (130m, 370m, 780m) +    * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 40GB, for example: 1 x A6000-40GB. + +  * Mid-size models (1.3b, 2.7b) +    * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. + +  * Large models (8b) +    * Access to at least 2 NVIDIA GPUs with a cumulative memory of at least 80GB, for example: 2 x H100-80GB or 2 x A100-80GB. + +* LoRA Fine-Tuning (Mamba2-Hybrid only) System Configuration +  * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB. + + + +* A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware. + + +* `Authenticate with NVIDIA NGC `_, and download `NGC CLI Tool `_. + + +Step-by-step Guide for Fine-Tuning +---------------------------------- + +Checkpoints from HuggingFace +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Obtain the desired checkpoint from HuggingFace. + +* `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. +* `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. + + +Convert the PyTorch Checkpoint to a NeMo Checkpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Get into NVIDIA Container + +2. Run the conversion script from . For this conversion script, you should provide the PyTorch state dictionary of the model for ``input_name_or_path``, i.e. this argument only accepts a single ``state_dict``. + +.. code:: bash + +   CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ +   --input_name_or_path \ +   --output_path \ +   --ngroups_mamba 8 \ +   --precision bf16 + +* Note: the ``ngroups_mamba`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b). + +Model (Tensor) Parallelism for the 8b Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Note: Distributed checkpointing for the Mamba2 and Mamba2-Hybrid models will be implemented in the near future. For now, you should use the method below for converting to Tensor Parallel (TP) of different sizes. + +The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ``.nemo`` checkpoint obtained in the previous step. To shard the model weights for a larger TP size, use the script from