Refactor Quantizer for reusing in QAT
Signed-off-by: Keval Morabia <[email protected]>
kevalmorabia97 committed May 22, 2024
1 parent d7bb403 commit 96fde3b
Showing 6 changed files with 176 additions and 194 deletions.
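At a glance, the user-facing change is a move from flat, top-level options to nested ones. The mapping below is an editorial summary compiled from the hunks that follow, not code from the commit:

# Old flat config/CLI keys -> new nested keys (summary of the renames in this commit).
KEY_RENAMES = {
    "model_file": "model.restore_from_path",
    "tensor_model_parallel_size": "model.tensor_model_parallel_size",
    "pipeline_model_parallel_size": "model.pipeline_model_parallel_size",
    "model_save": "export.save_path",
}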
20 changes: 10 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -368,9 +368,9 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.algorithm=null \
-            model_save=/home/TestData/nlp/megatron_llama/ci_baseline
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
         rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -395,15 +395,15 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            tensor_model_parallel_size=2 \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=fp8 \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
             export.inference_tensor_parallel=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
         rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -428,12 +428,12 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=int8_sq \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
         rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -459,14 +459,14 @@ jobs:
 #        uses: actions/checkout@v4
 #      - run: |
 #          python examples/nlp/language_modeling/megatron_quantization.py \
-#              model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-#              tensor_model_parallel_size=1 \
+#              model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+#              model.tensor_model_parallel_size=1 \
 #              trainer.devices=1 \
 #              quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
 #              quantization.algorithm=int4_awq \
 #              quantization.num_calib_size=8 \
 #              inference.batch_size=2 \
-#              model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
+#              export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #
 #          rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
8 changes: 4 additions & 4 deletions docs/source/nlp/quantization.rst
@@ -74,16 +74,16 @@ The script must be launched correctly with the number of processes equal to tens
 .. code-block:: bash

     torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_quantization.py \
-        model_file=llama2-70b-base-bf16.nemo \
-        tensor_model_parallel_size=8 \
-        pipeline_model_parallel_size=1 \
+        model.restore_from_path=llama2-70b-base-bf16.nemo \
+        model.tensor_model_parallel_size=8 \
+        model.pipeline_model_parallel_size=1 \
         trainer.num_nodes=1 \
         trainer.devices=8 \
         trainer.precision=bf16 \
         quantization.algorithm=fp8 \
         export.decoder_type=llama \
         export.inference_tensor_parallel=2 \
-        model_save=llama2-70b-base-fp8-qnemo
+        export.save_path=llama2-70b-base-fp8-qnemo
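The documentation sentence updated above ties the torchrun launch size to the model-parallel settings; as a quick sanity check of the example command (a minimal sketch using the values shown above, assuming the world size on a single node is tensor parallel times pipeline parallel):

# Values from the documented torchrun example above.
tensor_model_parallel_size = 8
pipeline_model_parallel_size = 1

# Assumed relation: processes launched per node must cover TP * PP ranks.
nproc_per_node = tensor_model_parallel_size * pipeline_model_parallel_size
assert nproc_per_node == 8  # matches "torchrun --nproc-per-node 8" in the docs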
17 changes: 9 additions & 8 deletions examples/nlp/language_modeling/conf/megatron_quantization.yaml
@@ -20,9 +20,14 @@ trainer:
   precision: bf16 # 16, 32, or bf16
   enable_checkpointing: false

+model:
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+
 quantization:
-  quantize_bmm1: false
-  algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null
+  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  algorithm: fp8 # null, int8, int8_sq, fp8, int4_awq, w4a8_awq, int4
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors in AWQ algorithm
@@ -31,9 +36,5 @@ export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: bf16 # Default precision data type
-
-model_file: llama2-7b-fp16.nemo # Nemo file path
-model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
-tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 1
+  dtype: ${trainer.precision} # Default precision data type
+  save_path: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
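With this restructuring, the checkpoint path and parallelism settings live under the model section and the output path under export, so downstream code reads them as nested keys. A minimal sketch of how the new layout is consumed with OmegaConf (illustrative only; the path and values are the ones from the YAML above):

from omegaconf import OmegaConf

# Load the restructured config and read the nested keys introduced by this commit.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_quantization.yaml")
print(cfg.model.restore_from_path)           # llama2-7b-fp16.nemo
print(cfg.model.tensor_model_parallel_size)  # 1
print(cfg.export.save_path)                  # llama2-7b-fp8.qnemo
print(cfg.export.dtype)                      # resolves ${trainer.precision} -> bf16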
35 changes: 26 additions & 9 deletions examples/nlp/language_modeling/megatron_quantization.py
@@ -15,9 +15,15 @@
 import torch
 import torch.multiprocessing as mp
 from datasets import load_dataset
+from omegaconf import open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+from tqdm import tqdm

+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
 from nemo.core.config import hydra_runner
 from nemo.export.quantize import Quantizer
+from nemo.utils.model_utils import load_config

 mp.set_start_method("spawn", force=True)

@@ -31,11 +37,11 @@
 Example usage:
 ```
 python examples/nlp/language_modeling/megatron_quantization.py \
-    model_file=llama2-7b-fp16.nemo \
-    model_save=llama2-7b-fp8.qnemo \
+    model.restore_from_path=llama2-7b-fp16.nemo \
     quantization.algorithm=fp8 \
     export.decoder_type=llama \
     export.inference_tensor_parallel=1
+    export.save_path=llama2-7b-fp8.qnemo \
 ```
 """

@@ -64,7 +70,18 @@ def main(cfg) -> None:
     if not torch.cuda.is_available():
         raise EnvironmentError("GPU is required for the inference.")

-    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
+    # Overwrite model config with the one from the model checkpoint and apply quantization modifications
+    model_cfg = load_config(cfg.model.restore_from_path)
+    with open_dict(model_cfg):
+        for key, val in cfg.model.items():
+            model_cfg[key] = val
+    model_cfg = Quantizer.modify_model_config(model_cfg)
+
+    trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
+    model = MegatronGPTModel.restore_from(
+        restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer
+    )
+    model.freeze()

     # Quantization algorithm can be set to None. This is useful for baseline precision
     # accuracy validation. In this case only weights export step will be performed:
@@ -76,14 +93,14 @@
             cfg.inference.max_context_length,
         )
         dataloader = [data for data in dataloader]
-    else:
-        dataloader = None

-    model = quantizer.quantize(
-        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
-    )
+    def forward_loop(model):
+        for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")):
+            model.predict_step(batch, i)
+
+    model = Quantizer.quantize(model, forward_loop, cfg.quantization, cfg.inference)

-    quantizer.export(model, cfg.model_save)
+    Quantizer.export(model, cfg.export)


 if __name__ == '__main__':
(The diffs for the remaining changed files did not load and are not shown here.)
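Per the commit title, the practical effect of the refactor is that Quantizer now operates on an already-constructed model through static methods (modify_model_config, quantize, export), so the same calls can be reused outside this PTQ script, for example from a QAT flow. The sketch below is illustrative only: the Quantizer calls mirror the ones in the diff above, while run_qat_finetuning and the config arguments are hypothetical placeholders rather than APIs added by this commit.

from nemo.export.quantize import Quantizer


def quantize_and_finetune(model, calib_dataloader, quantization_cfg, inference_cfg, export_cfg, run_qat_finetuning):
    """Hypothetical QAT-style reuse of the refactored Quantizer API (sketch only)."""

    def forward_loop(model):
        # Same calibration pattern as in megatron_quantization.py above.
        for i, batch in enumerate(calib_dataloader):
            model.predict_step(batch, i)

    # Insert quantizers and calibrate the in-memory model; no .nemo reload required.
    model = Quantizer.quantize(model, forward_loop, quantization_cfg, inference_cfg)

    # User-supplied fine-tuning step that trains with the quantizers in place (the QAT part).
    model = run_qat_finetuning(model)

    # Export the quantized (and now fine-tuned) model, as the PTQ script does.
    Quantizer.export(model, export_cfg)
    return model

The Quantizer.modify_model_config step from the script above would likewise be applied to the model config before the model is built, so a QAT recipe can construct its model the same way the PTQ script now does.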