Refactor Quantizer for reusing in QAT
Signed-off-by: Keval Morabia <[email protected]>
kevalmorabia97 committed May 22, 2024
1 parent d7bb403 commit 96fde3b
Showing 6 changed files with 176 additions and 194 deletions.
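At a glance, the user-facing change is a move from flat, top-level options to nested ones. The mapping below is an editorial summary compiled from the hunks that follow, not code from the commit:

# Old flat config/CLI keys -> new nested keys (summary of the renames in this commit).
KEY_RENAMES = {
    "model_file": "model.restore_from_path",
    "tensor_model_parallel_size": "model.tensor_model_parallel_size",
    "pipeline_model_parallel_size": "model.pipeline_model_parallel_size",
    "model_save": "export.save_path",
}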
20 changes: 10 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -368,9 +368,9 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.algorithm=null \
-            model_save=/home/TestData/nlp/megatron_llama/ci_baseline
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
         rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -395,15 +395,15 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            tensor_model_parallel_size=2 \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=fp8 \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
             export.inference_tensor_parallel=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
         rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -428,12 +428,12 @@ jobs:
       uses: actions/checkout@v4
     - run: |
         python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=int8_sq \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
         rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
@@ -459,14 +459,14 @@ jobs:
 #        uses: actions/checkout@v4
 #      - run: |
 #          python examples/nlp/language_modeling/megatron_quantization.py \
-#              model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-#              tensor_model_parallel_size=1 \
+#              model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+#              model.tensor_model_parallel_size=1 \
 #              trainer.devices=1 \
 #              quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
 #              quantization.algorithm=int4_awq \
 #              quantization.num_calib_size=8 \
 #              inference.batch_size=2 \
-#              model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
+#              export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #
 #          rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
8 changes: 4 additions & 4 deletions docs/source/nlp/quantization.rst
@@ -74,16 +74,16 @@ The script must be launched correctly with the number of processes equal to tens
 .. code-block:: bash

     torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_quantization.py \
-        model_file=llama2-70b-base-bf16.nemo \
-        tensor_model_parallel_size=8 \
-        pipeline_model_parallel_size=1 \
+        model.restore_from_path=llama2-70b-base-bf16.nemo \
+        model.tensor_model_parallel_size=8 \
+        model.pipeline_model_parallel_size=1 \
         trainer.num_nodes=1 \
         trainer.devices=8 \
         trainer.precision=bf16 \
         quantization.algorithm=fp8 \
         export.decoder_type=llama \
         export.inference_tensor_parallel=2 \
-        model_save=llama2-70b-base-fp8-qnemo
+        export.save_path=llama2-70b-base-fp8-qnemo
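The documentation sentence updated above ties the torchrun launch size to the model-parallel settings; as a quick sanity check of the example command (a minimal sketch using the values shown above, assuming the world size on a single node is tensor parallel times pipeline parallel):

# Values from the documented torchrun example above.
tensor_model_parallel_size = 8
pipeline_model_parallel_size = 1

# Assumed relation: processes launched per node must cover TP * PP ranks.
nproc_per_node = tensor_model_parallel_size * pipeline_model_parallel_size
assert nproc_per_node == 8  # matches "torchrun --nproc-per-node 8" in the docs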
17 changes: 9 additions & 8 deletions examples/nlp/language_modeling/conf/megatron_quantization.yaml
@@ -20,9 +20,14 @@ trainer:
   precision: bf16 # 16, 32, or bf16
   enable_checkpointing: false

+model:
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+
 quantization:
-  quantize_bmm1: false
-  algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null
+  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  algorithm: fp8 # null, int8, int8_sq, fp8, int4_awq, w4a8_awq, int4
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors in AWQ algorithm
@@ -31,9 +36,5 @@ export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: bf16 # Default precision data type
-
-model_file: llama2-7b-fp16.nemo # Nemo file path
-model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
-tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 1
+  dtype: ${trainer.precision} # Default precision data type
+  save_path: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
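With this restructuring, the checkpoint path and parallelism settings live under the model section and the output path under export, so downstream code reads them as nested keys. A minimal sketch of how the new layout is consumed with OmegaConf (illustrative only; the path and values are the ones from the YAML above):

from omegaconf import OmegaConf

# Load the restructured config and read the nested keys introduced by this commit.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_quantization.yaml")
print(cfg.model.restore_from_path)           # llama2-7b-fp16.nemo
print(cfg.model.tensor_model_parallel_size)  # 1
print(cfg.export.save_path)                  # llama2-7b-fp8.qnemo
print(cfg.export.dtype)                      # resolves ${trainer.precision} -> bf16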
35 changes: 26 additions & 9 deletions examples/nlp/language_modeling/megatron_quantization.py
@@ -15,9 +15,15 @@
 import torch
 import torch.multiprocessing as mp
 from datasets import load_dataset
+from omegaconf import open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+from tqdm import tqdm

+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
 from nemo.core.config import hydra_runner
 from nemo.export.quantize import Quantizer
+from nemo.utils.model_utils import load_config

 mp.set_start_method("spawn", force=True)

@@ -31,11 +37,11 @@
 Example usage:
 ```
 python examples/nlp/language_modeling/megatron_quantization.py \
-    model_file=llama2-7b-fp16.nemo \
-    model_save=llama2-7b-fp8.qnemo \
+    model.restore_from_path=llama2-7b-fp16.nemo \
     quantization.algorithm=fp8 \
     export.decoder_type=llama \
     export.inference_tensor_parallel=1
+    export.save_path=llama2-7b-fp8.qnemo \
 ```
 """

@@ -64,7 +70,18 @@ def main(cfg) -> None:
     if not torch.cuda.is_available():
         raise EnvironmentError("GPU is required for the inference.")

-    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
+    # Overwrite model config with the one from the model checkpoint and apply quantization modifications
+    model_cfg = load_config(cfg.model.restore_from_path)
+    with open_dict(model_cfg):
+        for key, val in cfg.model.items():
+            model_cfg[key] = val
+    model_cfg = Quantizer.modify_model_config(model_cfg)
+
+    trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
+    model = MegatronGPTModel.restore_from(
+        restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer
+    )
+    model.freeze()

     # Quantization algorithm can be set to None. This is useful for baseline precision
     # accuracy validation. In this case only weights export step will be performed:
@@ -76,14 +93,14 @@
             cfg.inference.max_context_length,
         )
         dataloader = [data for data in dataloader]
-    else:
-        dataloader = None

-    model = quantizer.quantize(
-        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
-    )
+    def forward_loop(model):
+        for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")):
+            model.predict_step(batch, i)
+
+    model = Quantizer.quantize(model, forward_loop, cfg.quantization, cfg.inference)

-    quantizer.export(model, cfg.model_save)
+    Quantizer.export(model, cfg.export)


 if __name__ == '__main__':
(The diffs for the remaining changed files did not load and are not shown here.)
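Per the commit title, the practical effect of the refactor is that Quantizer now operates on an already-constructed model through static methods (modify_model_config, quantize, export), so the same calls can be reused outside this PTQ script, for example from a QAT flow. The sketch below is illustrative only: the Quantizer calls mirror the ones in the diff above, while run_qat_finetuning and the config arguments are hypothetical placeholders rather than APIs added by this commit.

from nemo.export.quantize import Quantizer


def quantize_and_finetune(model, calib_dataloader, quantization_cfg, inference_cfg, export_cfg, run_qat_finetuning):
    """Hypothetical QAT-style reuse of the refactored Quantizer API (sketch only)."""

    def forward_loop(model):
        # Same calibration pattern as in megatron_quantization.py above.
        for i, batch in enumerate(calib_dataloader):
            model.predict_step(batch, i)

    # Insert quantizers and calibrate the in-memory model; no .nemo reload required.
    model = Quantizer.quantize(model, forward_loop, quantization_cfg, inference_cfg)

    # User-supplied fine-tuning step that trains with the quantizers in place (the QAT part).
    model = run_qat_finetuning(model)

    # Export the quantized (and now fine-tuned) model, as the PTQ script does.
    Quantizer.export(model, export_cfg)
    return model

The Quantizer.modify_model_config step from the script above would likewise be applied to the model config before the model is built, so a QAT recipe can construct its model the same way the PTQ script now does.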