Skip to content

Commit

Permalink
Debug Transformer Engine FP8 support with Megatron-core infrastructure (#6791)

Browse files Browse the repository at this point in the history

* Construct FP8 amax reduction group

Signed-off-by: Tim Moon <[email protected]>

* Update Megatron-core version in CI

Signed-off-by: Tim Moon <[email protected]>

---------

Signed-off-by: Tim Moon <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
  • Loading branch information
3 people authored Jun 4, 2023
1 parent 8f26d83 commit 5c3ed94
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
9 changes: 9 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ pipeline {
}
}

// Install Megatron-core from source so Transformer Engine FP8 support
// (amax reduction group construction) is available in CI.
// The checkout is pinned to an exact commit for reproducible builds.
stage('Megatron Core installation') {
  steps {
    // FIX: repository host was scrape-mangled to "github.com";
    // the canonical NVIDIA repository lives on github.com.
    sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
        cd Megatron-LM && \
        git checkout cd2537d444792b487b1ab5a6fa685e09c9957409 && \
        pip install -e .'
  }
}

stage('PyTorch Lightning version') {
steps {
sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"'
Expand Down
3 changes: 2 additions & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None:
pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size,
pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
use_fp8=app_state.use_fp8,
)

# assert that fake tp and pp rank match after model parallel init
Expand Down Expand Up @@ -406,7 +407,7 @@ class PEFTSaveRestoreConnector(NLPSaveRestoreConnector):
peft_model_nemo_path: Used to provide the .nemo file corresponding to a PEFT model (which will only contain a small set of params)
peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFT model. This is required when no .nemo is available (yet) such as during resumed training.
peft_model_ckpt_name: The filename of the ckpt file inside the peft_model_ckpt_path folder
If both are provided the peft_model_ckpt_path takes precedence.
If both are provided the peft_model_ckpt_path takes precedence.
If neither are provided, PEFT params are initialized at random (not loaded from any external source).
"""

Expand Down

0 comments on commit 5c3ed94

Please sign in to comment.