From 6764327df75bee19d882279b80d0d439697f9bb1 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 26 Feb 2024 07:37:19 -0800
Subject: [PATCH] MoE parameter passing (#8490)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* MoE parameter passing (#8255)
* MoE parameter passing Signed-off-by: Alexandros Koumparoulis
* Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis
* PR fixes Signed-off-by: Alexandros Koumparoulis
* Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis
* CI fix Signed-off-by: Alexandros Koumparoulis
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
---------
Signed-off-by: Alexandros Koumparoulis
Co-authored-by: Alexandros Koumparoulis
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Alexandros Koumparoulis
* Jiaqiz/option to disable adapters & merge all lora layers (#8029)
* Added LoRA support for the Dense layer of Attention
* Added LoRA MLP support to MCore and NeMo models.
* Change LoRA config default to QKV.
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Fixed bug with ddp training.
* use adapter only when it is enabled Signed-off-by: jiaqi zeng
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* fix lora merge script (#8113) Signed-off-by: Chen Cui Co-authored-by: Adi Renduchintala
* add peft ckpt to nemo Signed-off-by: Jiaqi Zeng
* merge lora weights for all layers, mcore only Signed-off-by: Jiaqi Zeng
* support/fix cpu initialization Signed-off-by: Chen Cui
* add example usage Signed-off-by: Chen Cui
* fix TP due to distributed checkpoint Signed-off-by: Chen Cui
* updating the logic of merging lora weights for all layers, mcore only Signed-off-by: Jiaqi Zeng
* MCoreMixin chages.
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * merge in fp32 then cast back Signed-off-by: Jiaqi Zeng * remove ckpt to nemo Signed-off-by: Jiaqi Zeng * fix import Signed-off-by: Jiaqi Zeng --------- Signed-off-by: jiaqi zeng Signed-off-by: Chen Cui Signed-off-by: Jiaqi Zeng Co-authored-by: Tugrul Konuk Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adi Renduchintala Co-authored-by: Chen Cui Signed-off-by: Alexandros Koumparoulis * Update k2 version (#8478) Signed-off-by: Vladimir Bataev Signed-off-by: Alexandros Koumparoulis * Add mcore full TE transformer layer spec (#8328) * Add spec and implement autocast layer Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jan Baczek * remove try-catchs, these dependecies are mandatory for this file Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jan Baczek * Check out this cool try/except clause Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused import Signed-off-by: Jan Baczek * Add import tests to Jenkinsfile Signed-off-by: Jan Baczek * Move import tests to Jenkins and remove code that is developed only for passing tests Signed-off-by: Jan Baczek * Make test robust to faulty base configs Signed-off-by: Jan Baczek * Use proper GPT implementation in the test Signed-off-by: Jan Baczek * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Sudhakar Singh Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Sudhakar Singh Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Jaemin Choi Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Jaemin Choi Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Add TE knobs to the copy of AutocastTransformerLayer Signed-off-by: Jan Baczek * Add TE knobs to the copy of AutocastTransformerLayer Signed-off-by: Jan Baczek * Add dummy parameter to accomodated for the changes in mcore Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update mcore to 0.5.0 in Jenkins pipeline Signed-off-by: Jan Baczek * Bump mcore commit. This is commit from tot, not any release. 
Signed-off-by: Jan Baczek * Remove from the test config option that is incompatible with bias_activation_fusion Signed-off-by: Jan Baczek * Bump TE version in CI to 1.4 Signed-off-by: Jan Baczek * Update test Signed-off-by: Jan Baczek * Change precision for the test - current runnens don't support bf16 Signed-off-by: Jan Baczek --------- Signed-off-by: Jan Baczek Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sudhakar Singh Co-authored-by: Jaemin Choi Signed-off-by: Alexandros Koumparoulis * Add mcore full TE transformer layer spec (#8328) * Add spec and implement autocast layer Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jan Baczek * remove try-catchs, these dependecies are mandatory for this file Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jan Baczek * Check out this cool try/except clause Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused import Signed-off-by: Jan Baczek * Add import tests to Jenkinsfile Signed-off-by: Jan Baczek * Move import tests to Jenkins and remove code that is developed only for passing tests Signed-off-by: Jan Baczek * Make test robust to faulty base configs Signed-off-by: Jan Baczek * Use proper GPT implementation in the test Signed-off-by: Jan Baczek * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Sudhakar Singh Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Sudhakar Singh Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Jaemin Choi Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py Co-authored-by: Jaemin Choi Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> * Add TE knobs to the copy of AutocastTransformerLayer Signed-off-by: Jan Baczek * Add TE knobs to the copy of AutocastTransformerLayer Signed-off-by: Jan Baczek * Add dummy parameter to accomodated for the changes in mcore Signed-off-by: Jan Baczek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update mcore to 0.5.0 in Jenkins pipeline Signed-off-by: Jan Baczek * Bump mcore commit. This is commit from tot, not any release. 
Signed-off-by: Jan Baczek * Remove from the test config option that is incompatible with bias_activation_fusion Signed-off-by: Jan Baczek * Bump TE version in CI to 1.4 Signed-off-by: Jan Baczek * Update test Signed-off-by: Jan Baczek * Change precision for the test - current runnens don't support bf16 Signed-off-by: Jan Baczek --------- Signed-off-by: Jan Baczek Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sudhakar Singh Co-authored-by: Jaemin Choi * Handle float limit_val_batches (#8426) * Handle float limit_val_batches Signed-off-by: Abhishree * Rectify reconfiguration of float limit_val_batches Signed-off-by: Abhishree * Remove unused imports Signed-off-by: Abhishree * Scale len(val_dataloader) with float limit_val_batches Signed-off-by: Abhishree * Return len(dataloader) in microbatches Signed-off-by: Abhishree * Add back resetting of num val samples Signed-off-by: Abhishree * Fix to ensure float limit_val_batches is multiple of num_micro_batches Signed-off-by: Abhishree * Remove forcing eval samples to 1 for float limit_val_batches Signed-off-by: Abhishree * Fix bug wrt 0 limiot_val_batches Signed-off-by: Abhishree * Add missing mock_dataset line Signed-off-by: Abhishree * Avoid ensuring limit_val_batches is a mutliple of microbatches for 1.0 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restore the hack forcing number of validation and test epochs to 1 Signed-off-by: Jan Baczek * Change limit_val_batches to 1.0 for GPT pretraining test. The integer value is covered in other tests Signed-off-by: Jan Baczek --------- Signed-off-by: Abhishree Signed-off-by: Jan Baczek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jan Baczek Signed-off-by: Alexandros Koumparoulis * Fix tutorial links in user guide (#8497) Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis * Sequence Parallel for LoRA (#8369) * support lora + sequence parallel Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add more comments Signed-off-by: Chen Cui * add lora SP CI test Signed-off-by: Chen Cui * support lora for all linear modules as in #7988 Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alexandros Koumparoulis * Call proper method to replace (#8498) Signed-off-by: Naga Venkatesh Gavini Signed-off-by: Alexandros Koumparoulis * Added memory logger (#8395) * Added memory logger Signed-off-by: Selvaraj Anandaraj * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: Alexandros Koumparoulis * Canary refactor for Riva (#8363) * initial commit of bleu score tracking Signed-off-by: Travis Bartley * initial commit, refactoring aed models for riva Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updating Canary to 
support torch metrics Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style fixes Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * missed an empty batch conditional Signed-off-by: Travis Bartley * Fixing dataloader issues Signed-off-by: Travis Bartley * Finishing merge conflict with transcribe update Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style fix Signed-off-by: Travis Bartley * copyright header fix Signed-off-by: Travis Bartley * yet another merge conflict Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * making paired data management safer Signed-off-by: Travis Bartley * sentencepiece needs bigger tokenizer... Signed-off-by: Travis Bartley * sentencepiece tokenizer vocab needs to be +2 from vocab for canary Signed-off-by: Travis Bartley * Update canary tokenizer to be more generic, updated metrics to manage special tokens removal themselves. Signed-off-by: Travis Bartley * merge conflit Signed-off-by: Travis Bartley * Simplified tokenizer and corrected bug in dataloader Signed-off-by: Travis Bartley * Cleaning up docstrings and fixing inference bug. Signed-off-by: Travis Bartley * adding example scripts Signed-off-by: Travis Bartley * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * cleaning up useless imports Signed-off-by: Travis Bartley * adding unit tests Signed-off-by: Travis Bartley * fixing unit tests Signed-off-by: Travis Bartley * cfg name change Signed-off-by: Travis Bartley * adding custom check to pass pytests Signed-off-by: Travis Bartley * removing print script Signed-off-by: Travis Bartley * catching bugs regarding tokens. 
Signed-off-by: Travis Bartley * added docstrings and made examples scripts more generic Signed-off-by: Travis Bartley * docstring deleted by accident Signed-off-by: Travis Bartley * plurals in namespace Signed-off-by: Travis Bartley * changing example script Signed-off-by: Travis Bartley --------- Signed-off-by: Travis Bartley Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Signed-off-by: Alexandros Koumparoulis * add alpha scaling to lora (#8248) * removed pdeprecated eft model Signed-off-by: arendu * add alpha Signed-off-by: arendu * default for alpha Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add alpha scaling to lora (#8483) * coldfix (#8412) Signed-off-by: George Zelenfroynd Signed-off-by: Michal Futrega * Fixed errors in the CTM gen functions (#8416) (#8420) Signed-off-by: Taejin Park Co-authored-by: Taejin Park Signed-off-by: Michal Futrega * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) (#8367) * Add change_vocabulary and save_tokenizers() support * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Signed-off-by: Michal Futrega * fix path location and branch (#8314) * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * updat ebranch in tutorial Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Koluguri Signed-off-by: Michal Futrega * Add TP comm overlap knobs to AutocastTransformerLayer (#8290) Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Signed-off-by: Michal Futrega * add deallocate pipeline output optimization (#8279) (#8318) * add deallocate pipeline output optimization * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega * remove assertion (#8302) (#8321) Signed-off-by: dimapihtar Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Michal Futrega * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) (#8346) Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * Enable megatron core loggers for GPT pretraining (#8354) (#8384) * Logging changes tested for gpt_pretraining * Additional args * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * Fix dreambooth data sampler issue (#8400) (#8413) * Turn on drop last * Some neva fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega * add ensemble decoding fix (#8427) (#8433) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Signed-off-by: Michal Futrega * NeVA Tutorial Notebook (#8217) * init commit - neva tutorial Signed-off-by: Pratyush Muthukumar * NeVA tutorial notebook Signed-off-by: Pratyush Muthukumar * init commit - neva tutorial Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * NeVA tutorial notebook Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * add inference via script Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar * add codeblocks to run torchrun in notebook Signed-off-by: Pratyush Muthukumar --------- Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Co-authored-by: Pratyush Muthukumar Signed-off-by: Michal Futrega * mcore customization doc minor fix (#8421) (#8437) Signed-off-by: Huiying Li Co-authored-by: Huiying Signed-off-by: Michal Futrega * Add `loop_labels` algorithm for TDT greedy decoding (#8215) * Add `loop_labels` algorithm for TDT greedy decoding Signed-off-by: Vladimir Bataev * Use `loop_labels` by default Signed-off-by: Vladimir Bataev * Loop labels greedy decoding v2 Signed-off-by: Vladimir Bataev * Add comments. Clean up Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Add tests for batched hypotheses Signed-off-by: Vladimir Bataev * Add tests for batched alignments Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Fix comment Signed-off-by: Vladimir Bataev * Fix test Signed-off-by: Vladimir Bataev * Add computer for TDT Signed-off-by: Vladimir Bataev * Fix TDT decoding algorithm Signed-off-by: Vladimir Bataev * Use loop frames by default for TDT Signed-off-by: Vladimir Bataev * Remove "loop frames" implementation for TDT Signed-off-by: Vladimir Bataev * Clean up Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Fix confidence. Use tensor for durations. Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Michal Futrega * Add dist ckpt support for regular optimizers (#7749) (#8293) * Add dist ckpt support for regular optimizers * [tutorial] fixed missing RIR scripts file. 
(#8257) * fix imports * imports fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook * revert asr notebook --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega * Multimodal r1.23.0 bug fix (#8315) (#8339) * Rename quick-gelu * ddpm config guard * Fix ddpm edit api * Fix insert_image_token cfg issue * neva updates * reformat * Add back jenkins * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs * Update default neva template --------- Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega * mcore ds fix (#8283) (#8385) * [tutorial] fixed missing RIR scripts file. (#8257) * add values to en tts dict (#7879) * mcore ds fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore * revert asr files * add comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset * update mcore version * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg * update mcore commit * fix Bert unit tests * update bert tests * fix bert mcore test * fix gpt jenkins tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits * revert apex installation * turn off the fusion for jenkins --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * MCore dataset compatibility for tokenizers (#8390) (#8397) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. 
--------- Signed-off-by: Valerie Sarge Co-authored-by: Valerie Sarge Co-authored-by: Pablo Garay Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * Canary: inference tokenization improvements; preserving custom keys when creating tarred manifests (#8432) * Improvements for Canary: - carry over custom keys when creatin tarred manifests - selectable text field in ASR eval - get rid of prompt slicing, create proper inference prompts Signed-off-by: Piotr Żelasko * set ensure_ascii=False in tarred conversion to avoid breaking tokenizers trained on UTF-8 encoding Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: Michal Futrega * add sbert to IR (#8445) * add sbert to IR Signed-off-by: ataghibakhsh * add doc Signed-off-by: ataghibakhsh * fix the auto_tokenizer property method reset bug Signed-off-by: ataghibakhsh * addressed bot comments Signed-off-by: ataghibakhsh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: ataghibakhsh Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega * Update readme (#8440) * update Signed-off-by: eharper * udpate Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * landing pages added * landing page added for vision * landing pages updated * some minor changes to the main readme * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * typo fixed * update Signed-off-by: eharper --------- Signed-off-by: eharper Co-authored-by: ntajbakhsh Signed-off-by: Michal Futrega * NeMo-Mistral to HF converter bugfix. (#8353) (#8442) Signed-off-by: Alexandros Koumparoulis Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Signed-off-by: Michal Futrega * Fixing mcore bert for TP, PP and SP (#8336) (#8443) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile * Update Jenkinsfile * Update Jenkinsfile --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * Add LoRA support to all linear layers (#7988) * Added LoRA support for the Dense layer of Attention * Added LoRA MLP support to MCore and NeMo models. * Change LoRA config default to QKV. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed bug with ddp training. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MCoreMixin chages. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * using new commit of meg-LM Signed-off-by: arendu * add cpu_offloading_num_layers to conversion script until bug in megatron is fixed Signed-off-by: Chen Cui * fix peft mixin arguments to follow mcore 0.5 Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update megatron commit to fix ci error Signed-off-by: Chen Cui * try to fix ci Signed-off-by: Chen Cui * try to fix ci Signed-off-by: Chen Cui * add cfg default Signed-off-by: Chen Cui --------- Signed-off-by: Adi Renduchintala Signed-off-by: Jiaqi Zeng Signed-off-by: arendu Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adi Renduchintala Co-authored-by: Jiaqi Zeng Co-authored-by: arendu Co-authored-by: HeyyyyyyG <49757268+HeyyyyyyG@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Eric Harper Signed-off-by: Michal Futrega * Add Neva Template for NV-DPO Models (#8358) * add/rename from nvgpt to nv_steerlm, add nv_dpo template Signed-off-by: HuiyingLi * add nv_dpo conversation to accomendate empty system message Signed-off-by: HuiyingLi * handle nv_dpo template text generation Signed-off-by: HuiyingLi * add prompt string to nvgpt Signed-off-by: HuiyingLi * bugfix for inference prompt template Signed-off-by: HuiyingLi * bug fix for grabbing clean text Signed-off-by: Huiying Li * fix code format Signed-off-by: Huiying Li --------- Signed-off-by: HuiyingLi Signed-off-by: Huiying Li Signed-off-by: Michal Futrega * Rebase scaling alpha Signed-off-by: Michal Futrega * default for alpha Signed-off-by: arendu Signed-off-by: Michal Futrega * Rebase scaling alpha Signed-off-by: Michal Futrega --------- Signed-off-by: George Zelenfroynd Signed-off-by: Michal Futrega Signed-off-by: Taejin Park Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jaemin Choi Signed-off-by: Jimmy Zhang Signed-off-by: dimapihtar Signed-off-by: Sangkug Lym Signed-off-by: Aishwarya Bhandare Signed-off-by: yaoyu-33 Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Huiying Li Signed-off-by: Vladimir Bataev Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: Valerie Sarge Signed-off-by: Piotr Żelasko Signed-off-by: ataghibakhsh Signed-off-by: eharper Signed-off-by: Alexandros Koumparoulis Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Adi Renduchintala Signed-off-by: Jiaqi Zeng Signed-off-by: arendu Signed-off-by: Chen Cui Signed-off-by: HuiyingLi Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Nithin Rao Co-authored-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper Co-authored-by: 
ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Pratyush Muthukumar <30813477+PannuMuthu@users.noreply.github.com> Co-authored-by: Pratyush Muthukumar Co-authored-by: Huiying Co-authored-by: Vladimir Bataev Co-authored-by: mikolajblaz Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: ntajbakhsh Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Tugrul Konuk Co-authored-by: Adi Renduchintala Co-authored-by: Jiaqi Zeng Co-authored-by: arendu Co-authored-by: HeyyyyyyG <49757268+HeyyyyyyG@users.noreply.github.com> Co-authored-by: Chen Cui --------- Signed-off-by: arendu Signed-off-by: George Zelenfroynd Signed-off-by: Michal Futrega Signed-off-by: Taejin Park Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jaemin Choi Signed-off-by: Jimmy Zhang Signed-off-by: dimapihtar Signed-off-by: Sangkug Lym Signed-off-by: Aishwarya Bhandare Signed-off-by: yaoyu-33 Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Huiying Li Signed-off-by: Vladimir Bataev Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: Valerie Sarge Signed-off-by: Piotr Żelasko Signed-off-by: ataghibakhsh Signed-off-by: eharper Signed-off-by: Alexandros Koumparoulis Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Adi Renduchintala Signed-off-by: Jiaqi Zeng Signed-off-by: Chen Cui Signed-off-by: HuiyingLi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Michal Futrega Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: Somshubra Majumdar Co-authored-by: Piotr Żelasko Co-authored-by: Nithin Rao Co-authored-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Pratyush Muthukumar <30813477+PannuMuthu@users.noreply.github.com> Co-authored-by: Pratyush Muthukumar Co-authored-by: Huiying Co-authored-by: Vladimir Bataev Co-authored-by: mikolajblaz Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: ntajbakhsh Co-authored-by: akoumpa 
<153118171+akoumpa@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Tugrul Konuk Co-authored-by: Jiaqi Zeng Co-authored-by: HeyyyyyyG <49757268+HeyyyyyyG@users.noreply.github.com> Co-authored-by: Chen Cui Signed-off-by: Alexandros Koumparoulis * Update PEFT Doc (#8501) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * revert accidental commit Signed-off-by: Chen Cui * revert accidental commit Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: Alexandros Koumparoulis * release updates (#8394) * release updates (#8378) * [tutorial] fixed missing RIR scripts file. (#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * Update megatron_gpt_model.py Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: 
Pablo Garay Co-authored-by: Eric Harper Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: jiaqi zeng Signed-off-by: Chen Cui Signed-off-by: Jiaqi Zeng Signed-off-by: Vladimir Bataev Signed-off-by: Jan Baczek Signed-off-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: yaoyu-33 Signed-off-by: Naga Venkatesh Gavini Signed-off-by: Selvaraj Anandaraj Signed-off-by: Travis Bartley Signed-off-by: arendu Signed-off-by: George Zelenfroynd Signed-off-by: Michal Futrega Signed-off-by: Taejin Park Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jaemin Choi Signed-off-by: Jimmy Zhang Signed-off-by: dimapihtar Signed-off-by: Sangkug Lym Signed-off-by: Aishwarya Bhandare Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Huiying Li Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: Valerie Sarge Signed-off-by: Piotr Żelasko Signed-off-by: ataghibakhsh Signed-off-by: eharper Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Adi Renduchintala Signed-off-by: HuiyingLi Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: HeyyyyyyG <49757268+HeyyyyyyG@users.noreply.github.com> Co-authored-by: Tugrul Konuk Co-authored-by: Adi Renduchintala Co-authored-by: Chen Cui Co-authored-by: Vladimir Bataev Co-authored-by: Alexandros Koumparoulis Co-authored-by: Sudhakar Singh Co-authored-by: Jaemin Choi Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Jan Baczek Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Naga Venkatesh Gavini Co-authored-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: Eric Harper Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Michal Futrega Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Co-authored-by: Jaemin Choi Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Pratyush Muthukumar <30813477+PannuMuthu@users.noreply.github.com> Co-authored-by: Pratyush Muthukumar Co-authored-by: Huiying Co-authored-by: mikolajblaz Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: ntajbakhsh Co-authored-by: Shanmugam 
Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Jiaqi Zeng
Signed-off-by: ataghibakhsh
---
 .../language_modeling/megatron_gpt_eval.py    | 21 ++++++++++--
 .../tuning/megatron_gpt_sft.py                |  1 +
 .../language_modeling/megatron_base_model.py  |  7 +++-
 .../modules/common/megatron/megatron_init.py  | 26 ++++++++++++++
 nemo/collections/nlp/parts/nlp_overrides.py   |  1 +
 nemo/utils/app_state.py                       | 34 +++++++++++++++++++
 6 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py
index e31c80dedee6..96cd75b546c1 100644
--- a/examples/nlp/language_modeling/megatron_gpt_eval.py
+++ b/examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -199,7 +199,9 @@ def main(cfg) -> None:

     assert (
         cfg.trainer.devices * cfg.trainer.num_nodes
-        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
+        == cfg.tensor_model_parallel_size
+        * cfg.pipeline_model_parallel_size
+        * max(1, cfg.get('expert_model_parallel_size', 1))
     ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

     if cfg.gpt_model_file:
@@ -224,6 +226,8 @@ def main(cfg) -> None:
                 # with dist checkpointing we can use the model parallel config specified by the user
                 pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size
                 pretrained_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
+                pretrained_cfg.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1)
+                pretrained_cfg.micro_batch_size = 1
             if trainer.precision == "16":
                 pretrained_cfg.megatron_amp_O2 = False
             elif trainer.precision in ['bf16', 'bf16-mixed'] and cfg.get('megatron_amp_O2', False):
@@ -237,13 +241,23 @@ def main(cfg) -> None:
         )
     elif cfg.checkpoint_dir:
         app_state = AppState()
-        if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
-            app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
+        if (
+            cfg.tensor_model_parallel_size > 1
+            or cfg.pipeline_model_parallel_size > 1
+            or cfg.get('expert_model_parallel_size', 1) > 1
+        ):
+            app_state.model_parallel_size = (
+                cfg.tensor_model_parallel_size
+                * cfg.pipeline_model_parallel_size
+                * cfg.get('expert_model_parallel_size', 1)
+            )
             app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size
             app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
+            app_state.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1)
             (
                 app_state.tensor_model_parallel_rank,
                 app_state.pipeline_model_parallel_rank,
+                app_state.expert_model_parallel_rank,
                 app_state.model_parallel_size,
                 app_state.data_parallel_size,
                 app_state.pipeline_model_parallel_split_rank,
@@ -254,6 +268,7 @@ def main(cfg) -> None:
                 tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
                 pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
                 pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
+                expert_model_parallel_size_=cfg.get('expert_model_parallel_size', 1),
             )
         checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)
         # checkpoint_path is a dir in case of distributed checkpointing
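
A note on the arithmetic behind the new assertion in megatron_gpt_eval.py: with expert parallelism (EP) enabled, the eval script expects exactly one rank per (tensor, pipeline, expert) parallel coordinate, so devices * num_nodes must equal TP * PP * EP. Below is a minimal standalone sketch of that check, separate from the patch; `cfg` is a plain dict stand-in for the script's Hydra/OmegaConf config, with the same key names.

# Sketch of the world-size check above (plain Python, hypothetical cfg dict).
def required_world_size(cfg: dict) -> int:
    return (
        cfg.get('tensor_model_parallel_size', 1)
        * cfg.get('pipeline_model_parallel_size', 1)
        * max(1, cfg.get('expert_model_parallel_size', 1))
    )

cfg = {'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'expert_model_parallel_size': 4}
world_size = 8  # devices * num_nodes
assert world_size == required_world_size(cfg), "devices * num_nodes must equal TP * PP * EP"
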
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
index 295685aacb97..44d0737ad44e 100644
--- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
+++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -73,6 +73,7 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         gpt_cfg.ffn_dropout = cfg.model.ffn_dropout
         gpt_cfg.use_flash_attention = cfg.model.get('use_flash_attention', False)
         gpt_cfg.tensor_model_parallel_size = cfg.model.get('tensor_model_parallel_size', 1)
+        gpt_cfg.expert_model_parallel_size = cfg.model.get('expert_model_parallel_size', 1)
         gpt_cfg.pipeline_model_parallel_size = cfg.model.get('pipeline_model_parallel_size', 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get('pipeline_model_parallel_split_rank', 0)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 6a2ea80ec764..685867be7b42 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -161,7 +161,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
         # Overrides used when converting checkpoints
         if os.environ.get(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, "false").lower() == "true":
             app_state = AppState()
-            init_world_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size
+            init_world_size = (
+                app_state.tensor_model_parallel_size
+                * app_state.pipeline_model_parallel_size
+                * (app_state.expert_model_parallel_size or 1)
+            )
             init_global_rank = app_state.global_rank
             init_local_rank = app_state.local_rank
         else:
@@ -186,6 +190,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
             global_rank=init_global_rank,
             local_rank=init_local_rank,
             tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
+            expert_model_parallel_size=cfg.get('expert_model_parallel_size', 1),
             pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
             virtual_pipeline_model_parallel_size=vp_size,
             pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
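
The two hunks above are configuration plumbing: the SFT script copies expert_model_parallel_size from the run config onto the restored model config, and the base model forwards it into initialize_model_parallel_for_nemo, defaulting to 1 when the key is absent. A rough illustration of that cfg.get(..., 1) pattern with OmegaConf follows; the configs here are hypothetical stand-ins, not the real NeMo objects.

# Illustration of the _modify_config-style copy above, using hypothetical configs.
from omegaconf import OmegaConf

run_cfg = OmegaConf.create(
    {"model": {"tensor_model_parallel_size": 2, "expert_model_parallel_size": 4}}
)
gpt_cfg = OmegaConf.create({})  # stands in for the restored model config

# Parallelism settings fall back to 1 when not specified in the run config.
gpt_cfg.tensor_model_parallel_size = run_cfg.model.get("tensor_model_parallel_size", 1)
gpt_cfg.expert_model_parallel_size = run_cfg.model.get("expert_model_parallel_size", 1)
gpt_cfg.pipeline_model_parallel_size = run_cfg.model.get("pipeline_model_parallel_size", 1)

print(OmegaConf.to_yaml(gpt_cfg))
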
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py
index 013838e7688e..5f402707fb59 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py
@@ -33,6 +33,8 @@
     from megatron.core import tensor_parallel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,
+        set_expert_model_parallel_rank,
+        set_expert_model_parallel_world_size,
         set_pipeline_model_parallel_rank,
         set_pipeline_model_parallel_split_rank,
         set_pipeline_model_parallel_world_size,
@@ -60,6 +62,7 @@ def initialize_model_parallel_for_nemo(
     global_rank,
     local_rank,
     tensor_model_parallel_size=1,
+    expert_model_parallel_size=1,
     pipeline_model_parallel_size=1,
     virtual_pipeline_model_parallel_size=None,
     pipeline_model_parallel_split_rank=None,
@@ -81,6 +84,7 @@ def initialize_model_parallel_for_nemo(
     app_state.global_rank = global_rank
     app_state.world_size = world_size
     app_state.local_rank = local_rank
+    app_state.expert_model_parallel_size = expert_model_parallel_size
     app_state.tensor_model_parallel_size = tensor_model_parallel_size
     app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
     app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size
@@ -90,6 +94,7 @@ def initialize_model_parallel_for_nemo(
     (
         app_state.tensor_model_parallel_rank,
         app_state.pipeline_model_parallel_rank,
+        app_state.expert_model_parallel_rank,
         app_state.model_parallel_size,
         app_state.data_parallel_size,
         app_state.pipeline_model_parallel_split_rank,
@@ -102,12 +107,16 @@ def initialize_model_parallel_for_nemo(
         virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size,
         pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank,
         context_parallel_size_=context_parallel_size,
+        expert_model_parallel_size_=expert_model_parallel_size,
     )

     # update apex.transformer globals
     set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size)
     set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank)

+    set_expert_model_parallel_world_size(app_state.expert_model_parallel_size)
+    set_expert_model_parallel_rank(app_state.expert_model_parallel_rank)
+
     set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank)
     if HAVE_INTERLEAVED:
         set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size)
@@ -179,6 +188,7 @@ def fake_initialize_model_parallel(
     pipeline_model_parallel_size_,
     pipeline_model_parallel_split_rank_=None,
     virtual_pipeline_model_parallel_size_=None,
+    expert_model_parallel_size_=1,
     context_parallel_size_=1,
 ):
     """
@@ -302,6 +312,21 @@ def fake_initialize_model_parallel(
     logging.info(f'All tensor model parallel group ranks: {all_tensor_model_parallel_group_ranks}')
     logging.info(f'Rank {rank} has tensor model parallel rank: {tensor_model_parallel_rank}')

+    # EP rank
+    expert_model_parallel_rank = 0
+    if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1:
+        tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size
+        num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size
+        tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size_
+        num_expert_groups: int = data_parallel_size // expert_model_parallel_size_
+        for i in range(num_tensor_and_data_groups):
+            for j in range(num_expert_groups):
+                start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size
+                end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size
+                ranks = range(start_rank, end_rank)
+                if rank in ranks:
+                    expert_model_parallel_rank = list(ranks).index(rank)
+
     # Build the pipeline model-parallel groups and embedding groups
     # (first and last rank in each pipeline model-parallel group).
     all_pipeline_model_parallel_group_ranks = []
@@ -340,6 +365,7 @@ def fake_initialize_model_parallel(
     return (
         tensor_model_parallel_rank,
         pipeline_model_parallel_rank,
+        expert_model_parallel_rank,
         model_parallel_size,
         data_parallel_size,
         pipeline_model_parallel_split_rank_,
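
For readers following the rank bookkeeping added to fake_initialize_model_parallel above: each tensor-and-data block of ranks is carved into tensor-and-expert groups of size TP * EP, with num_expert_groups = DP // EP because expert parallelism reuses part of the data-parallel dimension, and the value stored is the position of the global rank inside its tensor-and-expert group. The following is a self-contained sketch of the same loop, separate from the patch, with made-up sizes for illustration.

# Standalone mirror of the expert-rank computation in the hunk above.
# tp/dp/ep are tensor-, data- and expert-parallel sizes; pp is implicit in world_size.
def expert_rank(rank: int, world_size: int, tp: int, dp: int, ep: int) -> int:
    if ep is None or ep <= 1:
        return 0
    tensor_and_data_group_size = tp * dp
    num_tensor_and_data_groups = world_size // tensor_and_data_group_size
    tensor_and_expert_group_size = tp * ep
    num_expert_groups = dp // ep
    for i in range(num_tensor_and_data_groups):
        for j in range(num_expert_groups):
            start = i * tensor_and_data_group_size + j * tensor_and_expert_group_size
            ranks = range(start, start + tensor_and_expert_group_size)
            if rank in ranks:
                # position of this rank within its tensor-and-expert group
                return list(ranks).index(rank)
    return 0

# Example: 8 GPUs, TP=2, PP=1 -> DP=4; with EP=2 the groups are ranks 0-3 and 4-7.
assert expert_rank(0, 8, tp=2, dp=4, ep=2) == 0
assert expert_rank(5, 8, tp=2, dp=4, ep=2) == 1
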
diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py
index cde0188dff20..711129f04fe1 100644
--- a/nemo/collections/nlp/parts/nlp_overrides.py
+++ b/nemo/collections/nlp/parts/nlp_overrides.py
@@ -129,6 +129,7 @@ def init_model_parallel(sharp: bool, nccl_communicator_config_path: str = None)
                 context_parallel_size=app_state.context_parallel_size,
                 nccl_communicator_config_path=nccl_communicator_config_path,
                 use_sharp=sharp,
+                expert_model_parallel_size=app_state.expert_model_parallel_size,
             )

         # assert that fake tp and pp rank match after model parallel init
diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py
index eb6b6d91ba5e..8ba9880219ec 100644
--- a/nemo/utils/app_state.py
+++ b/nemo/utils/app_state.py
@@ -39,6 +39,7 @@ def __init__(self):
         self._local_rank = None
         self._global_rank = None
         self._tensor_model_parallel_rank = None
+        self._expert_model_parallel_rank = None
         self._pipeline_model_parallel_rank = None
         self._data_parallel_rank = None

@@ -46,6 +47,7 @@ def __init__(self):
         self._model_parallel_size = None
         self._tensor_model_parallel_size = None
         self._tensor_model_parallel_group = None
+        self._expert_model_parallel_size = None
         self._pipeline_model_parallel_size = None
         self._virtual_pipeline_model_parallel_size = None
         self._pipeline_model_parallel_group = None
@@ -141,6 +143,38 @@ def tensor_model_parallel_size(self, size):
         """
         self._tensor_model_parallel_size = size

+    @property
+    def expert_model_parallel_rank(self):
+        """ Property returns the expert model parallel rank.
+            Returns:
+                Tensor model parallel rank.
+        """
+        return self._expert_model_parallel_rank
+
+    @expert_model_parallel_rank.setter
+    def expert_model_parallel_rank(self, rank):
+        """ Property sets the expert model parallel rank.
+            Args:
+                rank (int): Tensor model parallel rank.
+        """
+        self._expert_model_parallel_rank = rank
+
+    @property
+    def expert_model_parallel_size(self):
+        """ Property returns the number of GPUs in each expert parallel group.
+            Returns:
+                Number of GPUs in each expert parallel group.
+        """
+        return self._expert_model_parallel_size
+
+    @expert_model_parallel_size.setter
+    def expert_model_parallel_size(self, size):
+        """ Property sets the number of GPUs in each expert parallel group.
+            Args:
+                size (int): Number of GPUs in each expert parallel group.
+        """
+        self._expert_model_parallel_size = size
+
     @property
     def pipeline_model_parallel_size(self):
         """ Property returns the number of GPUs in each model parallel group.
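
Finally, the AppState additions follow the same property/setter pattern as the existing tensor- and pipeline-parallel fields, and they stay None until initialize_model_parallel_for_nemo populates them; that is why megatron_base_model.py above guards with (app_state.expert_model_parallel_size or 1). A toy stand-in (not the real AppState singleton) showing that behaviour:

# Toy version of the new AppState fields; the real class is a singleton with many
# more parallel-state properties.
class MiniAppState:
    def __init__(self):
        self._expert_model_parallel_size = None  # unset until model parallel init

    @property
    def expert_model_parallel_size(self):
        """Number of GPUs in each expert-parallel group (None before initialization)."""
        return self._expert_model_parallel_size

    @expert_model_parallel_size.setter
    def expert_model_parallel_size(self, size):
        self._expert_model_parallel_size = size

app_state = MiniAppState()
assert (app_state.expert_model_parallel_size or 1) == 1  # safe default before init
app_state.expert_model_parallel_size = 4                 # set during model parallel init
assert app_state.expert_model_parallel_size == 4
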