From c1b198b5317b2cf0b7e11e2e5e33b425986b4f4b Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Mon, 20 May 2024 10:17:02 -0700 Subject: [PATCH 01/47] Add mel codec checkpoints (#9228) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add mel codec checkpoints Signed-off-by: Ante Jukić --------- Signed-off-by: Ante Jukić --- docs/source/tts/data/ngc_models_codec.csv | 2 ++ docs/source/tts/models.rst | 3 ++- nemo/collections/tts/models/audio_codec.py | 26 ++++++++++++++++++---- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/docs/source/tts/data/ngc_models_codec.csv b/docs/source/tts/data/ngc_models_codec.csv index d46567012600..6827c54ce7f4 100644 --- a/docs/source/tts/data/ngc_models_codec.csv +++ b/docs/source/tts/data/ngc_models_codec.csv @@ -1,2 +1,4 @@ Model Name,Dataset,Sampling Rate,Model Class,Overview,Checkpoint audio_codec_16khz_small,Libri-Light,16000Hz,nemo.collections.tts.models.AudioCodecModel,`audio_codec_16khz_small `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/audio_codec_16khz_small/versions/v1/files/audio_codec_16khz_small.nemo`` +mel_codec_22khz_medium,LibriVox and Common Voice,22050Hz,nemo.collections.tts.models.AudioCodecModel,`mel_codec_22khz_medium `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_medium/versions/v1/files/mel_codec_22khz_medium.nemo`` +mel_codec_44khz_medium,LibriVox and Common Voice,44100Hz,nemo.collections.tts.models.AudioCodecModel,`mel_codec_44khz_medium `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_medium/versions/v1/files/mel_codec_44khz_medium.nemo`` diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index 6f9d7d24c45d..7ea5caa4d871 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -140,9 +140,10 @@ Codecs Audio Codec ~~~~~~~~~~~ -The NeMo Audio Codec model is a non-autoregressive convolutional encoder-quantizer-decoder model for coding or tokenization of raw audio signals. +The NeMo Audio Codec model is a non-autoregressive convolutional encoder-quantizer-decoder model for coding or tokenization of raw audio signals or mel-spectrogram features. The NeMo Audio Codec model supports residual vector quantizer (RVQ) :cite:`tts-models-zeghidour2022soundstream` and finite scalar quantizer (FSQ) :cite:`tts-models-mentzer2023finite` for quantization of the encoder output. This model is trained end-to-end using generative loss, discriminative loss, and reconstruction loss, similar to other neural audio codecs such as SoundStream :cite:`tts-models-zeghidour2022soundstream` and EnCodec :cite:`tts-models-defossez2022encodec`. +For further information refer to the ``Audio Codec Training`` tutorial in the TTS tutorial section. .. 
image:: images/audiocodec_model.png :align: center diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 81fb7cb5cd7b..04a6d2793f88 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -118,7 +118,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # STFT loss setup stft_loss_log_guard = cfg.get("stft_loss_log_guard", 1.0) self.stft_loss_scale = cfg.get("stft_loss_scale", 0.0) - self.stft_loss_fn = MultiResolutionSTFTLoss(resolutions=loss_resolutions, log_guard=stft_loss_log_guard,) + self.stft_loss_fn = MultiResolutionSTFTLoss( + resolutions=loss_resolutions, + log_guard=stft_loss_log_guard, + ) # Time domain loss setup self.time_domain_loss_scale = cfg.get("time_domain_loss_scale", 1.0) @@ -237,7 +240,9 @@ def quantize(self, encoded: torch.Tensor, encoded_len: torch.Tensor) -> torch.Te "tokens": NeuralType(('B', 'C', 'T_encoded'), TokenIndex()), "tokens_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T_encoded'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T_encoded'), EncodedRepresentation()), + }, ) def dequantize(self, tokens: torch.Tensor, tokens_len: torch.Tensor) -> torch.Tensor: """Convert the discrete tokens into a continuous encoded representation. @@ -392,8 +397,7 @@ def _process_batch(self, batch): @property def disc_update_prob(self) -> float: - """Probability of updating the discriminator. - """ + """Probability of updating the discriminator.""" return self.disc_updates_per_period / self.disc_update_period def should_update_disc(self, batch_idx) -> bool: @@ -652,4 +656,18 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) models.append(model) + model = PretrainedModelInfo( + pretrained_model_name="mel_codec_22khz_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_medium/versions/v1/files/mel_codec_22khz_medium.nemo", + description="For details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_22khz_medium", + ) + models.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="mel_codec_44khz_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_medium/versions/v1/files/mel_codec_44khz_medium.nemo", + description="For details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_44khz_medium", + ) + models.append(model) + return models From 67f06aca159e1970f0df25ba0c69180536ea5a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 20 May 2024 20:24:51 +0200 Subject: [PATCH 02/47] ci: Remove duplicated job (#9258) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4efb525100d9..dbc7d907580a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -289,9 +289,6 @@ jobs: run: | rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo; rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/ - - name: Cleanup - if: "always()" - run: | rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" From 
5d4f6b2ea629ed7b89305a4a7d984b792af2139e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 May 2024 12:42:16 -0700 Subject: [PATCH 03/47] fix import (#9240) * fix import Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../nlp/language_modeling/megatron_lm_ckpt_to_nemo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py index 03d6fd94e4e2..72252a03d5be 100644 --- a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py @@ -291,9 +291,9 @@ def load_from_checkpoint( **kwargs, ): """ - Loads Megatron_LM checkpoints, convert it, with some maintenance of restoration. - For documentation, please refer to LightningModule.load_from_checkpoin() documentation. - """ + Loads Megatron_LM checkpoints, convert it, with some maintenance of restoration. + For documentation, please refer to LightningModule.load_from_checkpoin() documentation. + """ checkpoint = None try: cls._set_model_restore_state(is_being_restored=True) @@ -470,7 +470,7 @@ def convert(local_rank, rank, world_size, args): ) if mcore_output and not args.mcore_input: # convert from legacy Megatron-LM to MCore NeMo. Initialize an mcore translation dict - from scripts.nlp_language_modeling.convert_nemo_gpt_to_mcore import build_key_mapping + from scripts.checkpoint_converters.convert_gpt_nemo_to_mcore import build_key_mapping mcore_translate = {} for k, v in build_key_mapping(model_cfg).items(): From a69ace4f5ac5f72367852f78538ea6c9880c39b2 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 20 May 2024 16:38:19 -0700 Subject: [PATCH 04/47] Fix document links (#9260) Signed-off-by: yaoyu-33 --- docs/source/features/parallelisms.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index 9d5f33196c4e..d5e86e46a49d 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -44,7 +44,7 @@ Implement Tensor Parallelism NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_. -For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. +For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -87,7 +87,7 @@ Implement Pipeline Parallelism NeMo's implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. -For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. +For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. 
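To make the relationship between these settings concrete, here is a minimal sketch (an illustration, not part of this patch series; the 8-GPU layout and the specific sizes are assumptions) of how the tensor- and pipeline-parallel sizes combine with the device count in a NeMo-style config.

.. code-block:: python

    # Illustrative only: the config keys mirror the standard NeMo megatron configs
    # (tensor_model_parallel_size, pipeline_model_parallel_size); the sizes are assumed.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "trainer": {"devices": 8, "num_nodes": 1},
            "model": {
                "tensor_model_parallel_size": 4,    # TP degree
                "pipeline_model_parallel_size": 2,  # PP degree
            },
        }
    )

    # Each model-parallel replica spans TP * PP GPUs; the remaining factor of the
    # world size is the data-parallel size.
    world_size = cfg.trainer.devices * cfg.trainer.num_nodes
    model_parallel_size = (
        cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
    )
    assert world_size % model_parallel_size == 0
    print("data-parallel size:", world_size // model_parallel_size)
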
Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -132,7 +132,7 @@ To activate CP in the NeMo framework, set the ``context_parallel_size`` paramete context_parallel_size: 1 # Example to enable Context Parallelism -The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. +The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 2e1814c9f031ad2aeeebad44597365e97253d2c4 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 21 May 2024 09:56:38 -0400 Subject: [PATCH 05/47] Add TRT-LLM params like max_num_tokens and opt_num_tokens (#9210) * Add params like max_num_tokens and opt_num_tokens Signed-off-by: Onur Yilmaz * remove padding param added * update params like max_num_token Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * remove context context_fmha param for now Signed-off-by: Onur Yilmaz * add params like max num token to the script Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia Co-authored-by: Pablo Garay --- nemo/export/tensorrt_llm.py | 21 ++++++--- nemo/export/trt_llm/tensorrt_llm_build.py | 28 ++++++++++-- scripts/deploy/nlp/deploy_triton.py | 53 +++++++++++++++++++---- scripts/export/export_to_trt_llm.py | 19 +++++--- tests/export/test_nemo_export.py | 2 + 5 files changed, 100 insertions(+), 23 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index af4f1b6699ee..cad7b821b3b4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -117,15 +117,16 @@ def export( max_batch_size: int = 8, max_prompt_embedding_table_size=None, use_parallel_embedding: bool = False, - use_inflight_batching: bool = False, - enable_context_fmha: bool = True, - paged_kv_cache: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, dtype: str = "bfloat16", load_model: bool = True, enable_multi_block_mode: bool = False, use_lora_plugin: str = None, lora_target_modules: List[str] = None, max_lora_rank: int = 64, + max_num_tokens: int = None, + opt_num_tokens: int = None, save_nemo_model_config: bool = False, ): """ @@ -142,12 +143,18 @@ def export( max_output_token (int): max output length. max_batch_size (int): max batch size. max_prompt_embedding_table_size (int): max prompt embedding size. - use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend. - enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention. + use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. + remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). load_model (bool): load TensorRT-LLM model after the export. enable_multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. + use_lora_plugin (str): use dynamic lora or not. + lora_target_modules (List[str]): list of the target lora modules. + max_lora_rank (int): maximum lora rank. 
+ max_num_tokens (int): + opt_num_tokens (int): + save_nemo_model_config (bool): """ if model_type not in self.get_supported_models_list: @@ -238,6 +245,10 @@ def export( lora_target_modules=lora_target_modules, max_prompt_embedding_table_size=max_prompt_embedding_table_size, enable_multi_block_mode=enable_multi_block_mode, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index ac8d9094ea32..2336b8eb38ce 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -24,6 +24,7 @@ import tensorrt_llm import torch from tensorrt_llm import str_dtype_to_trt +from tensorrt_llm._common import check_max_num_tokens from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import BuildConfig, Builder from tensorrt_llm.commands.build import build as build_trtllm @@ -371,6 +372,12 @@ def build_and_save_engine( lora_target_modules=None, max_prompt_embedding_table_size=0, enable_multi_block_mode: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + max_num_tokens: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -383,15 +390,30 @@ def build_and_save_engine( plugin_config.set_gpt_attention_plugin(dtype=str_dtype) plugin_config.set_gemm_plugin(dtype=str_dtype) plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) - max_num_tokens = max_batch_size * max_input_len + if paged_kv_cache: + plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) + else: + plugin_config.paged_kv_cache = False + plugin_config.remove_input_padding = remove_input_padding + + max_num_tokens, opt_num_tokens = check_max_num_tokens( + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_beam_width=max_beam_width, + remove_input_padding=remove_input_padding, + enable_context_fmha=plugin_config.context_fmha, + tokens_per_block=tokens_per_block, + ) build_dict = { 'max_input_len': max_input_len, 'max_output_len': max_output_len, 'max_batch_size': max_batch_size, - 'max_beam_width': 1, + 'max_beam_width': max_beam_width, 'max_num_tokens': max_num_tokens, - 'opt_num_tokens': None, + 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, 'gather_context_logits': False, 'gather_generation_logits': False, diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index aa896e924584..7370731ec996 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -27,7 +27,8 @@ def get_args(argv): parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") parser.add_argument( @@ -73,6 +74,8 @@ def get_args(argv): parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") parser.add_argument("-mol", "--max_output_len", default=256, type=int, 
help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) @@ -80,11 +83,11 @@ def get_args(argv): "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( - "-dcf", - "--disable_context_fmha", + "-drip", + "--disable_remove_input_padding", default=False, action='store_true', - help="Disable fused Context MultiHeadedAttention (required for V100 support).", + help="Disables the remove input padding option.", ) parser.add_argument( "-mbm", @@ -101,7 +104,6 @@ def get_args(argv): '--use_lora_plugin', nargs='?', const=None, - default=False, choices=['float16', 'float32', 'bfloat16'], help="Activates the lora plugin which enables embedding sharing.", ) @@ -109,7 +111,16 @@ def get_args(argv): '--lora_target_modules', nargs='+', default=None, - choices=["attn_qkv", "attn_q", "attn_k", "attn_v", "attn_dense", "mlp_h_to_4h", "mlp_gate", "mlp_4h_to_h",], + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", ) parser.add_argument( @@ -198,6 +209,29 @@ def nemo_deploy(argv): trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) if args.nemo_checkpoint is not None: + + trt_llm_exporter.export( + nemo_checkpoint_path=args.nemo_checkpoint, + model_type=args.model_type, + n_gpus=args.num_gpus, + tensor_parallel_size=args.num_gpus, + pipeline_parallel_size=1, + max_input_token=args.max_input_len, + max_output_token=args.max_output_len, + max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + paged_kv_cache=args.use_paged_kv_cache, + remove_input_padding=(not args.disable_remove_input_padding), + dtype=args.dtype, + enable_multi_block_mode=args.multi_block_mode, + use_lora_plugin=args.use_lora_plugin, + lora_target_modules=args.lora_target_modules, + max_lora_rank=args.max_lora_rank, + save_nemo_model_config=True, + ) + try: LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") trt_llm_exporter.export( @@ -209,9 +243,11 @@ def nemo_deploy(argv): max_input_token=args.max_input_len, max_output_token=args.max_output_len, max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, paged_kv_cache=args.use_paged_kv_cache, - enable_context_fmha=not args.disable_context_fmha, + remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, @@ -236,7 +272,8 @@ def nemo_deploy(argv): ) ) trt_llm_exporter.add_prompt_table( - task_name=str(task_id), prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + task_name=str(task_id), + prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) except Exception as error: 
LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index 5e5833444f65..e9741516cf00 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -53,18 +53,20 @@ def get_args(argv): parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-uib", - "--use_inflight_batching", - default=False, - action='store_true', - help="Enable inflight batching for TensorRT-LLM Triton backend.", + "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-drip", + "--disable_remove_input_padding", + default=False, + action='store_true', + help="Disables the remove input padding option.", ) parser.add_argument( "-mbm", @@ -141,9 +143,12 @@ def nemo_export_trt_llm(argv): max_input_token=args.max_input_len, max_output_token=args.max_output_len, max_batch_size=args.max_batch_size, + max_num_tokens=args.max_num_tokens, + opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - use_inflight_batching=args.use_inflight_batching, paged_kv_cache=args.use_paged_kv_cache, + remove_input_padding=(not args.disable_remove_input_padding), + dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index 0e9981403a1a..b3e186433561 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -214,6 +214,8 @@ def run_trt_llm_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, + max_num_tokens=int(max_input_token * max_batch_size * 0.2), + opt_num_tokens=60, save_nemo_model_config=True, ) From c7bf46e88b404078f58c077f25a9b9180565d43d Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 21 May 2024 11:25:15 -0700 Subject: [PATCH 06/47] sum-reduce grad_norm in DP+CP domain (#9262) * sum-reudce grad_norm in DP+CP domain Signed-off-by: Sangkug Lym * Apply isort and black reformatting Signed-off-by: pablo-garay --------- Signed-off-by: Sangkug Lym Signed-off-by: pablo-garay Co-authored-by: Pablo Garay Co-authored-by: pablo-garay --- .../nlp/modules/common/megatron/clip_grads.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/clip_grads.py b/nemo/collections/nlp/modules/common/megatron/clip_grads.py index 7edc6720574e..b87c260ca4da 100644 --- a/nemo/collections/nlp/modules/common/megatron/clip_grads.py +++ 
b/nemo/collections/nlp/modules/common/megatron/clip_grads.py @@ -142,7 +142,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, use_fsdp=False): grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze() # Since we will be summing across data parallel groups, # we need the pow(norm-type). - total_norm = grad_norm ** norm_type + total_norm = grad_norm**norm_type if use_fsdp: if len(sharded_grads_for_norm) > 0: sharded_grad_norm, _ = multi_tensor_applier( @@ -150,20 +150,22 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, use_fsdp=False): ) else: sharded_grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze() - total_sharded_norm = sharded_grad_norm ** norm_type + total_sharded_norm = sharded_grad_norm**norm_type else: for grad in grads_for_norm: grad_norm = torch.norm(grad, norm_type) - total_norm += grad_norm ** norm_type + total_norm += grad_norm**norm_type if use_fsdp: for grad in sharded_grads_for_norm: grad_norm = torch.norm(grad, norm_type) - total_sharded_norm += grad_norm ** norm_type + total_sharded_norm += grad_norm**norm_type if use_fsdp: # Sum norm of grad shards across data-parallel GPUs. torch.distributed.all_reduce( - total_sharded_norm, op=torch.distributed.ReduceOp.SUM, group=parallel_state.get_data_parallel_group(), + total_sharded_norm, + op=torch.distributed.ReduceOp.SUM, + group=parallel_state.get_data_parallel_group(with_context_parallel=True), ) total_norm += total_sharded_norm.squeeze() From d7bb40364c17bf322004539f851cc83df4c4c2b7 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 21 May 2024 20:14:18 -0700 Subject: [PATCH 07/47] Add llama3 and distributed checkpoint support in NeVA (#9101) * temp save Signed-off-by: yaoyu-33 * temp save 2 Signed-off-by: yaoyu-33 * update code Signed-off-by: yaoyu-33 * enable seq packing Signed-off-by: yaoyu-33 * fix neva and clip Signed-off-by: yaoyu-33 * Enable parallel seq packing algo and few other fixes Signed-off-by: yaoyu-33 * Pipeline parallel support Signed-off-by: yaoyu-33 * Update data preprocess Signed-off-by: yaoyu-33 * fix few pp issues Signed-off-by: yaoyu-33 * enable sequence packing w/ PP Signed-off-by: yaoyu-33 * Fix cu_seqlens in inputs Signed-off-by: yaoyu-33 * add assert Signed-off-by: yaoyu-33 * Depend on PP to decide whether do padding Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add docstring Signed-off-by: yaoyu-33 * Fix few evaluation issues Signed-off-by: yaoyu-33 * Fix few PP evaluation issues Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add llama3 template Signed-off-by: yaoyu-33 * address comments Signed-off-by: yaoyu-33 * Fix license Signed-off-by: yaoyu-33 * Fix llama3 Signed-off-by: yaoyu-33 * Few fixes Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Few neva bugs Signed-off-by: yaoyu-33 * llama3 inference fix Signed-off-by: yaoyu-33 * Force vision encoder to run in fp32 Signed-off-by: yaoyu-33 * Revert "Force vision encoder to run in fp32" This reverts commit 9d2160d96cb3e2a27a18538950ef43b4482c04da. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Try adding distributed format of checkpoint Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Allow dist checkpoint to be non-strict Signed-off-by: yaoyu-33 * Fix Signed-off-by: yaoyu-33 * Some fixes for PP + dist ckpt in Neva Signed-off-by: yaoyu-33 * fix peft Signed-off-by: yaoyu-33 * few fixes for lora Signed-off-by: yaoyu-33 * checkpoint updates Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * bug fix Signed-off-by: yaoyu-33 * Add neva dist checkpoint converter Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * resolve comments Signed-off-by: yaoyu-33 * update neva dist ckpt apis Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix return Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yaoyu-33 --- .../neva/conf/neva_inference.yaml | 2 +- .../multimodal_llm/neva/eval/gradio_server.py | 1 + .../multimodal_llm/neva/eval/vqa_science.py | 1 - .../neva/neva_convert_to_dist_ckpt.py | 89 +++++++ .../multimodal/data/neva/conversation.py | 61 ++++- .../multimodal/data/neva/neva_dataset.py | 221 +++++++++++++++--- .../models/multimodal_llm/neva/neva_model.py | 144 ++++++++++-- nemo/collections/multimodal/parts/utils.py | 18 +- .../common/text_generation_strategy.py | 118 +++++++--- .../modules/common/text_generation_utils.py | 38 +-- .../parts/mixins/multimodal_adapter_mixins.py | 85 ++++--- .../nlp/parts/mixins/nlp_adapter_mixins.py | 44 +++- nemo/collections/nlp/parts/nlp_overrides.py | 4 +- nemo/utils/callbacks/dist_ckpt_io.py | 36 ++- 14 files changed, 690 insertions(+), 172 deletions(-) create mode 100644 examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml index 145575d8a73b..b06f4bd8e535 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml @@ -11,7 +11,7 @@ inference: compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False end_strings: ["","",] # generation will stop when one of these tokens is generated media_base_path: /pwd/images # /path/to/images or /path/to/videos - insert_media_token: left # `left` or `right` or `null` + insert_media_token: null # `left` or `right` or `null` media_type: image # `image` or `video` trainer: diff --git a/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py b/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py index 88cfdc4ed194..7c04a7045f00 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py +++ b/examples/multimodal/multimodal_llm/neva/eval/gradio_server.py @@ -20,6 +20,7 @@ from omegaconf import OmegaConf from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam CFG_STRING = """ trainer: diff --git a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py 
index a80c9e70f4ed..17bda5725eb4 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py +++ b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py @@ -169,7 +169,6 @@ def eval_model(args): parser.add_argument("--image-folder", type=str, default="") parser.add_argument("--question-file", type=str, default="tables/question.json") parser.add_argument("--answers-file", type=str, default="answer.jsonl") - parser.add_argument("--conv-mode", type=str, default="llava_v0") parser.add_argument("--tp", type=int, default=1) parser.add_argument("--pp", type=int, default=1) parser.add_argument("--num-chunks", type=int, default=1) diff --git a/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py b/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py new file mode 100644 index 000000000000..8891a8e9d208 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/neva_convert_to_dist_ckpt.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentParser +from omegaconf.omegaconf import OmegaConf + +from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_path", + type=str, + default=None, + required=True, + help="Path to NeMo legacy checkpoints", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--gpus_per_node", type=int, required=False, default=8) + parser.add_argument("--num_nodes", type=int, required=False, default=1) + parser.add_argument( + "--precision", + type=str, + required=False, + default='bf16-mixed', + choices=['32-true', '16-mixed', 'bf16-mixed'], + help="Precision value for the trainer that matches with precision of the ckpt", + ) + args = parser.parse_args() + return args + + +def main() -> None: + args = get_args() + cfg = { + 'trainer': { + 'devices': args.gpus_per_node, + 'num_nodes': args.num_nodes, + 'accelerator': 'gpu', + 'precision': args.precision, + }, + 'model': { + 'native_amp_init_scale': 2**32, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'gradient_as_bucket_view': True, + }, + 'cluster_type': 'BCP', + } + cfg = OmegaConf.create(cfg) + + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None + trainer = MegatronTrainerBuilder(cfg).create_trainer() + + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(args.input_path): + save_restore_connector.model_extracted_dir = args.input_path + + model = MegatronNevaModel.restore_from( + restore_path=args.input_path, + 
trainer=trainer, + save_restore_connector=save_restore_connector, + strict=False, + ) + + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 80a297a5b952..43b1977aa993 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import dataclasses +from collections import defaultdict from enum import Enum, auto from typing import List @@ -24,9 +25,14 @@ DEFAULT_SYSTEM_TOKEN = "" DEFAULT_SEPARATOR_TOKEN = "" DEFAULT_LABELS_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = defaultdict(lambda: "") +DEFAULT_IM_START_TOKEN = defaultdict(lambda: "") +DEFAULT_IM_END_TOKEN = defaultdict(lambda: "") + +# Update llama3 default +DEFAULT_IMAGE_PATCH_TOKEN["llama_3"] = "<|reserved_special_token_3|>" +DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" +DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" class SeparatorStyle(Enum): @@ -36,6 +42,7 @@ class SeparatorStyle(Enum): TWO = auto() PLAIN = auto() LLAMA_2 = auto() + LLAMA_3 = auto() NVGPT = auto() @@ -109,6 +116,34 @@ def get_prompt(self): else: ret += "" ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.LLAMA_3: + """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + """ + wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}" + wrap_user = lambda msg: f"<|start_header_id|>user<|end_header_id|>\n\n{msg}" + wrap_assistant = lambda msg: f"<|start_header_id|>assistant<|end_header_id|>\n\n{msg}" + + ret = "<|begin_of_text|>" + wrap_sys(self.system) + self.sep + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if type(message) is tuple: + message, _, _ = message + elif i % 2 == 0: + ret += wrap_user(message) + self.sep + else: + ret += wrap_assistant(message) + (self.sep if message else "") + elif self.sep_style == SeparatorStyle.PLAIN: seps = [self.sep, self.sep2] ret = self.system @@ -346,8 +381,25 @@ def dict(self): sep2=DEFAULT_EOS_TOKEN, ) +conv_llava_llama_3 = Conversation( + system="You are a helpful language and vision assistant. 
" + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llama_v3", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_3, + sep="<|eot_id|>", +) + conv_llava_plain = Conversation( - system="", roles=("", ""), messages=(), offset=0, sep_style=SeparatorStyle.PLAIN, sep="\n", + system="", + roles=("", ""), + messages=(), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", ) conv_llava_v0 = Conversation( @@ -416,6 +468,5 @@ def dict(self): "nv_dpo": conv_nv_dpo, } - if __name__ == "__main__": print(default_conversation.get_prompt()) diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index caaab2c5d67e..70afc5b4a19a 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -34,17 +34,11 @@ import nemo.collections.multimodal.data.neva.conversation as conversation_lib from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform from nemo.collections.multimodal.data.neva.conversation import ( - DEFAULT_BOS_TOKEN, - DEFAULT_EOS_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, - DEFAULT_PAD_TOKEN, - DEFAULT_SEPARATOR_TOKEN, - DEFAULT_SYSTEM_TOKEN, - DEFAULT_UNK_TOKEN, DEFAULT_VIDEO_TOKEN, ) from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids @@ -188,7 +182,10 @@ def flatten_frames(self, cap): def tokenize( - texts: Union[str, List[str]], tokenizer: Any, context_length: int, add_extra_token: int, + texts: Union[str, List[str]], + tokenizer: Any, + context_length: int, + add_extra_token: int, ) -> torch.LongTensor: """ Returns the tokenized representation of given input string(s). If the list of tokens exceeds the context @@ -216,7 +213,7 @@ def tokenize( if isinstance(texts, str): texts = [texts] texts_is_str = True - tokens = tokenizer.text_to_ids(texts) + tokens = [tokenizer.text_to_ids(t) for t in texts] max_len = max([len(token) for token in tokens]) context_length = min(max_len - add_extra_token, context_length) # truncate and padding @@ -251,6 +248,7 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in - dict: The processed sources dictionary after applying multimodal preprocessing steps. 
""" is_multimodal = multimodal_cfg['is_multimodal'] + model_type = multimodal_cfg['model_type'] media_type = multimodal_cfg['media_type'] image_token_len = cur_token_len if media_type == 'image': @@ -268,11 +266,10 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in num_patches *= multimodal_cfg['num_frames'] if multimodal_cfg['use_im_start_end']: - replace_token = DEFAULT_IMAGE_PATCH_TOKEN * num_patches + replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * num_patches else: - replace_token = DEFAULT_IMAGE_PATCH_TOKEN * (num_patches - 2) - - replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * (num_patches - 2) + replace_token = DEFAULT_IM_START_TOKEN[model_type] + replace_token + DEFAULT_IM_END_TOKEN[model_type] for source in sources: conversation = source['conversations'] @@ -295,7 +292,103 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in return sources -def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_llama_3( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """ + Preprocesses sources for the LLaMA 3 model configuration. + + The function applies prompt templates and tokenizes the conversations according to the LLaMA 2 model specifications. + It involves special handling of tokens, masking of labels, and adjustments based on configuration settings. + + Parameters: + - sources (dict): A dictionary of sources containing conversations to be processed. + - tokenizer: The tokenizer to be used for processing the text. + - cfg: Configuration settings for preprocessing, including context length and additional tokens. + + Returns: + - Dict: A dictionary containing tokenized and labeled data suitable for the LLaMA 2 model. + This includes tokens, labels, and any special processing as defined in the configuration. 
+ """ + conv = conversation_lib.conv_llava_llama_3.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + source = source['conversations'] + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + add_extra_token = cfg.get("add_extra_token") + + # Tokenize conversations + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + labels = tokens.clone().detach() + # Mask labels + sep = "<|start_header_id|>assistant<|end_header_id|>\n\n" # part sep + round_sep = "<|start_header_id|>user<|end_header_id|>\n\n" + for conversation, target in zip(conversations, labels): + # the first match of round sep is going to be the one after system, which is not the intended behavior + rounds = conversation.split(round_sep) + rounds = [round_sep.join(rounds[:2])] + rounds[2:] + cur_len = 0 + for i, rou in enumerate(rounds): + + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if i == 0: + round_len = len(tokenizer.text_to_ids(rou)) + instruction_len = len(tokenizer.text_to_ids(parts[0])) + else: + round_len = len(tokenizer.text_to_ids(round_sep + rou)) + instruction_len = len(tokenizer.text_to_ids(round_sep + parts[0])) + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + # Check if masking working correctly + # print([x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())]) + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + +def preprocess_llama_2( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the LLaMA 2 model configuration. @@ -379,10 +472,17 @@ def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_v1( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the Vicuna V1 model configuration. 
@@ -462,10 +562,17 @@ def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nvgpt( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -503,9 +610,9 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: if i % 2 == 1: turn['from'] = conv.roles[1] if 'label' not in turn: - turn[ - 'label' - ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + turn['label'] = ( + "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + ) value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value'] conv.append_message(turn['from'], value) if not turn["value"]: @@ -567,10 +674,17 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nv_dpo( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -666,10 +780,17 @@ def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_plain(sources, tokenizer, cfg,) -> Dict: +def preprocess_plain( + sources, + tokenizer, + cfg, +) -> Dict: """ Preprocesses plain text sources (no template) for tokenization and label generation. 
@@ -717,7 +838,10 @@ def preprocess_plain(sources, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) class LazySupervisedDataset(Dataset): @@ -865,20 +989,45 @@ def expand2square(pil_img, background_color): ) else: - logging.warning("media not found in sources") media_tensors = torch.tensor([]) sources = copy.deepcopy(sources) if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nvgpt( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nv_dpo( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "v1": - data_dict = preprocess_v1(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_v1( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_llama_2( + sources, + self.tokenizer, + self.multimodal_cfg, + ) + elif self.conv_template == "llama_3": + data_dict = preprocess_llama_3( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "plain": - data_dict = preprocess_plain(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_plain( + sources, + self.tokenizer, + self.multimodal_cfg, + ) else: raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") @@ -981,7 +1130,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: tokens = batch['tokens'] labels = batch['labels'] - media_type = model_cfg.data.get('media_type') + media_type = model_cfg.data.get('media_type', 'image') if media_type == 'image': media = batch.get('image') elif media_type == 'video': @@ -1048,7 +1197,12 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: ) else: # TODO(yuya): Fix this hard-code for our own CLIP - image_processor = image_transform(crop_size, is_train=False, mean=None, std=None,) + image_processor = image_transform( + crop_size, + is_train=False, + mean=None, + std=None, + ) train_dataset = NevaDataset( tokenizer=tokenizer, @@ -1056,6 +1210,7 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: multimodal_cfg=dict( is_multimodal=data_cfg.is_multimodal, sep_image_conv_front=data_cfg.sep_image_conv_front, + model_type=mm_cfg.llm.get("model_type", "nvgpt"), conv_template=data_cfg.get("conv_template", "nvgpt"), crop_size=crop_size, image_token_len=data_cfg.image_token_len, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 7192a1b018b1..e33cf267c230 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -78,6 +78,7 @@ from megatron.core import InferenceParams, dist_checkpointing, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint HAVE_MEGATRON_CORE = True @@ -91,7 +92,11 @@ class 
FrozenCLIPVisionTransformer(CLIPVisionTransformer): def __init__(self, model_cfg, model_parallel_config, pre_process=True, post_process=True): super().__init__( - model_cfg, model_parallel_config, pre_process=pre_process, post_process=post_process, skip_head=True, + model_cfg, + model_parallel_config, + pre_process=pre_process, + post_process=post_process, + skip_head=True, ) self.frozen = False self.dtype = self.config.params_dtype @@ -235,6 +240,15 @@ def replace_media_embeddings(self, input_ids, inputs_embeds, media): return updated_input_embeds + def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), **kwargs): + sharded_state_dict = super().sharded_state_dict(prefix=prefix, sharded_offsets=sharded_offsets, **kwargs) + + state_dict = self.state_dict(prefix='', keep_vars=True) + state_dict.pop('weight') + # duplicate everything else + sharded_state_dict.update(make_sharded_tensors_for_checkpoint(state_dict, prefix=prefix)) + return sharded_state_dict + class NevaBaseModel: """ @@ -245,7 +259,12 @@ class NevaBaseModel: """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): self.mm_cfg = mm_cfg self.media_start_id = media_start_id @@ -264,7 +283,8 @@ def __init__( # Initialize vision encoder and freeze it if mm_cfg.vision_encoder.from_hf: vision_encoder = CLIPVisionModel.from_pretrained( - mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, + mm_cfg.vision_encoder.from_pretrained, + torch_dtype=torch.bfloat16, ).cuda() vision_encoder = vision_encoder.to(torch.bfloat16) if mm_cfg.vision_encoder.freeze: @@ -385,7 +405,12 @@ class MCoreNevaModel(MCoreGPTModel, NevaBaseModel): """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): MCoreGPTModel.__init__(self, **kwargs) NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) @@ -400,11 +425,17 @@ def freeze_llm(self, mm_cfg): else: output_layer_parameters = {} - for param in chain(embedding_parameters, self.decoder.parameters(), output_layer_parameters,): + for param in chain( + embedding_parameters, + self.decoder.parameters(), + output_layer_parameters, + ): param.requires_grad = False def forward( - self, *args, **kwargs, + self, + *args, + **kwargs, ): media = kwargs.pop('media', None) if parallel_state.is_pipeline_first_stage(ignore_virtual=True): @@ -421,7 +452,12 @@ class NevaModel(GPTModel, NevaBaseModel): """ def __init__( - self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs, + self, + mm_cfg, + media_start_id, + media_end_id, + mcore_gpt, + **kwargs, ): GPTModel.__init__(self, **kwargs) NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) @@ -431,7 +467,9 @@ def freeze_llm(self, mm_cfg): param.requires_grad = False def forward( - self, *args, **kwargs, + self, + *args, + **kwargs, ): media = kwargs.pop('media', None) if parallel_state.is_pipeline_first_stage(ignore_virtual=True): @@ -455,7 +493,7 @@ def init_neva_adapter(self): adapter_type=self.cfg.mm_cfg.get("mm_mlp_adapter_type", "linear"), in_features=self.cfg.mm_cfg.vision_encoder.hidden_size, out_features=self.cfg.hidden_size, - bias=True, + bias=True, # self.cfg.get("bias", False), ) for name, module in self.named_modules(): self._check_and_add_adapter( @@ -471,8 +509,10 @@ def init_neva_adapter(self): def model_provider_func(self, pre_process, 
post_process): """Model depends on pipeline paralellism.""" - media_start_id = self.tokenizer.token_to_id(DEFAULT_IM_START_TOKEN) - media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN) + + model_type = self.cfg.mm_cfg.llm.get("model_type", "nvgpt") + media_start_id = self.tokenizer.token_to_id(DEFAULT_IM_START_TOKEN[model_type]) + media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN[model_type]) if self.mcore_gpt: if not parallel_state.is_initialized(): @@ -581,6 +621,13 @@ def setup_optimizer_param_groups(self): else: MegatronGPTModel.setup_optimizer_param_groups(self) + # TODO(yuya): Refactor the handling of distributed checkpoint optimizer state loading + # With Pipeline Parallelism (PP) greater than 1, different stages might have varying lengths for `self._optimizer_param_groups`. + # This inconsistency can lead to errors during the loading of distributed checkpoints. + # As a temporary workaround, if `self._optimizer_param_groups` has less than 2 groups, add an empty parameter group marked as non-expert. + if len(self._optimizer_param_groups) < 2 and not self.use_peft: + self._optimizer_param_groups = (self._optimizer_param_groups[0], {'params': [], 'is_expert': False}) + # filter out params doesn't have grad for param_group in self._optimizer_param_groups: params_with_grad = [param for param in param_group['params'] if param.requires_grad] @@ -640,7 +687,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): grad_sync_func = None param_sync_func = None if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters @@ -698,9 +748,9 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): def training_step(self, dataloader_iter): """ - We pass the dataloader iterator function to the micro-batch scheduler. - The input batch to each micro-batch is fetched using the dataloader function - in the micro-batch fwd function. + We pass the dataloader iterator function to the micro-batch scheduler. + The input batch to each micro-batch is fetched using the dataloader function + in the micro-batch fwd function. """ return MegatronGPTModel.training_step(self, dataloader_iter) @@ -903,7 +953,7 @@ def loss_func(self, loss_mask, output_tensor): return loss def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. 
Args: @@ -981,7 +1031,10 @@ def build_train_valid_test_datasets(self): self._train_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) self._validation_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) else: - ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) + ds_dict = make_supervised_data_module( + tokenizer=self.tokenizer, + model_cfg=self.cfg, + ) self._train_ds = ds_dict["train_dataset"] self._validation_ds = ds_dict["eval_dataset"] @@ -1049,10 +1102,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: def setup_test_data(self, cfg): pass - def state_dict(self, destination=None, prefix='', keep_vars=False): - # Get the original state dictionary - original_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - + def get_keys_to_keep(self): keys_to_keep = list(self.adapter_keys) # TODO(yuya): maybe not hard-code vision_encoder keys here vision_encoder_keys = [k for k in self.base_keys if "vision_encoder" in k] @@ -1061,6 +1111,12 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): keys_to_keep += llm_keys if not self.cfg.mm_cfg.vision_encoder.freeze: keys_to_keep += vision_encoder_keys + return keys_to_keep + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # Get the original state dictionary + original_state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + keys_to_keep = self.get_keys_to_keep() new_state_dict = {k: original_state_dict[k] for k in keys_to_keep} return new_state_dict @@ -1079,10 +1135,46 @@ def load_state_dict(self, state_dict, strict=False): logging.critical(f'Unexpected keys: \n{unexpected_keys}') def on_load_checkpoint(self, checkpoint) -> None: - pass + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint + """ + + # mcore uses distributed checkpointing + # FSDP supports the lagecy checkpointing or torch-FSDP-native sharded checkpointing + if self.mcore_gpt and not self.use_fsdp: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." 
but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + module.load_state_dict(checkpoint_state_dict, strict=False) + else: + # when restoring a distributed checkpoint from a ptl checkpoint we need to defer loading the state_dict + # see NLPModel.on_load_checkpoint + checkpoint['state_dict'] = {} + + # legacy checkpointing for interleaved + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def sharded_state_dict(self, prefix: str = ''): - return None + if self.use_peft: + return None + + original_sharded_state_dict = super().sharded_state_dict() + keys_to_keep = self.get_keys_to_keep() + new_sharded_state_dict = {k: original_sharded_state_dict[k] for k in keys_to_keep} + return new_sharded_state_dict def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: inference_config = self.get_inference_config() @@ -1111,7 +1203,11 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] return generate(self, **inference_config) def generate( - self, input_prompts, inference_config, length_params: LengthParam, sampling_params: SamplingParam = None, + self, + input_prompts, + inference_config, + length_params: LengthParam, + sampling_params: SamplingParam = None, ) -> OutputType: # check whether the DDP is initialized diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index f9d6ed5250f6..70dd2174a2b7 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -344,22 +344,6 @@ def create_neva_model_and_processor(cfg): # trainer required for restoring model parallel models trainer = Trainer(plugins=plugins, strategy=NLPDDPStrategy(), **cfg.trainer) - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronNevaModel.restore_from( - restore_path=cfg.neva_model_file, - trainer=trainer, - return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size @@ -385,6 +369,8 @@ def create_neva_model_and_processor(cfg): neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None) neva_cfg.apply_rope_fusion = False neva_cfg.fp8 = False + neva_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size + neva_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size # neva_cfg.mm_cfg.vision_encoder.from_pretrained = None model = MegatronNevaModel.restore_from( diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index fd32ac844274..44a80465c34b 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py 
@@ -69,7 +69,11 @@ def forward_step(self, batch, tensor_shape): fwd_bwd_function = get_forward_backward_func() output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -104,7 +108,7 @@ def tokenize_batch(self, sentences, max_len, add_BOS): @abc.abstractclassmethod def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length + """clip the max len based on the LM model max sequence length Args: maxlen (int): the max len computed from the context and number of tokens to generate returns (int): @@ -119,7 +123,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ context_length (int): the context token length compute_attention_mask: bool: set to True to compute attention mask (not needed for FA) Args: - context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated + context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated """ pass @@ -262,7 +266,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -336,7 +340,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -390,7 +394,11 @@ def forward_step(self, batch, tensor_shape_and_context_length): output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -406,6 +414,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c from nemo.collections.multimodal.data.neva.neva_dataset import ( DEFAULT_IMAGE_TOKEN, preprocess_llama_2, + preprocess_llama_3, preprocess_multimodal, preprocess_nv_dpo, preprocess_nvgpt, @@ -415,10 +424,18 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c list_data_dict = [] if multimodal_cfg["conv_template"] in ["nvgpt", "nv_steerlm", "nv_dpo"]: record = { - 'system': '\n' - if multimodal_cfg["conv_template"] == 'nv_dpo' - else 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n', - 'conversations': [{'from': 'User', 'value': prompt}, {'from': 'Assistant', 'value': '',},], + 'system': ( + '\n' + if multimodal_cfg["conv_template"] == 'nv_dpo' + else 'A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n' + ), + 'conversations': [ + {'from': 'User', 'value': prompt}, + { + 'from': 'Assistant', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -441,7 +458,16 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c elif multimodal_cfg["conv_template"] == "llama_2": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -453,9 +479,40 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "llama_3": + record = { + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], + } + + for turn in record['conversations']: + if turn.get('value') is not None: + turn['value'] = re.sub('', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value']) + list_data_dict.append(record) + sources = preprocess_multimodal( + copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents + ) # HARDCODED FOR NOW + data_dict = preprocess_llama_3(sources, tokenizer, multimodal_cfg) elif multimodal_cfg["conv_template"] == "v1": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -487,6 +544,7 @@ def __init__(self, model): is_multimodal=self.data_cfg.is_multimodal, sep_image_conv_front=self.data_cfg.sep_image_conv_front, conv_template=self.data_cfg.get("conv_template", "nvgpt"), + model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), image_token_len=self.data_cfg.image_token_len, image_folder=self.data_cfg.image_folder, image_aspect_ratio=self.data_cfg.image_aspect_ratio, @@ -499,7 +557,7 @@ def __init__(self, model): ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.cfg.encoder_seq_length + 1: maxlen = self.model.cfg.encoder_seq_length + 1 return maxlen @@ -616,7 +674,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.frozen_model.cfg.encoder_seq_length + 1: maxlen = self.model.frozen_model.cfg.encoder_seq_length + 1 return maxlen @@ -681,7 +739,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -830,21 +888,21 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ # updating RetroEncoder (RetroEncoderCrossAttention, 
RetroEncoderBiasDropoutAdd, RetroEncoderLayerNorm) if contain_encoder: # the first cross-attention decoder layer contain encoder - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_num_neighbors = inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_chunk_length = inference_retro_chunk_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_retrieved_length = inference_retro_retrieved_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attn_bda.retro_num_neighbors = inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].pre_mlp_layernorm.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.encoder.layers[0].cross_attention.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_chunk_length = ( + inference_retro_chunk_length + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_retrieved_length = ( + inference_retro_retrieved_length + ) + layer.cross_attention.encoder.layers[0].cross_attn_bda.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].pre_mlp_layernorm.retro_num_neighbors = ( + inference_retro_num_neighbors + ) contain_encoder = False return context_tokens diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 850eb3d5c778..722c493dfa9b 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -151,6 +151,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_params, inference_config, **strategy_args): + model_type = model.cfg.mm_cfg.llm.get("model_type", "nvgpt") conv_template = model.cfg.data.get("conv_template", "nvgpt") final_response = [] for idx, prompt_dict in enumerate(prompt_dict_list): @@ -180,8 +181,14 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para continue # Regular expression pattern to match the sequence - pattern = re.compile(rf'{DEFAULT_IM_START_TOKEN}( ⁇ )+{DEFAULT_IM_END_TOKEN}') - pattern_nvgpt = re.compile(rf'{DEFAULT_IM_START_TOKEN}({DEFAULT_IMAGE_PATCH_TOKEN})+{DEFAULT_IM_END_TOKEN}') + pattern = re.compile( + rf'{DEFAULT_IM_START_TOKEN[model_type]}( ⁇ )+{DEFAULT_IM_END_TOKEN[model_type]}'.replace(r'|', r'\|') + ) + pattern_nvgpt = re.compile( + rf'{DEFAULT_IM_START_TOKEN[model_type]}({DEFAULT_IMAGE_PATCH_TOKEN[model_type]})+{DEFAULT_IM_END_TOKEN[model_type]}'.replace( + r'|', r'\|' + ) + ) combined_pattern = re.compile(f'{pattern.pattern}|{pattern_nvgpt.pattern}') clean_text = re.sub(combined_pattern, f"<{media_type_token}>", response['sentences'][0]) @@ -199,6 +206,9 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para clean_response = clean_response.split("")[-2][10:] # [10:] for removing "Assistant\n" elif conv_template == "llama_2": clean_response = clean_response.rsplit("[/INST] ", 1)[-1] + elif conv_template == "llama_3": + clean_response = clean_response.rsplit("assistant<|end_header_id|>\n\n", 1)[-1] + clean_response = clean_response.rstrip("<|eot_id|>") elif conv_template == "v1": clean_response = clean_response.rsplit("ASSISTANT: ", 1)[-1] @@ -287,17 +297,17 @@ def tab_logits(logits, min_id, max_id, filter_value=-float('Inf')): def 
top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): """ - This function has been mostly taken from huggingface conversational - ai code at - https://medium.com/huggingface/how-to-build-a-state-of-the-art- - conversational-ai-with-transfer-learning-2d818ac26313 - - @param logits: logits tensor - @param top_k: keep only top k tokens with highest probability - @param top_p: keep the top tokens with cumulative probability - @filter_value: value to set filtered tokens to - @started: a tensor of bools indicating whether the text generation starts for the batch - returns the filtered logits + This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313 + + @param logits: logits tensor + @param top_k: keep only top k tokens with highest probability + @param top_p: keep the top tokens with cumulative probability + @filter_value: value to set filtered tokens to + @started: a tensor of bools indicating whether the text generation starts for the batch + returns the filtered logits """ if top_k > 0: # Remove all tokens with a probability less than the @@ -333,7 +343,7 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started def repetition_penalty(logits, repetition_penalty, used_tokens): - """ Implement the repetition penalty, check paper + """Implement the repetition penalty, check paper https://arxiv.org/pdf/1909.05858.pdf """ if used_tokens is not None and repetition_penalty != 1.0: diff --git a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 5da7296519cb..1a5321065fa9 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -12,26 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile from typing import List, Optional, Union import torch -from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.nlp.models.nlp_model import NLPModel -from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin -from nemo.collections.nlp.parts.peft_config import ( - PEFT_CONFIG_MAP, - CanonicalAdaptersPEFTConfig, - LoraPEFTConfig, - PEFTConfig, - PtuningPEFTConfig, -) +from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin, replace_prefix +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP, PEFTConfig, PtuningPEFTConfig from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin -from nemo.core.connectors.save_restore_connector import SaveRestoreConnector from nemo.utils import logging, model_utils -from nemo.utils.model_utils import inject_model_parallel_rank try: from megatron.core import parallel_state @@ -46,7 +35,9 @@ class MultimodalAdapterModelMixin(NLPAdapterModelMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def _get_all_keys(self,): + def _get_all_keys( + self, + ): # TODO (yuya): p-tuning need additional handle, check peft models. 
""" Returns all the keys in the model @@ -57,35 +48,50 @@ def _get_all_keys(self,): return set(k) def add_adapter(self, peft_cfgs: Union[PEFTConfig, List[PEFTConfig]]): + if self.cfg.get('virtual_pipeline_model_parallel_size', None): + raise ValueError('Virtual pipeline model parallel is not supported when using PEFT') + if self.cfg.optim.name == "distributed_fused_adam": + raise ValueError('distributed_fused_adam is not supported for PEFT. Please use fused_adam') + + self.use_peft = True if not isinstance(peft_cfgs, List): peft_cfgs = [peft_cfgs] + # @chcui crucial to set self.virtual_tokens and self.use_peft for all PP ranks + for peft_cfg in peft_cfgs: + if isinstance(peft_cfg, PtuningPEFTConfig): + self.virtual_tokens = peft_cfg.virtual_tokens + ptuning_only = len(peft_cfgs) == 1 and isinstance(peft_cfgs[0], PtuningPEFTConfig) + self.ptuning_only_and_non_first_stage = ptuning_only and not self.first_stage_of_pipeline() + if self.ptuning_only_and_non_first_stage: + # There are no params to add if we are not in the first state of the pipeline + return + self.base_keys = getattr(self, "base_keys", self._get_all_keys()) logging.info(f"Before adding PEFT params:\n{self.summarize()}") - self.use_ptuning_only = len(peft_cfgs) == 1 and isinstance(peft_cfgs[0], PtuningPEFTConfig) - for peft_cfg in peft_cfgs: - if self.use_ptuning_only: - if not self.first_stage_of_pipeline(): - # There are no params to add if we are not in the first state of the pipeline - continue - self.virtual_tokens = peft_cfg.virtual_tokens - self._check_and_add_peft_cfg(peft_cfg) logging.info(f"After adding PEFT params:\n{self.summarize()}") self.adapter_keys = self._get_all_keys() - self.base_keys - if self.megatron_amp_O2: - self.adapter_keys = set(key.replace("model.module.", "model.", 1) for key in self.adapter_keys) + self.tunable_base_param_keys = set() for cfg in peft_cfgs: - if cfg.weight_tying: + if hasattr(cfg, "weight_tying") and cfg.weight_tying: self.tie_weights(cfg) - self.use_peft = True + + if hasattr(cfg, "tunable_base_param_names") and cfg.tunable_base_param_names: + self.set_tunable_base_params(cfg) + + if self.megatron_amp_O2: + self.adapter_keys = set(key.replace("model.module.", "model.", 1) for key in self.adapter_keys) def load_adapters( - self, filepath: str, peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, map_location: str = None, + self, + filepath: str, + peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, + map_location: str = None, ): """ Utility method that restores only the adapter module(s), and not the entire model itself. @@ -110,22 +116,27 @@ def load_adapters( else: map_location = 'cpu' - if filepath.endswith('.nemo'): - conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) - elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] - else: - raise RuntimeError(f"{filepath} is not nemo file or ckpt file") + # TODO (yuya): this logic needs to change for dist ckpt because after + # adding adapaters the checkpoint will change if not peft_cfgs: assert filepath.endswith( '.nemo' ), "Inferring peft scheme is only supported for .nemo checkpoints. Please supply the `peft_cfgs` argument." 
peft_cfgs = [PEFT_CONFIG_MAP[conf.peft.peft_scheme](conf)] self.add_adapter(peft_cfgs) - assert set(state_dict.keys()) == self.adapter_keys - - if self.megatron_amp_O2: - state_dict = {k.replace("model.", "model.module.", 1): v for k, v in state_dict.items()} + if filepath.endswith('.nemo'): + sharded_state_dict = None + if getattr(self, "sharded_state_dict", None) is not None: + sharded_state_dict = self.sharded_state_dict(prefix="model.") + conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location, sharded_state_dict) + elif filepath.endswith('.ckpt'): + state_dict = torch.load(filepath, map_location)['state_dict'] + else: + raise RuntimeError(f"{filepath} is not nemo file or ckpt file") + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} + if not self.ptuning_only_and_non_first_stage: + assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) missing_keys, unexpected_keys = NLPModel.load_state_dict(self, state_dict, strict=False) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 123f0f06a33d..ca5820772c62 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -30,6 +30,7 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import PromptEncoderAdapterConfig +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.collections.nlp.parts.peft_config import ( PEFT_CONFIG_MAP, CanonicalAdaptersPEFTConfig, @@ -38,11 +39,13 @@ PtuningPEFTConfig, ) from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin -from nemo.core.connectors.save_restore_connector import SaveRestoreConnector from nemo.utils import logging, model_utils try: - from megatron.core import parallel_state + from megatron.core import dist_checkpointing, parallel_state + + HAVE_MEGATRON_CORE = True + except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False @@ -56,7 +59,7 @@ def replace_prefix(name, old_prefix, new_prefix): class NLPAdapterModelMixin: - """ NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. + """NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. This mixin class should be used only with a top level ModelPT subclass, that includes either a `model` or an `enc_dec_model` submodule. This mixin class adds several utility methods to add, load and save adapters. @@ -92,7 +95,9 @@ def first_stage_of_pipeline(self): logging.warning("no attribute named model or no model.pre_process found. 
Can not detect stage of pipeline...") return False - def _get_all_keys(self,): + def _get_all_keys( + self, + ): """ Returns all the keys in the model """ @@ -216,15 +221,18 @@ def add_adapter(self, peft_cfgs: Union[PEFTConfig, List[PEFTConfig]]): if hasattr(cfg, "tunable_base_param_names") and cfg.tunable_base_param_names: self.set_tunable_base_params(cfg) - def _get_config_and_state_dict_from_nemo(self, filepath, map_location): + def _get_config_and_state_dict_from_nemo(self, filepath, map_location, sharded_state_dict=None): cwd = os.getcwd() + save_restore_connector = NLPSaveRestoreConnector() with tempfile.TemporaryDirectory() as tmpdir: try: - SaveRestoreConnector._unpack_nemo_file(filepath, tmpdir) + if os.path.isfile(filepath): + save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir) + else: + tmpdir = filepath os.chdir(tmpdir) - config_yaml = "model_config.yaml" model_weights_ckpt = "model_weights.ckpt" @@ -233,7 +241,22 @@ def _get_config_and_state_dict_from_nemo(self, filepath, map_location): os.chdir(cwd) model_weights = os.path.join(tmpdir, model_weights_ckpt) model_weights = inject_model_parallel_rank(model_weights) - state_dict = torch.load(model_weights, map_location=map_location) + state_dict = save_restore_connector._load_state_dict_from_disk( + model_weights, map_location=map_location + ) + + # distributed checkpointing + if state_dict is None and sharded_state_dict is not None: + checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) + tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] + assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' + checkpoint = dist_checkpointing.load( + sharded_state_dict=checkpoint, + checkpoint_dir=tmp_model_weights_dir, + ) + state_dict = checkpoint["state_dict"] + return conf, state_dict finally: os.chdir(cwd) @@ -271,7 +294,10 @@ def setup_optimizer_param_groups(self): super().setup_optimizer_param_groups() def load_adapters( - self, filepath: str, peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, map_location: str = None, + self, + filepath: str, + peft_cfgs: Optional[Union[PEFTConfig, List[PEFTConfig]]] = None, + map_location: str = None, ): """ Utility method that restores only the adapter module(s), and not the entire model itself. diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index f50a467cf71a..e8f7009b791c 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -1236,7 +1236,9 @@ def dummy(): tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
checkpoint_io = DistributedCheckpointIO.from_config(conf) - checkpoint = checkpoint_io.load_checkpoint(tmp_model_weights_dir, sharded_state_dict=checkpoint) + checkpoint = checkpoint_io.load_checkpoint( + tmp_model_weights_dir, sharded_state_dict=checkpoint, strict=strict + ) instance.on_load_checkpoint(checkpoint) if hasattr(instance, 'setup_transformer_engine_tp_groups'): instance.setup_transformer_engine_tp_groups() diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 905de4eb3567..b95be90274e3 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil from abc import ABC, abstractmethod from contextlib import contextmanager @@ -29,6 +30,8 @@ try: from megatron.core import dist_checkpointing + from megatron.core.dist_checkpointing.dict_utils import extract_matching_values + from megatron.core.dist_checkpointing.mapping import ShardedBase from megatron.core.dist_checkpointing.strategies import tensorstore from nemo.utils.callbacks.torch_dist_async import AsyncCallsQueue, AsyncRequest, TorchDistAsyncSaveShardedStrategy @@ -234,7 +237,11 @@ def save_checkpoint( @_debug_time('DistributedCheckpointIO.load_checkpoint') def load_checkpoint( - self, path: _PATH, map_location: Optional[Any] = None, sharded_state_dict: Dict[str, Any] = None + self, + path: _PATH, + map_location: Optional[Any] = None, + sharded_state_dict: Dict[str, Any] = None, + strict: Optional[bool] = True, ) -> Dict[str, Any]: """Loads a distributed checkpoint. @@ -259,10 +266,37 @@ def load_checkpoint( else: sharded_strategy = None + if not strict: + sharded_state_dict = self.adjust_non_strict_load(path, sharded_state_dict) + return dist_checkpointing.load( sharded_state_dict=sharded_state_dict, checkpoint_dir=path, sharded_strategy=sharded_strategy ) + def adjust_non_strict_load(self, path: _PATH, sharded_state_dict: Dict[str, Any]): + ckpt_sharded_metadata = dist_checkpointing.load_tensors_metadata(path) + loaded_keys = [] + missing_keys = [] + unexpected_keys = [] + + def should_remove_missing_sharded_base(x: Any): + if isinstance(x, ShardedBase): + if x.key in ckpt_sharded_metadata: + loaded_keys.append(x.key) + return False + else: + unexpected_keys.append(x.key) + return True + return False + + _, sharded_state_dict = extract_matching_values(sharded_state_dict, should_remove_missing_sharded_base) + logging.info(f'The following keys are not in the checkpoint and will not be loaded: {unexpected_keys}') + + # TODO: compute missing_keys by: + # 1. all_gather_object of loaded_keys + # 2. missing_keys = ckpt_sharded_metadata.keys() - loaded_keys + return sharded_state_dict + @_debug_time('DistributedCheckpointIO.remove_checkpoint') def remove_checkpoint(self, path: _PATH) -> None: """Remove a distributed checkpoint. 
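A note on the non-strict loading path added above: before handing the sharded state dict to `dist_checkpointing.load`, `adjust_non_strict_load` drops every sharded object whose key is absent from the checkpoint's tensor metadata and logs the dropped keys. The snippet below is a minimal, self-contained sketch of that pruning step; it stands in plain nested dicts and a hypothetical `ckpt_keys` set for the real `ShardedBase` objects and `load_tensors_metadata` result, so it illustrates the idea rather than the actual Megatron-Core call.

from typing import Any, Dict, List, Set

def prune_missing(sharded: Dict[str, Any], ckpt_keys: Set[str], unexpected: List[str]) -> Dict[str, Any]:
    """Keep only entries whose key exists in the checkpoint metadata (non-strict load)."""
    pruned = {}
    for name, value in sharded.items():
        if isinstance(value, dict):
            pruned[name] = prune_missing(value, ckpt_keys, unexpected)
        elif name in ckpt_keys:
            pruned[name] = value
        else:
            unexpected.append(name)
    return pruned

# Example: an adapter weight present in the checkpoint is kept, an unknown key is skipped and reported.
state = {'model': {'adapter.lora_a': 'sharded-tensor-A', 'new_head.bias': 'sharded-tensor-B'}}
ckpt_keys = {'adapter.lora_a'}
unexpected: List[str] = []
print(prune_missing(state, ckpt_keys, unexpected))  # {'model': {'adapter.lora_a': 'sharded-tensor-A'}}
print(unexpected)                                   # ['new_head.bias']

The design choice in the patch is to make restore tolerant of state-dict entries the stored checkpoint does not contain: they are skipped and logged rather than failing the whole load.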
From fe7e2e5767940d3aa114a161072f6905fa3b8057 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Wed, 22 May 2024 11:41:25 -0400 Subject: [PATCH 08/47] RAG Pipeline (#9143) * first commit * working pipeline rag_indexing; rag_eval with rag.yaml * udpate RAG documentation * add image to documents * cleaning docs * before merge from main * refactor code to make it easier to support more customized embedder and LLMs in future * addressing Ali's comments * addressing Ali's comments * addressing Ali's comments * fix Code scanning results / CodeQL --------- Co-authored-by: Huy Vu2 --- examples/nlp/rag/conf/rag_generating.yaml | 37 +++++ examples/nlp/rag/conf/rag_indexing.yaml | 19 +++ examples/nlp/rag/images/rag_pipeline.png | Bin 0 -> 86786 bytes examples/nlp/rag/rag.md | 141 +++++++++++++++++ examples/nlp/rag/rag_generating.py | 49 ++++++ examples/nlp/rag/rag_indexing.py | 44 ++++++ nemo/collections/nlp/models/rag/__init__.py | 16 ++ .../nlp/models/rag/custom_bert_embedder.py | 145 ++++++++++++++++++ .../nlp/models/rag/custom_gpt_llm.py | 130 ++++++++++++++++ 9 files changed, 581 insertions(+) create mode 100644 examples/nlp/rag/conf/rag_generating.yaml create mode 100644 examples/nlp/rag/conf/rag_indexing.yaml create mode 100644 examples/nlp/rag/images/rag_pipeline.png create mode 100644 examples/nlp/rag/rag.md create mode 100644 examples/nlp/rag/rag_generating.py create mode 100644 examples/nlp/rag/rag_indexing.py create mode 100644 nemo/collections/nlp/models/rag/__init__.py create mode 100644 nemo/collections/nlp/models/rag/custom_bert_embedder.py create mode 100644 nemo/collections/nlp/models/rag/custom_gpt_llm.py diff --git a/examples/nlp/rag/conf/rag_generating.yaml b/examples/nlp/rag/conf/rag_generating.yaml new file mode 100644 index 000000000000..dcd86b1b220e --- /dev/null +++ b/examples/nlp/rag/conf/rag_generating.yaml @@ -0,0 +1,37 @@ + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 'bf16-mixed' + use_distributed_sampler: False + +indexing: + embedder: + model_type: bert + model_path: null + embed_batch_size: 128 + data: + data_path: null + chunk_size: 256 + chunk_overlap: 10 + index_path: null + +generating: + llm: + model_type: gpt + model_path: null + query: null + inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: True # add the bos token at the begining of the prompt + tokens_to_generate: 500 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated \ No newline at end of file diff --git a/examples/nlp/rag/conf/rag_indexing.yaml b/examples/nlp/rag/conf/rag_indexing.yaml new file mode 100644 index 000000000000..049afc1dbbfe --- /dev/null +++ b/examples/nlp/rag/conf/rag_indexing.yaml @@ -0,0 +1,19 @@ + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 'bf16-mixed' + use_distributed_sampler: False + +indexing: + embedder: + model_type: bert + model_path: null + embed_batch_size: 128 + data: + data_path: null + chunk_size: 256 + chunk_overlap: 10 + index_path: null \ No newline at end of file diff --git a/examples/nlp/rag/images/rag_pipeline.png b/examples/nlp/rag/images/rag_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..810ef254e857017679567c1547c9a51f29b6cddc GIT binary patch literal 86786 zcmeGDWl&se8wH5Qa0?J5xVr>`6C8rOyGzi2OA&gb{bUYi1=7cZW2 z#f1fwTyzc=5j=)3o?wnulLobFlkc#%*k%?~8JOVsHWkFlh!tXl;Wkg55egR+a|*O# zG;@-e4b1J5?-l|ZQs+zzO?JGOGR`>X=cY!=w$s4rt;<)oEz6LGgEXPD2QQuF(YuFP zZ)hTtzb}dGPqzude_vG6*bHw`|9wS^{Qp1xyBMhdx78rH8|5$84C<6T9#A3#%OiQ3 z%JMhGxJ_vIU+dlfBIn3u`@Oop=_#*SI{)GfQfIC^ls=0*eR+ZQ?>(a(*N33~ZU;#f zZ0G;`$AE=AYh60smWBF?ZbIPQa7;B=Drve}D%Kn~!{66>8J z=M0N3lEdK+kx*RTM%X{{s$HZK%ILthe^=}LjvPUyUiz|m-a&+$k)2~pS=Kcts+0G4 zt_t6)cZB%9Oh3r&d!9?};F+1~%yG4##|N(+v^0fC>!8TQD-o-jbYB=}Q2+7Lw9Q%f zSmxVx3mguPMBxjKC9}ms))Z9-wj-$n0a!L^>~F^T7qvq6h?*a;rSUGD3_=KXYp_fr zy)nt9GZydfN`GTjqbGhIID%jzgDwlgr}5oaIWM?~kKePulbpPhT^-H%Yrl4Jo1boU|K~E@cr@5I*VLMKbT!hU29L=DcXH}G1zn7{;N>3;!Eq7<9UAY zh!b5h*E`QV>y8F7Fdx#<8>@~uc4)n>X60jUOhUTg+2gl(i(vRO(bjNYLzPuCyYkdJ zvUL0ftSP*6rxL=JN8%UG=OzaEUN|h#ckKgWjo&7HWFNINUVY{gTvTGpy`|38n{mn zqqP5k{5yU+bvLST`EZ9?jqqv_M`yT!Ers&YTBz1=3nsR5S6%?>ukCG8r80&Ztu5p* z^5XqMj{mlYKY*X6$&SD7%?mYA;dJ&xl4+LKOK*V+&npAD`~Jm4fp5*2#n6XO`_UDb za($2FNu2XA+iSgjND5S;4p!(rD4Kbal$@D4s(c>TP8Rj#k7L9y)lR#muG3~1Q|iOy z)7}Ex_(D=( zo)xyixrV}~PN0dpY&XVy{=+7fQZLUpMR~6JKE`LXQG(zLgaU!N-0cg~`#3 zx!spaozSz_E7+;~;&NB{+Pdz`<=2K7vn!|a>r*jy&ks-e*=~AI$gjMT-eb{;9e*fs zW^NbcGP-Zl4l0*!_w}gDkP$$7bB-1nO6Uop^rdj;pqzBH_%; z>_aD(kqVcm;9Z=B?CY=0o+y7xm7pf>ZCo5HSGS}7nmp(=>!4Q7%_7=LP@_n=<6IK1 z15Vu!S9@c*7`wqkLq4xR>vL~%L&Qbg}%gKZcVeWo0>%7{f z#xSqh_HVQr5OjLWz`}p=&5ac*VkGxm!%l{{ZP~#X{95DFV}Zps{#Z@`HLRFU-zv`u z&-)Y>kq4HrMDEF=$a?+TiI*J7f)i%j36PcV^&nEb)E2_XUESO6cnUlBUBmOzi{N9@ zVdmRKg6>vsDaa7&lI$lp9l;uvQF6Ko;lD4H-pkH_ace5mJ?23c?8e;;zwl8_4#`JBn_Judvwt}VkK`=!*ijak3a@IsXNFs7s5-uBs zJAdITAYdm2}`qwmyyFfo&jFs3~MSD24WCmyNyJI;r zft&Jk)?NFGI3uUC?B_$(eLn$(;Zkv2t zXN_p)m431=?P}`=VMmyq-g28+_-?E^N3UTeAHz|QiY}O%BQ-*F9!BI#Zhupj;l&75JO3IT{G|TD&YJV> z>;2SpT;z;$)m&af63xDnZzy9UlMc5>@^rW#UwAIDd7=bDoOWIy;#9wR8Yl^4@u>Uj zJaN_73`raGAx?(8?fm62CtvwDsIRR~zK&w@;UUc=PGXgWwfS3KBAXVeRdJ2-#x=jX?uF1D_8R%?Z=CO3-Zf;LfxiJ`W}o*%)DKh z@3dm#g8tfkR$33JT}y}kCjl2OLLSQJE0Y@es)8>c7G6e>TVy+CL^8RL^_r)5aaV1lD+MAB8$>I8?Xjnq8j;_6flDVZ$d|vON>;zEs zQzCpG86GQluBrFg7>P)}9$AJTAjK#R*NJUyKN#~I`+dyA(F?`!jKLzyAVY1w{a`tI zfh0NX2;0d2c)rC6P-3`!r`itqgMoWmFmsy>dxpxyZ)N`$7V;x&Lr?<>qOFI1bwNno zajudT56w%--)1uvPmrpM>wm=QkpG?eemm%EmC~axGHffRZ+4UC@%l*pERidU;7WDB ztj&v>{hmbUzgLs|b9v3@CEMYA$UPhSnRC!isW8cCyXlAv5XI_3YlT!9O~C?xjbPfJ z2a76nvAq=mT7PRq4UEL`jy`115w}g6)uttAKmQeog$mC$5=0f^L?|V)+*s&XM)B_h zv(h?%;Vt=r?(c?OhUQVx2#Ipqei)&j1?LxZD}*mUDJNT9=ic2*68?ytc_TiJm? za-l=!nFq1^9Z-Sr|9)5aQuI~e{mq#@g%Ddm>#DEEP*M^$#(#>~=!a-MfmhIQI8+v! 
[... base85-encoded binary data for examples/nlp/rag/images/rag_pipeline.png (86786 bytes) omitted ...]
zS8pwOUUVUv0qZ4p9=-3Z)`t5ABDw(;gN2Wk?uFvOB7H5Ef`mKc?%0Gw#=TSFE%3mo z#OeoDC@ImhsRitz!LtR`RvYK>z>jrnuzDN#`16C= zo%?p~67KLW!#rZOMX}maQYECvyi(WSE@~_}f`z)(cf}n@Hcy#yEq*&U>P1df+D!O^ zAG7?jp}lf*H88yr7(=W}jKx}F5X7ulBo{(#XtO$_MKpGqqAa$HoWldxxbj?S4h;i) zU01{vPO6%~8!*}N;nukvqVuyKYvB`@@MT%CQgElr<%Rx>lR~w;NFWmVp;q~UiqB#B zPWih7g4%NepXayy(?0yR#1)Pmn-rTz6#srPa_s&Z&PdOLju?B-7mZLR+3qyEAVrHG zdeAaxmfktu>yZiz@Fl3>copLK{975H&jiO1;-I?uH;e4XXr~7cpK>Fi2pg+Mv-fO` z+_{*517DT?_*b&7drd+@8`2dx&S zGT0|nyKb&|G#0}R@q=PpMAiB6MYHUX*jDgtkz56J!&Y${gX7rnm4ynl-lC%2DSe}r z&8l}Kvz230i<=}nK(6~(eqy6rU@~oIdZrDNaSXy5Bws_;p`%5tdQpH4c70w4r+pKtRa=xqyD? z3*suEyk|znh_ZyY3TvCo`T;ko)KWQ*&ZhlREYV(^o zJAhD?_rtm&g1MH)A@c|6z({{B>3Z<2RQ&+L1Ah_$%E@0y?I3mJ&`mD|LsLp#6&H?OHOr-;5P9w(AP8cW$>cT_`wpQ^EOn%OO167}t1WI?xQm?SXYbzSeILzx zQu{l!B6)X(hu0+*`8Kf60DQ==`Grm8MJbb}EcVz)2bV@fUoBUDBxXN5 zX0%^_b#Q#UO1=>md)+0Mv<$8=uy1VSt=qxx)m&@NFK)LrS^w=u*_2H@le;;&p9XQY z5!k&aLh;Tv$V6NKH2un?gDtn4xB=#9(4?GSB_OVPd>rV$5$HG+2UCrL&gaM2%YaZO z{Qtv&y-I&=?H%YhfE18t_{+WfeJD|@UEzg&Njd45&0a?D!V zBCsBGbsXtS_el4alYIR-ij(Q38+xMF4+)eh&DzGFLmQiFncQ^{n`)eVC5sr$U4N^Fa;DrjMHE{)q?;dJo6Bn5I9D~>cU`h3 zA+~-WWg_xFV^K(7_$}Ifs)oDzgHvJ!+9_MrhZ5^^%ee32U1N-1Md65=jULe1SPM(E zw8iGbYrjENSmAWiS%DW7C`)xv_NZp%?dsOGvi5IL1xe-q@>y(W{!p1}Xc)6;(?Qc} zX(jdbX=2j(B95_ZZSQO4)^Qqpp`E`U0S^2(snq%@_lNJh=872x4R5mxZ7k9M(Io3X0MQk*(pV?L>9l=SEloFR)1Qew^b7uPe?66c1Bw{)7r<4I;^&(G(ng$J zSNfa68R>u4!7`5mGfac%@#rtv&=fA0ehl5mRSt>N7$xs;gd4fAFwXBD_FOMAHdpm1 zejR`MD*J|m-b-2Ev(U8oz4W1FKYTR~w1t{~&&hiq&~zUG@N;4!Ds@Wge=O`f@qb@L z{%34Z{`W-c|Ehd|aox?~9>}e6gpy&g?trH|6@-1!eebrN>KJFqXrwh`^!v1wx3>Tw zRNFtLoVHXs{ynPfbR>ZBUshBu8i7*Pm3wsq`DU&D*@wQp*{wo$|CwUG+I`{L8LLtY z?e4`GsA#v;Gg#M6OH<16NP%P)raHGbQSL_CW!krk0}z2&|H^Uq$GlGI)E9b3&5>aE z>3Z+>zHhHxy>3mQWS~=>;T|pR?Og(9hXdiuzF5*@m>+pfyTm`bl60HkGFzSe@N#@o zoT~@_ZnK7dn=Im>Z%60xWE2*jvH7QO?aYptKzc6&u!G(tGVkKt78j)AAyiaXkC~G) zekEULm{9c_$MoM58BZPjgvYi)Q6Zb31;!YFM65smLv6uWn(Z02D&lk9*u_8(`kjA$ z?i;lsz~AO4^qhKvfqspE`-Kj$uY8;2wziC#gVZmBF=rEO@1Y7joq@jps)S%eLFa0rP-nOeZ8ZQOm>r~f{*|B)E9jPkttP1ywWHJW__YP#RPW{74xD5s7oY3E zYji@643qI*ySDBC91~tN1ljGgbQcNF+5)hi=t>S0PQd)45}*7g|nc0oI~e?^0ucf{I?}CRufu)G+SH3uuod@klKi0$+~~7Lb;;~F8$XD zD6Ib{t)u_Hu0MnDf7BxV|NH#^8-4zFWib;v+p85{dUO=4UGEt2-Nwp;&6d{T{7 zA%8d1*7(?YA^Z*XIB&0hz+McG5;7_+7oHlSKF4EJ*C1kjD*}FwketD!R3hJAtvM-p zu<2=ay=#9^M^Sj_Ca{s4*-WI=L-50%q%|w=En<|Rm7R(Izn0k-;+Zk}OHl)*Irm1I zG!ozm)T`RNE%#XqdsFYTnwVCkGEr#;h~&_vY~UE-RaD^C&bui^Tz4bYe1~ z<}|d_G0~hk_e&-I5Ozr+HYf7r`PzIod8_I2Kdl<@Z5W62U=(q({#$!f4l^me@PP|7 zt5OHD!@#VG2&Hi^ew<+DekGP9SloE=i+_Q+Q29S%5mW1tAiHYXXr4UFeG=VNzKb$q ze3I~>xzZ8#)-7GCC(B-fIvZ}z|MV`3`kVDY?4q@G1IFKe(bt}NsyZ_RD4ls1ZwnN* zgWj@Qzl*f~SJt->v%>uA?Ahii<#BcBcMwPe6CsMB#eyTJX*J}ABRPYIKp%0Cdy~=E`CtYgYvH^@i#JSFR|olJ7?FL z8;}Qs!d3$t)&}OiXhRm>gAdZ)T@*Cz4tB^hXt3(Kz65;`D-D(qcOA^<2StHTu;t#T zHszG9@0$W#Nz=GwN8t$OL(XSC|9n-(JOz_NG=sI$#dEr^Ph;^e-!_h4Zelpfou4** z`ezYKvJ)W$@FC)7E$6SlNt{2&NQuCcz1*R5GT*kktEBQNPO(uBzol|Syi~?NcXJ*o zkJxo?inN`U6gfChFCWr%mvke?K58x5iJ8rdprGO{Q2*rp`dm`3HqqIkYq_oSW=Bk~H5=%Uzmv_)h5&1KAZamvF5~YO;{Lbem!01XvKV92y7X|~ZyFSv`Qel|i-E^%dU}1- zwMj)xG*!j8FRwS*c*Jw}FW>mSCRSx2wO^et8M&^VZ-mKCh>i87mFQmrx8QE_@Y8+! 
zuB~w>G&A{Lm5`Z5W=hxC(Rinuw(WVR$hC784Vk8MWUBq(po^38=DH4VlTzZA&AIG1 zxQFb<{JcZK3Ngj?_=10)FRNxDORZ?>sR-ck=xl3PI%XSh-u8`5-Qt7ibo1@7iC|^^ zo^Dtwc(AE)OyqT4;E4k=)%$pzO0g}``*X+bD@QWca?aNN$hR=4%6K<)jZ*q_zeGi=i~H`G=|;!o&X+_9i4a~H8ISCRm)ILO=Y#5d2z+s z6eqiK!%if5sfVo+Lbq@09DJ2?Rn00i}r^9Sn~=6mEXb(b0jLowf7Y zH1_)>4R~Ne{20QgWD3H}2u^%6R}dxE_EA&KbPL$o%=%swGxbxl7Q;4%t;*@HHoqDD zOb@&VPs#T4qQ=?LHBgT?8#gT`*Ntzrt5*xnE_pbv&q8WMf7GmF!j%yPn;xl^ug&|cb>y^i0`OSU0 z=IHVV-+|L*Jd=2V!~W~w13nE;Impu?BwKZlfJKJ!4@GNBNbp#V#~Y@Ga~^g*Q^?9@ zDPAz^#kN`gE}G0{THX_j#jr7_ZgOJRU<&>pB}p1USK4Ydr275N22XE#oQNFG>itml zdj3zPj&oPLEK8}xO`Z>cTfN)ZlK8!59>jw$W!NstTU;qp4;;<)sz0aN^Ot_*)KPAG zIKZ-YO4an;JfitCx!admuY{9quVjYEmx5V<``&)ia%JuC=&TZdYe?yI!Xf`Tz><~x zG4hWlV4WPqSV_W;yH=1}DX!cYpk6mzl?e4bmAESY!2?z=veuDBN7D*hGoKG8)+9Nf zv&hpqx2hD4pokmi$PZWLV|Ml~J6#b=_m4Y0o1Zj$=1jU>Q(}KI@izZU6GHoR!%D$p zu}2Ivs%{X)drVn-P{&DTSw?(E8Nm(XfDf5$~c%4vwn9>@apf+MaQ~d5mS6TN29o2 zbR0NYb7)xW-Q#p<+Fheydx3k~lkZrnEI4>KN?XLNAVwReVKJtViPASbvHhdzIVqkH zoJTpipJ$R>rWQxqzUCcml$qt&2KW{EtJ>0*ZEy|^AFTmQ)6BrQUzxa!#2by{0E_jQ!roils@m$d!Vt|zZ-?s(qB-T+S#ldMu8+<}>6{7#%u4v# zt##0~lRnms_RGi=m}kA^VBc6SW0A<* z{4VG)h}>c$TK-VN$#Uo4TUEaa3B-;r>NeB^_komaB@ffxA3*87s>W#w0!IO)Yi9W_uQyy}+hG!!Pf1EZRDP6fZrc z+_~!Q8Q5oY7n9687F5k#qGCVg7VQW;q45&CrZ3JptX_X@i%2VrxvKwXk4dr<+Qa;K z++n%NF|$8y8d||s75J8lCcbkgLUM>+OLOOJ&(xuo@84gztx<{bU1Gj#TXjp~7LnUc z73$?ekBBnY=I*5@OFGi%C1<2QW4k^bf8ZB)r^FLsAnVc?yXNLCL`hkoLM+D-)1x~0 zKzPNVg!j^qnWS!!XYMh>XLPS5$+c8yDna z@Uz@BN538^9g!v|!{#F{A7%4ZN62z*3(+lOIkn!+$8?C<+*+c>cX5&H{G#{#LUUXbysnZs}2XktQD;T9Gfq7ji2n7rQmOmg0HgZX%cNTxw0a#u)E*rdwC7`BZh;J;%K!@W;H(Q0yIt z!Nk0C86}ba%e!mMgQt5}qmMpAx8~w2rR6{+pft7fcEN)gVUW&#>5)vi~KlrU$BEhU_SD=vIMj^B-q>%0^cM2#? z;<9m(C!3LH5IwrsrKDA6L+^+s4Yf?IId|>J))!&aB&3^${U_E4_6} z0wB}Dh+O~tZsjg^?C@zX2Ja68&jREV%YD!D`F^mfLHTfo|Gl}ZfG3eH%6dtH;3Lf} zcwf=!u+x{)jm<`xTCBGan)!?p}_TrF_WYopJI&qZgzDM&R>A7EEt}x;klVhLO zARpEKfUNY-Tc(iwj=f{o77BL=+)WVF8%uYWWGEHYX@0mw4DIB!+Oy-^-7rbp6j9l|9w2nWl!%XUeDwWYUsLWjNraK`O^T3PSvLW%J z(g0?$Otz+dWj)yNYADeMJN`&D_#~*SE!3@fZU```o`~n^4DMm;iYD>%G6>zieTFmU zj`>$#f|`SL&=xGJd1rZKjPHk_`mK8M?I9syAob1Q;iI_{)Xs)`>8j0ZX@8<0(>n96 zw`SG1)+T&1|GyY!#ydm7`9#`m~0IhNg6pvi@yUhVwo;u1Nu?EDi9^Z!=2nJg-Bc3L=j%^t(&Q zN)#o3+w=>-0ecsxr7HkW@;ierO7FWnfnMjwLYB8gvOKdX0O$Eo$i0`m?dU(F$Wsla z(0NpE`zarb$2j^vIoWUD*Dvc(ToaVrcAQ0S0|CmZx*YM91l)Ad1RNQYkXo1Y+4H$^ zwZC*FhZ8oUbrHISw0h88)OJeP9gAuJzldA=mN4Hh`Pgv)@wE{>fU_NaTy{1662NL@ zp8D&I_oXn+mElH38+-gHT{6qpzwhg-Y-+)oK!GDENT(~= z$xZt!H{{mePQbyu7jE~w&Nt*$`E6Odm#rl2&r=N&!f77#d~YQBcu(qE;)~H|1`@dg zwFHGP)MJpIpMmsQ`8IlksW;CMTmi|6GXUs-X?D6J0uDNQViFQnOH@qMC!){Z8NB)O z6NG0Gbe|lDkT-#ml{IwurjsQ#xmw==sUFR0eTQEFwHpORsA$m&y33cZ>70Z6J)7pdC2|N|^l_m3id*0_FxUXvz254I3ZkiVG@Q^a_ zw<$p=FbZ7qGI<+RcL=CfC4V_KCAC%?h+d=7e-$!AUW#AdYYEjXu45CpnQgs5)kjwb0WK#r_kF^G$mBx2-5Y?-L)xp|-Us>H+sj1?z5lOj(Vr z-WMPZ42C8O>*v-#VNFTvSHO+xlnp9l;l~;FC+B~r?D$(C;-}n4ypv}{l#eCNdDAi! z3kD$Yo!`|@4t|A!N$Oi@cNIz2{O6l|W8n2ONS4AaGu-!T|IE#N>zi9~!gdQNraEZA z$wBtSR+hhHuy4rZ9IFBBbW;s*`Oe$~2rLS+=yN}!77{Xi`cts_J`iRem&zqP0$`3R zbWjS>{)U}1_<>CYzS<#+m6CRQwSQvq`Rsoh@ZQFiPVnS!Xmp(9&nrA*d`KV~{x_(9 zdyC)?Z=j%L6#i)R(1@mSD+Bi4xtY26w*G-Egt#1NY)i`Q3C~aL( zvYT5O{P~C2dEiHPh)75Z^YeRbPz9+Dk@i*MXD&N4IRHu-3n!-%q}CSn?s;bO-Ta0p z-MlJexe60aZVsl@goH+%1zJGKk|ACCpBrj(h|J%4c2E!^R=81I=Tl~++w;iE9QdG{NefI#+U>HypjTJ+iO5f6{r z^~IV*s`(cR-r_PvrfTrM7^FZe%xIg>ccD)|KD)ZldmO({Kp4WVQ!}ms(#rwR0)-)M zfD#8TI?ew{58Jt}{CyWz^UkI?24LtV)^*wg@^1Kk#K%&a3K6A9pcaEM0`S%Lyn4Poa>fo3-2#_^fdQ=cI(G)x zVL%H$UD>UVr9Q{iO_g0&C&*_xDGy|C(+I|+PT}`mV9=Fhexxh;5(sv%a*)CNuGL?! 
z=}#DRJK3yMeNwJEOLjVYzpF~^v0c${x|&>cxsYO)=}gY`KEV3Xtb8JOSB-B^1%ONF z1;DrtzM+*=uo(UR`3wn%hvO)`Hay}ckPpDdWDLL#!3@Bb?9yX(AI5##GzA0u>=WO$82HR*lq1Bp*6)S0zCy4}Rf8H2oGsU!hoV6l3R?~- zmk3>iqxWg+hvDa7ezwAod9>TYwNcvJIEJDYgs#uj0~?P8 zvq}4vuQsLXlp9al0|D3~5zBy-u?U%)_7<^OXAHiRSTBuFI>=8VVj|^L zK2sP=1v#<#*vIYYI18#V>bY;^2=7#lZS$e1leVJ=L^n9&qDbc4B)K1KFhVY+@Dboq z2w3L>%1f$Q^abD({26mJ6nbnx$YqQIU`e~hOWJ%+lah7j^W2kK=U41#C61=zVahS3Z4;~*73J&Ab~I)47Cfz>MF!ojUwj`Y zZ>ZTVBu9_RZ<&U!-hpL+wcZSxjW}&hns1EeICG-VJ-pGEtPx+aFD-utoF+4%Zb@&yG26x<&|{#Qf`8E#SpRn3Jdub$gV*ZF3z z0El#8K+HR?u(!UIU%Cg3fH;=(?#PzXwM<@$z3e`c$@k}t0JvRe`4X7=p#!EotneKn zymbyZfGBKqHvF%%MU4>7Z12$7YlCPXJCQa7bje)X1P4L`B;F| zz0;1KvH4%EP0)LjKLSu^3Z#0jUpX31=o^vik92_@lYx6L>y#^RQ10sIY=BwPFO)4Z zOC4dCnFa110WuR@be{s;LO>pQ%tZ~f6%{7c6)*Am_c5;LX=* zV=X6HVNS4Dq{2Jdfp(8vF-hAAVNwI!uy$bo*#wZR`f@+Y2j|=u28481oiV*yu1x~4 zzrxID#{fP9WQe+a{$oD>4PZT<;)Y-KU)n+4pM#3o1N!?o?ImGf0 zf-&C5=Z(eMa-u7r15yr^WjshJ4V<6@2ang1vNA9kFbetUoC=Z4`Z)RhniqXL^X?-iH%!?;l%wmACTKJ6?4cs-~irJE{KLKo?0;+>MCcYI*KGn@6QOn$=uh3$`_ ziFunr3Tug39J^O;+MD%r20lFP9*x+$6aGq`xws-bD>AAR5R8i^uvxBy7@XT7v5IA9 zWwD!qs!3; zrJ4jVSl`(gw;X<_@o@V6Rn5(%H11+cTjOX+^*AR1sp(WKX?xA)>(aKLyuSJZ- z)Pl{vRWd12l6YsC`_#W;fNPl{V{14M3B{aF70$LgXj#3cNgBcPHvP>Fy@$Fa#?y0G zKjO>|Y-QFJhjvK3Y(~G8`}+$#@6?;>EjDxzaOLIUd3GR>>Mnl zyhz8aa6U_2nO%L8V_gG6VWo@leqrLnV~|**N3V9+3M0eIFJWE}4^bvWPn|0=-P~mw z3-w0_>Dpedx&Wt!_@0+C58=6F{%B3kF1Sv}!}Vwee|^a3IIAc7xiXm<46HATs(bA6 zwJO~s)WZ85GZG_tjR1%}z%GhUaaJH*FSJHa{no8C;^m(-!w!!cO=g;oU@)634Xk<> zCZNtdMD8e08>Xy7{5jpK%;ZIQf0|*KHtG zn2Ub459)Hg!>YyLtk65HnpX~N zvT(-QjsS$Kx!$^q5VQ9Y#IcYSuTGIQHB5(PJD!ZZUF*2nw!7=_N~ zy%O+)_COQT>sLw=&m>D&p#rq!%2=fH9!RmZ;1AZyH9X?FXuz6`N|8ATyLw}aO|zF~ zEYTMRKy=@j1AgxEGIa!5Fh7n_BRTIpedE?M0iRq9wKSE6xu?(Tlqc*yLBH@_Q;t@c zr|q`d#-(s!l@iUJ$=kA@_T9}9+K>uY?N%KB5}6xe_;1aG>0VbZrVuqRG=_IAH@t98 zJN%P>P`Z9rE_ZWv&Dsy{xq+tgDVo$pu;xQZx{YXzpAyRbajbwvo-W^jnIz4>U|9vKdjHfztDcka#qD64xCe$~tj zsn^@SZ(4NzE9$78XWt`bjW0svgm1-NAE|lXAlDqZxg_$8H}x@)Bdx}RK`n1E6>k$q zyTP&G3Q$slJ4wQVk9GI1Rc2H=OvS%vGR1cK7nAH!BibA( zvvh0~3*tto|Cz$i@kuX4F4?$e%gpL?p@P5Dau%T<;mDlL2g-HIF-3uDG!x^DDp*_hHr~MHf5AH6s18!9(3v^E=9O#~XKlKxqk^+9a?Vbw;T_3P$RnAdiYvESz~o4wSDT}r+RMO%IM zyxb2{bFHmP@Hy}Izt!a?kMYw-G-^FfYA-rIvCg-Q4neB=E_yhLMaNT=aY!v3*7kvg zCZDO|$z7ZPY{3*|pbLhAUQHzWz5`Oq3|72-7P)~*D)m&i&ed0S{<8nn8oV*s&FiMQ z@r^B1Y5w(QNsD&Ys}>{n_CO0<(i2=QHPjb1tMd-6CTj^bax4N$$H&ISsW~H2Lu#3o z=gLfv&k+7q3U6hP=yOV{n=h_(KU17#K&wVy)ysMuhz~r4H{iuD4pQb|a4H>C;asn2 znuJhfeA~dg>>vp1GrJYd1&v z;ffl!*D56wr6*I|pwkr}QuN2?{-&X=BuprgFY?u}ON;{G;tctqw#W7)RF1JATNrsfQGvM*s zik9PG1Z*v5`JaZ;&XZ*Z=|&A9E-5qTxl(7If!=!(Ex$EOhPIt$qass8KV{_qhJKlr zBaLw!MDAu2dR^ydKb**Ob$?V#!l?&4z3z}}Se3@zCfk||*~tVfSvLqxV4_5?i_L1} zj!+j;qZ~XYykOlp^W9=f(jeqZiBNj%-G}$*6yxC2A4+vCMXs=`osBotSXQo7+4!f9 z@yWHVl1tU19h-!vpK5wNle`Ofhwf;Ytxk(@BfTN{c$9C~<)Hq~@~U~NpInL`io|L%r@z~7g7`IDuk0?S~V(T$HjQUOnknr)(eycRJw!untTOCd5V#fI<`(T7i1B4 zi>eEn&<2{%tH$^v6mhL}$#{vKWE+ukHJ< z3SG}-CE)f{@5k?2(izRPwOL|*ehuC;ng`pykh)*{O7zBmDx=A)%%LK+tt9%H#Joc z%BPwitvx&FL_;4NfMh!oqQ@c+BvQX~G00dPP2EpAb55!`Zj$@9^Eya%OyoT3yJm|N z`D*?4z%Z7r5PJR0W7o7#D*4r0O=T<&6Y(`s-8tgT+m_hcv^WHAz`_hG#BxO-(n+r- z()GvQJnl^0NR60*UPVr;2pVXegWRIlCihn)8LYp4Q)kyFZ#m4{`t4~d=SPJ;pNFM{ z=lSVk+{{N*m>+~+pk3nAJdBzBR@Rwhf3e?f*`L6rm`s}u<4}ToR!N<2#|^dE#;p@V z_!T#;@+dc_RIa>s)4hza>1b7Wst?Mx_BN&1MVK+BvdZS!%Q7-lx*(u4H#C{J=;$oA zvZiQXRLLnWRR2V^79deE)!3`@U)?V1oRF&ziEuJzC8$?RdyPkt&h(j%Bp>#fZI4q} z^-uVAb|7cu(+OIx_bDay3!3>=ws-=Zj>YdOrdA(x9yM#f5IOB8vmstJF@yP-Sd2ou z&6W+Mw?+iw*jm+32VNqC=Hl-@eftYPEf(4nAWn)m@tfasFVd>#?XQQ>N_e*ToM$RS 
z+m=5nJrL(oesI4;)IjC!pcMPQ0_n1y<6X^ft3oZ#^gpLr3VMuC0DiaIQ!cBsuquJ*#S^D_o>-gZ$J`GEE9l=-qxWaesq zEO2A)>y%adUSQry?k@vFN5mbXm+q*#Hc_Xa{pc)A-7o`_vYs9KRlqehDo_k%b_LD z&R*v`2E3Ht+-y{hVl6W|KRL2{0z5?Y z6W#FYUxBkV=;7yKCe39Ce-;X{^a7RilTNnXO-H**Nk4O!P4|J^JYQm0U@YWu#^Rk8 z+9aRS>q`@&Z|E^rvb@@(!BtpgbT_cYn8QIC>jzEfk=Mv`a+K7`4mX@=npKqFjIgsy zWjazIMaO;#?5ZezD0a6nW?f~}!n*X#t7)M-V6+rE<&CwQ%!(jlRLOruYCGf52usfj z44m=}Vp3R>uU3o2m56v@$3r$}!25R>gOlg=!Hm74mR|~A^KAR;$bDgsdo^LiGRP6) zuC+<){C23z4NZoc9N#t{$uIo0-emOcC>|d_S5T{KA7)*E64Rw}GFe(6Wk5pxk9HPw zV#HXed#th!l<&g|*5lx)@e2$dC`CKjh{; zQ5dS&oO3TrojYiX)A#$3QFwsX^x*8a@AdtP=zs8nU&q_?X=kAk;uYyJid};5RCOyF zUSVHp^-S~S&&5!qNy5z!SQqOf*{?-480+f6@?^2anWFZvKa0+k2h!Wp`b zY`!P4JR|0A=_BZMQInr1$T%gP;j;HWMy?+nHz)RPsY1$CNHGB!;YI>GCaN}D-)J}v z?jAkf`z4e8;>JYMb>fdaLw23-rbnB!Cw;J8yZ}a)D~|6=8gm8pY!8V4kYP z656kT;MPIj%xeD3i30XQEBPw=L$cy=-jn5D2J&ykuzZ~A{)b1ljL*k1csQ7@tBDec z&9nv7p0$@&FKN{9b*8uL&1$a+zn?9(&ld8J(=0+iRalnzx_(!cMUKs@%AYApeCd1i zq-7A24#Fs0!nzFFkbxRUk|U4Oa-7c}c}kB@{n7oww7R}Lq0VV{b}L3VZ*HzNs@ z8|ZK~0)wNZkhXh@2_hKNvnzrS&yl(LR432TZ)ZO5sxG$;R>NSWDiE=yyi~_8zD2B@ zoC<@3@ktdU-zb0chwb+)WE~imM0u`F6uzTi3s0OZ2n?(}^+*<-dn}E^lVi_S6DIU{d@6Fc8}F3q%LBc`~q&@L7L-;0++bURc$jA)L=>9GhSAb z5fEypH8VXl0eTdLYzqz*f)rg-TN7yXBdm_+a^;>Fh=F4>xPX7v!QcHiF7vpEEv9o1Y9hC%&AC z9bW%r))?^Isa=#N>4|?rdZ3fHJQB^SRTo8@H(LfBZ=|G1{5(5_U4Ey2$>T??f&GIV zLdTm0OpFRlh!GJPiOW$#_0JRn3kkStV?MQ*UurSD6gRk}EFj%?$Nv2vdlYU1I^H&!KYGRyM0Ubvn$FUNy#inyE$lTfTmtQVL) zRbQsLoJ;c1n6{88!$;%bzj+J3gFdlB(lY3w?LD@{6xyu7m@%unj( z=C*sh-T&68;_DAw`YXmzdM@5eN!V_bD>v7>znOIO0V4}dS2SKh(iFc5VjxzjxaUV0 z@g_Z^y5+3w&l3f^e01T*n9^-W-Ncri2)VR1O!S20g1nKSWfmN~1L&$3kjl;p-<2g{ z0RYC$20VPl8Bc{%R4JX*?MJ}`{Pxa27obxCxbSr&gkT2b+qDc0ox_R4Q;wU5Vr^X1 zu4I;sI_QsUzv73Ko`Tt2HxgPel1JOZD|IU3B=U;inO?{$HCmu>=(Y-K^fIG-U9my` z3H{il&iwF^ zMA{}xz%@&V&x$UE|C+zmp%b$D=Yi5ygz1})xaP;g*vk zvMi1aMmM;CDpX=VIT!wQ!Z5twRBNrEo`ZpSO%_&~UbwyXi+VZj)p&<|Clw)yw8wAZ zyFq|Q!yC3GvPG2EbA5a%zBBMu9cydt-5#mNEy9kE#~Vz5H-*bbBbvRp(}X(R1s6i; zO%QL3FB(KVqbYbFp5S&cMI&z8DzY^6A~AlVd{8~HWzMTfJ@;9nAiaAk+QDwUfI#ya z?DmDwhrqwney_FF8!KqhMcMyQZ)Tk-w^gDoOZv&{L(wCvYs446-qFP)?>pVjI1{mp zUc$qt!2LNfK(!RyW}v(%O@zA|Ttd7r1r&~2JDi^5W#PmdY{_vy7T|`=X)a2HnRy%D zmzd)~jaT}E$Ko#;kUf~oB*%Ny=)RWi)-RE03W5*O?Gk=bP|{-ACnaJw1l1<#54($l zkLysL^tg|58EmIninw1^VSq<2`O@6{UvK@p?)v}Iu+XZu>`e(u$Ot(6Ie7#3K2Vjz z0lsQa!ECOp*506+j^W1PWz3Jj<$+hcfZx8e5y*mH;05kt9TxZB@TT5@-vr>I&tPDc z5si_b`q%dj=cHOFKe!ctzsSk|fBsoI7eF;Vdg~L=aZtDKz&atYPJb{@&8z3MgCFCU zrY&MXdN^*mf$H+#A9WE&#uW(2@i0$d`gq-OI0Ej44`91C-11kpW(6*UxbYJ9u6cpgr$mCr`Qz zH)ByGp>J!@j%tIamzLrAF{nSyU7SUIylJ)(SJZTM3$nBCG!guECMGVb+us87^|BDg zzRCay%5^JWcEVoF>ZQD$v0bjJRc{yL2tBa$SR80Ie&GQyt%&I8S_A0=?Q-K%z;F#Y z5?Jmm4lWPnzDLQjr;4~_kCy1=j5{_dgC$bqdtF8Hc;j&kfQmeemwT|iU`}c|fW?p! 
zR9;w=mmL^y*tnXfDQGkLV~wD#GkU_vXQd%#7#0|4NXIbOJ;HDVNN1pv6N z?qb8zqW%5XR*;M)ICyfb{iov&lg$l)zzLNCkL3<9(*^a*ObwHW2;|!EpCX6atDP3!c8Ye}t?M8wZHlbYK3yPT^~`dt><6)wcdu@5+GUo!9CLZ3!sat%_n;rDX z3bPP+83z#&k=Vg(EM)IcOAjn}nicKS*%DQWIZZ24yuU!!7%#}yBJcTOx5??WlCb%_ zmcLgQ+u`916%uOkw&k)K{};rZb?a*R?&CQpx7ujN4dTjMoKFsZpX~93ihk$&m1ruvi$-r9`i%MgIs?T+$Y^mkdx`={pHIS zP&|DExWS&WGK0FZ8)6;ZTCntGVP_7pJZSdvQ_;i=BW5sP*dE43ynC|EzsopdX4MJ6 z65EgWf=PJPa^#}Y{9B*MSmBcpK)#U?Kv>=$j2&-$BNNY|0+U#9Gt$QtIm?O&ApYq- z%N+h-)h`biG zzC$XMWU5pwgdj)k8^H`1*f*T&1YN53g?I=1K;mbs-{f!$i>qd|94Wi>*4UM|jM6F- zXZir(??22Bp44e;TNsV8?wslEn?1+70IPICDxP5>lCKZ=9dscW`%BLr0}H8t(kte_ zADAK98r(fMlySPl2F}F4O#imjxkTN6cd+@+AEaf7xBYyK^0W=k^^w z?i*?@O7HRUX~4hfCQ)-7^Krk}f6y9CTUp^U(_u?SUF55OH_)-pz{bYr5KI)S325<( zJ~|{dv~^dCq-z(_jLnMzge`iEO!I9U%D4J{xpdQ(gyN*5)o?HS|Xa}vQ2QX;u5$2_ivuQ#l!pvfW&8D zZMlG@@s!5sGYL=qSK}M?Us~JN1F+&JV61T834O@rYv+t5?LUEMP?MI6N$5hMW6xXzk33BV^?^9H*v)QTr;TOlmKgiHohTpW@`2|=t3iSXqbD1 z2DSFd^L>zFG$8p@t^<;d6OfSqMUF+<2$<^umd@-BI4Oy(A1s1QfL2#C!d>PvQt;f7 zxZWM9yFox=$@zJinEF#}ED<2brAtA_4(|1rwdMD=$7B+kk6vaq+H&1xBO3ry9#`*y z|5bxyJ;96lWwL4Oukan?@{fC%aUBM(<9T6N;k@aa z>%bGr^!%H^d%{4$l|)AZHN7n;I0Akg*nxOKA%7e?ygLdN!C}C0gP4L3`igrr;PoUD z^a4>6l7tYFm5rL3?Qcx&;oL(U%L!h0z*gdOoT#5Rooe3ht4hT0lhR#8!=ylLh$l_^ zXKM#l=y;z0+Md6{R?3?CjB&r?)*ru3T6dd_I*FJFT%zUQAK9ioMn^It`trQ#=l1}( zow%3TPkfTrf=X#zUyk!DoWz?0vps07`5W6$15%me7s;>w*Ku;54cIE^fTq~gUArx& z?Lvfc-x`ArW(Hs=DJ-zJv>FuXg|+z+-gPe=)3dWs%@}9xWmu?cL{wT1*ZhLuF=Z72qftx1;Qyy1;m+Ws-&~nVnKzti2opOLXAreuMp>o` zUwzu(NmAhLySI8<7hJ6fz|O$mg03tW3o7zc&yAOI{;H9&;f)@_#Pn!(@HCMXn1QFB z6C=5dPXczZO$`AFVB+b{Zy{b1HPQ7qpbIdma}J@_}ltUK_H+49uLCbrvp}#r{tig&87+MG7Pb}5ds-xF`8_zWVx!pPqzgQ7+)`=78OJ!_ zkAchjOS+O#!(+F6e2hD$CN<6>6|@GCQTJ#*AEDyndkNl==?eNz{sOUZ<}@x=rD35_ zqu}((^V>_SY&>H$pwDtCT1(-3@(vVBvVq`kV7R4!O4h0oPAs(kyQIR&ZkWQ-yfEi& zVda=;NTI9Zb1#M>7jq5Dg+RuBs4s6nB630UMczVyQPhmU??ytw>dt5u>VFG}%DJ}J zY77N!3(V@A#QpZlHlC-?*=@b@r*3yEH7c|iM zcUTH;%*VVRr4t@*g8q*3gBCh5Gb5w5jG`pF$0&0_n?6^k3e$R$_$DzU{tT$9xk>Mb zx)taR*Lm9^4L68s>-LW#1m{yLFE?*q@z1greLm0Yq#hbA)qX=eFE|ZsiE4P9{UTE6 zt`wKJp(^_LK`rDYY(el>t?Fsrl92Akqma`}!}_?ZYzwU3JO<$=9^Mt@^DQPVa^xNiI7W3Jpt0N$om#47(%*1HfW4+Jq6z!{iy*e4 zQ2z_c{(BL2gHYh$fFtGMlribrtFk%mHMUrR8G3r&@41U*93WAsH>vz>*f42&Ot?K# zzv3tY)1?zILm8FwyTPyPm2xwS77=KLeawDr8J?b2*Sb{iRiivoI@9ZrZ#_YJ((G|( z&(eC{%`F~7UxIv~7lQodIm(bbK1gnSpkK-Ut=2rwRqm%9*}hApN7c?^ak7x+Us^Ew z4sUe%&l&d+PmLnc8Rqs2FgvO&TSLtA2W1E&t06f8N)-m|N+*um7&RgIP2@lY@svbiegX{v-GLtZj(`s zP^UEPiEDcyx<;5YevBx%4VZ}4FWul|tUo@Q2PRhR8p zLF)&M=e_mDMDNuwXRB&vobC~PWWnyp#3{N+yj`x9xd=RTCRwA3+2l+kb+p}OAzZ8cFHI}Xl)=#4MbVR?-YRI(J=q5xWY7Yt%p&nE@tzOxd2gi z|FTA*$vuw7nzRM=<~VLKYR7bJN2Pl#Qof4)Xt9p ztH${QWhZw(gydE`NpdkFL5cqrj=_Nz1~Vmj{Rr+mx-UVO&CvU4IKTe)&Oc<=8?Axt zItNGfYJ4Ws31;}&nm})Q?21FvW%dTEWc@4Pwu^X(WqsysPrQFZ{EQxy<0=bQ-DESB zD)<79_iZzXmxeV|@15HaKxl*8!#wo;uFXaTC9Gh;;b z)zxk1y(cX$U5W_}C{+03kdah%#KFBfE~{!b#UiTcA|A zcRZ&LX@==#lED3MxBlF_KMh{e_z}QBB92sJs*lkryHSie&UdS;el(~9VKrC z4m+;XJ=hB9)n6ecdD%~wOMjTiqf3q`({Pa>R%FqxJsLZp44 z>5iDTywz2BOAIPE?sd~TGtcgt@K?`MU-w4_Y;|8SA)0G0(|wIvsuL1DZmGIQBI$_9 z)@|b`pC?{Mzb3~*zU~u}jOaBdwR6?)MAt@cfY*5bia15Hf`#o5J=#hIC3Yw_?O z14Y&BoE*^0nE42V_(K$`kJK5sG<7yDfhm^$Z1;CJ!rzKff~>l&Z5;V&vN)GaE#x0w zg7Xe$qCvx;&QU8M0J^ocm9`6z(X^TaE`A742Pkz`TlF*Hw0KGB1U`UPW+ad;cSTlK zR*KChnWxq0fi{Xf&|A0Q${+z8t)R#J=r^j82cIdp3@Gl}VzYM*3yZTj~&5zI&<*&c{ z?T@J8l@1r*3OOLzZ{Kd!V`F6niuo#iuB(-uMUF<7c`oB}pmF_Dx&?jLNpP){r~92! 
zNJZc_+5bBf(<}!$P&?531H!6g=)pWaw+VEDh*F6#f>&gNTTa;@bJ%wItd@wXXEB_| zsIfq&+;|z)R9EE)=9$B9M=t|C06an#upR$I*8gdzdseR|IA7Hf`DA~!s(%N>En;ti zlc(7Yr9zZ`o5~6A07u^n;*kKth>;?#`)%Qzz@7#T1q(i>wpYNzdoeHnL`IY91~9dm zLU;ZZ>SRs#=*+&kL|6?v2(4bV5#I9qjZZ9fx}P)QSl6PxyBQ@=P+;m8hA7ri1qnuv zKK7H{^1#y1pYP^Rh7jGp=pRr+g}XDL6&YMHuK(wMSM24T zlPyC;!(30qvVmqk3a3gV!L z--BN1ZIpMqr7bI9pWmB}vfkW8)39&v=czolwJqXlT)dO-kWMD^Z3;v$K{zN`z^n}) zEL6NA&{O`u2HH@PIex}XOM&2}=6z6N7XWpf`~X?z4DP&dD0i;ZKCh~zTB2)8${`?X zDt`R-n(cUPX_Z3)HMhRV3cw$<8>=QCIe}*Vi$fqX??bvbC-Hq(LRXu8yKIGIrI1cS zyrg!cH`xFwQI9mA<-*3TuKN&m(mc!YiQtRJ{FDucoZ!12_rjfltE(R*Oyb zBAS(x0zG_1u~SRSMOu@KOvxUwqzCPL3I>+#bhI@0;8npM?~C?u_o%n!9-=L^yf zi5VH?`RP6pzz_s6|2y_=?WW;OQrV;Bx3u0{9aFLTF_m;}$7eTtwrJfoLA%soF*LSu zIW*lU-u6mv4HA2Mdn>4_B0wc39~8cSDPFUbV;A_#Ci7Rl`EY}8 zwhH5bqQsFWqLMrH+40!H`~b7>j3xJpYZ&y@zHqd3ER)e&TuZm&8DYdV36CM0d`R8{ z^Ns?m{&bZ$%g6}l3tBJYiszjCSP)v^plphcFrts!L)-5jE5m+!WK-}wp~PL6?Paih z=VKF@3ka!4p=Cl$6Ja<;;Z_~8Iq`_ zo72Yx%zO=KK-mb#jC@vg#iZwC7O2rndK;Us`#tB^c@V|K*&(IN7j}kAWZ>Bnwt=JK zJ3pKm7(hcZ$^Fc9|2;j-WXrhmw1pTvhkjV6TWE9H*YXx>2eGnSC-1d5WEPU9vWFyR zXNd5|Fg8`g1;LfS&6_Bdn%ESF4A(VLRL9XgT%D{jAd}en!1Vg}g;F4y-S#y5jXtli z^4V+8bfMLn@n_x3V?K)`_qsgB9e!Ww`AhoDbgGBHzd8{g$TNT=xfz#TxPt?2s-34~ zQQ#5c8tPF7ZSEiHto=(4Gu$F5I3vAxSXGZS1??Hk%P(YfDAO&ak@P#u@kiEg>+g!T z z;A#KPST4ChIvi$okAtVn_~#FKcurtyp_2H;+d{iMQMW!VqS(8%kg-^sdHwaUO2GFM zmcK(Jf^6u#n3WFgJ36s7o?Ok(KcpSQhvxf6n(s-mdnGmmCcYHE%W=?FELd-qEY5D= zW;VRp_1ZVVlDA9k)VQCUo{fE}<3LXohm9W8=|#JGvg&<0LL62)wA9| zg=M@upNJUvA3odxHG*4|TuCo`i8^bB%X!b@<<1;J`G79CoWyp^mSHExlG>^6PW5%Hi_6LdhbS&WDp?4;fH4WY<9kN@Pc=G7wd5&H`)aYP+gwoottzS4avF)V z;+S8P-*&#f>YwXZ9d3ci$8}frLsIkP%egp4yl;8+-4ghmgHp@0Y{4}6p|~Ut!zC`6 zR4San)$WOo*`~ed@LAiQr(C%4R`fC2n~CaX&2R7Hcd_v18amK-@W``y`#3-HR&Jc- zWbtgEm^^<1Lw0w74wM*4Hfl7}JEJ`KU7aJ{YGKY;+`wiox}YHCUPhk$NjXAfp(U+^ zkOvD*lsjd+WwU7}^@TWobNR8)Dam>u1;tQgWB;}>b-uuu{)Svinxc_}!0Ob`N$kwQ zCY?uuzlATqzzpi%vuh7zub$sXKU^;Fdgrhn&X?Hf6JP%&Va$`L2x;^f9tmvhNhU4~ zs4k}`(z1}H*T6eUp?@o477}>fUc=1{c%z3P?Tujq-Jx9r#|>kiIe(z~gZ+pI^(GnN zWm|gbdK^@*JXy=asZuC>mE#)s#Czi*48^>+c(cTf0Jmb3eKo;ko2vKurV9JLe59+^ z;sfTs)<39DnjIQyHmcg+-fa2sx%f;(?cJW`1BuJFrfg2*?z82n-@c+do!yD}Xoaj~2y{tbTc z+fDp}E9)$RFNbeR?5w)?p0H|PVjIJncd_r1#M+$a)?_Nu+y-g;evoU7w{_x6fCFI=&?caFn@W3;M|3<8r4+^ZK$hh= z;Rwxjo9Cb?A{};q`lXMLJWH{Oe0J|Q9_oQ(4>e2L_?zg&S@?UhSBV6P6%JtfJ$^Et# zB+8SEyH`u9U$@uPC|lUVEyp8uD-~!D3RovoY@e~#*BnPRbZsb$&heJLy^P)oVwL3A zqqxUo?82qXAV2dA7rWxBSYT6s3nif!vR9Ll zd4rl6sdq7lymn2#CXA0NwA%SndLt|>Odl4|g1V>%^>;n$$D(h``}?&Ft~{(B4P=F6 zEA=*x2tOSF)@vAgej`|WD8#Qdd(M@O;XltVI=F*SonB5W50zm4?MXxgj7 zhwC0HEQ{XcHu$M?sz)EVahZ0ljm0yFE$^y!b_;q!G*-alW2x-#zLB;%OJC!UI`#?? zK1X>JMb#oT+Y(8g*=Iew&|6Hv?HnHVx>hEBodHa#`Z=;}mZ7jG4v)7qsV!cRCf%a? 
zY%HvJmK0m)hxSd%C=65d>mSSwK!Mh&fhnn%Q_IEWtRUXPlWzE9u6kBX(oF))($7+` z?~M{I^PXK)(+);@R4LEduCcHPOm}=tssB1|0D1k>hz|L4{5=NLYsiuZiPO8+1HXk6 z_P!8U4iC6T5PF_a*W1$4e!kyZN{1YAJ4%{tJj9+v(pZZRtkYhAc z#^U2g0k4^yR;z5f*3vX@e84xbmooBCULSl2`Yi8FR@e$^y$FAJfw73WD9t^Ra3O*R;Z8NCv}szdXYiay`p zS24d|(nETsU;Bl3I4cGM1Qpd9no8fyWag;O~#{rz6#%B~OxHIoU7pVN{ z__T`A<2>nm32Y*C5iQnzD8!MOE1B;dGPtMyL09ZKq<0A`(sT>5@D_((6Sr}8w zOKc`<&+94W6W^f~&+hJF*z3J9E00zJnqLwh+0GGWt#m8_o-HNL%1yN%=UtQ~z@@{k zpO@MFPVYFNQ>?JeziJ&P^Hk5a8&DVVLLZ)6XH5{c;W1Aqi(N&RAIfAj*GaGW&mmq( z?><=HJSy*lhbxJ*k>D}EO1^V+e&Ge@E?a2@OJKgFpDz^p{7C-@%O9m*k;Jh)H1C&3 zW_h9mV=$e6ig(2V6!?;h&chtMcw8c>fa)40;{XW2mr8BcSGrZ9miUj!ldwCmb)Eq` z-jhyCSuoNrhJB2PRuyH&4s1af_Jr6$pGWyb3)A9B#NdarJ+4A5lnz9BmR+K!>frPy7`blV3=M5J(7Cnqbl{P#Og4elW!g6)eS}5 z@9!N-g-^o4)oce`l)i$?ap6gO;6z}Ci9MEwE4iJFf>xta1s8T>cDJgW7IVDj?N;U; z_FiZXCuF$oto4xXtjm`#eXARaYqAh$p~hnl1$sc|DsY*|^PtWc2o`yCG@MXTQCVPr zEa3~o@A$WvzB7hAG=OJ#E8DmYL9V$nOwN}u_;1X!rHGk@PbM7#Ej9hZ7T-OjV$1Sy zX^x4JUmkc^3=XWiNsqiAo*!aGpqmOjGC`+FL)ZyCUZG{jzFi=q(zj)Ao!8YC-|sNP zz1MF_SsAhs;YJOm9s9Z?c~41Zl{8t;$E)M5&(gayxZPP&=}ru_?aI92!oxsP`p(5K zW;C{oe<=bF;p-?T@-rhv$|N;5 zXaz{)z2(qYG4R#3eaQzOdz5Hu9Cu^%YuuwWHpG0tSvnlry!Tw^5t+4)zWiWNY&-Z= z%-8e_5MM4@ygb-cMG~9JRq25jL*!bqP0=dp0nvN+=b**7>LYSdb!3eCPP7XHq zxwZ?--nPQ`7!X5+b-!`A=`%vyI{65Y*MAZqyxY4v6~$^&WvE5UyQlV(vZZGMGuNdQ zl|BqCCdt_%A8A+irWRPNykYf-o-)|4ukUh#d%H0F zL2t@pQeJv{m7}W?i=h3)K*NL~qGV;bbR{x?eca3`eP*)dNV6LD+?Na{(t^j(l7x~3;SrKV&$v{Fssu}s@C zvbH}nT%_E*O-Dw>xeu(4bT5wWVBY$_Q{OJ@>HbKlplWJhpUd(CuG49QQlpwaj>|@R zc@&dryX!?xG&&Mij9R>1w1cLjc^XK{PeLcVc?qpDQ^5-jj6hnrDuIG8slO#V`;yA| zurgl`S&vVKm4h62lBX7QKE?*AYvk*xodNePa;uBB{pei6fAL%7sK|?g^ED}m)6oq= z0{ygqtjVwlaW!{of+mu9nE%^aM`T_Nx)dJa1cj$iNm=_kLfyRYK7dSj*4nNOzu@zr zywOza7NMJ}PEAVR9>+Oss2)m!NA8E)dBaTq%G!=Pn{|m(XzEsXuMwpY-ROyv%~N?( zIJmJLu6!8qG!JSLzeE`rOU^vX*9aA|8Q9H0NS}4lT>y2?C=Brb-pV8AVcIXsR;U+jcB)9a*i%{`) zzw3)#9dS~Djyie+2U&U-_eeAM{>e#}35AnjkTviVip~P%L~2jfT{m0CRS$83L{xfv zwHIar6MD@cV6q5Ui}=(kRN-6&H(P&$z0Rp)ez>ewGT+zGi&3cpnz6q+Y^l+jy)nC7 z+$r~=7H&1xjOF{Cnvs3uH;NkrG?O2mV4@5Hp5Ns z(00k~1jyQH68@8 zMItwz;XeQI`R4-#I<*%yoKc{%dVxZ$-;QTK+p2rw_rY3_q8S9C;6RM!tFB`CcFzHgH`ydqa^+AE1&uJ zz0#DLHXFSPrxYhIftc@Qz_KX<9Gq+D*kMSvjfY;(UGSz}&@BEc)* zJU{S5IdKPm>C20kHD2w{{dBS(3gNX;^jr%5cExb7qxbM=k;un?SsYp8R>!Y^kEiyx zGiw1hNsxZ0d%$`RI@jB&A7OGPpmOH#^k(8{0_QM6TO3=qy8_*hYOr=w`DklkiU6!9 zt#w|UW*$C|0*ygbINW5S+N3otJ$T`yeveJL`Rpm`sMF=pw)F-$3F{K4)p~q)U}Vq% zZXI+nM@&lz7{Lv>e%k`#W9bCsdGUjRi2?3!5H8=asbA+<3X^;ww?HbX=&{0dJYv=N z)uh%s_f+ywKI-J#H-$6nER7GpLdldiXBd~vCf}pPNzw+H0vom*B?1=io_wf;PBc;h zZtt*HkY@2z-W<3_(og?fF|NA>&o<25;QLaH7b*NN@p3KuDsxTYqT>%{2Rg+&KO87L zwnwnbHHGWeGmROLlRSzRAp+9*niPb+aTC$?Jc(_P^Ky?&hn_vPC_7a@!ro%wOu|;% z7?wER5`Gcb?3T1V^8QOAz&QW@O+e;7?bAD4=Z5auSQXwiZJd%M`r-=zZnV!|j6C3~?kGT~3HY@9`9`gN*mb;d?% zU$Zpoq9g1E4F^e=#S?+D>O$zXIc1n8yl3T+^Y5UDy;KVr1^X(h3j74u>b)i<3oe7= z^@P&RW|{`7LoUH`jz*d4gM@Fv=UHecIedL+BXN?t^}H#%yUOL?T)yF>%t z8cNFgs$hjHy|%n+vT9Ed;$(Le@BjNNZ+adP#Us1^m&VvK#-{kh)N%)d?P^q&Jy?d^ zQcfOue#l1eRQ?xP50##IhlCH>lEbBqRFsC3Y0}Cx`d4#7J#5U5fe8dfaX2Rv4;Rw@ zv(8Rsele_Pgnq{Yi8$OV*6NFs2_pzY+Q|v5mP7mLbxwRDY<@|TK2rQ9_aI`E%hLAV z!b9K*RRg1uEy z6n*L-UMpFU*Y*(@hp(;Y_s-(qZWwvJk+|3W?WSE^O<0>-FQ->(!8!&rjIQf9T2d#? 
zYI6@qIH{qDC|=UZ-OqBk?;#~B-*V`w(^-arWM+NA9lZkQI|?`@|K#m8dx3$qaGLlW z>4fk}SO=BoDCEaFMPK4QTh%G1eCQim3-Th9a?8Wq!(Ag48B2cIw#X!FI-25Zka5mJ zE35t?d2WMF@f@0SE%bAE0#EDz*fF~gH7Id05`WlGM^DBM{LnuCwQFmcESvQPk7ENi z*#f5FF8a%)CC?JjI!0E+ELBOtL%TUE;wy5>bfO-&qE~+DV`8LK7#pkIYI`kOahm7G zxP6c*hHKgQAErU>XW4icIGVv-1_6C8$OeO24yW;Jn8}85B6=lM_?^1KoZnz|!+M4l zSYPwmZTH$E=N@FesVbwHqYNxt854qxS;A-?OsOgE3OEj_^-ia9@<5ZPDWW|thEkWt zjDRAzcn-BKF<36R12t$FNa_Et^948yjqXd?G?M|cN1E1H1_@T(UmxopH+fQu)G!cx zQWu!kdBuI3lt{M%_3|x0{#1^Hu7Jm>98q)ve|cUc&>(CLy`nNTCr3J5;PO~dGVh}i z9ihWrS!N}*hZPHI=T1sy?Kyru^3U+B9(j}JE-S0;aW&yEdl?iJl2XBK)l?>Vr0UrX z@Of#6bC{_JjKGk6=j8~D>YXGaW<&rs1SPAiSMw^$;es><=bGRJJ;%8zf4vs>S{rUV z^Upb1JMkn7?xn0a-3b92lX%0D{FXg9ItEYr2@bt~(u`{IT3P8&WH&$1Zz6211!i?; zBskZT1kX03s$X(kFZBaxazoh=;@}K`n&V(a@=Yc0tllw$ojuqBJ3tAVXj;EK;0K*G z6ZA-fdc@j=AY`Z1)dHojf4ergODdq+TN}%_AkkUI1oq!Ey^@_VdQy_CF{#~AX9rOu z#IvpHx_lM49`LRZ0vj2mZ;psYDWq;EQ}C4-RLQ^?GPj!=ld!J65x3Gzr{HIPWtW~3 z0tQd5DW3EuHJ9%1YO+5ypeaC<;Hed_0#D=&ij`^+n{(hcs+SFGBEj4%C;^9j^#;7% z=U@!6yENfWh2h1WvRBVO{036^TKf7aaGv*nya-&?LTx3fZ-ZCzKmp`yw6-n#y=?KF z^Zb8<(m-6p;j>(T{;fxE&|9TB-aojhi zF$oB;Q%j9cLoA^i3HXOhRV;P^=5EW8L@8?dQPuqc8<*n*`^$9pHy8tSgg0wCZu z1H;e1NeR4ODXrU?dV5RE0cQm8LURgW-oJ#%Yr#YY)hV^iV1P(DJWU5GU)JErOcD1g z1NDM07K)^_o*!%NyVOr>89mDppni|scnqY&_47*&Y70%hva;!)6r0T?ojI)=f6d`s z_m4Saj-`Kte}g&HE*M{6f^t}QY-zHdM8EPSfodwe2xaWtR7e${u8^VNmBXz)2+@{c zk8qe`M-gk6n8pJ&|8UJq#5cY^xpI^A!b54Lhj){ICwuj#ylNAqph)V*Za-T2*)%#a z`h@@Xy3+|7*676+g0(8Tp%mX4UiW?%t1~%*)`k`ieS)|OH)4FN=|0A1**P^U&=W^s zSd1S$cz^?$1c5mA(ewQWXJ`?IYrbT7NIDVc%o-{h8Ut7rNQ>lw^a(DK5Au8vsBxhc zEC@3YcUr@qU!}n*8)x*ul!^SOHFXTim;DLH4K17`rIe}`jKueJZ$GwZmt-uGT=j;Q zQ)ZCoC;j?y+BrTIVxr%hB24LLs&&7_|A5f}ZOEyEy529`4~^w!71E~(bPI<1&Ip`z zQzb4pJiWhaa2_T(d{xueFG3>qagS>NP_|cC!Cxexk?82@%ld&jCzIa#P4Q)^yH7)g z;KMDx!gg&FJH{IeB(z=Kk{U~$;x$$a3q{8zK&*wX-{FN{jEHD9i}_GJbToHssK46g z#F7dMHFhdD&i67wXR6l4cWVXu6_|b|l^=l_pD6od!|w}vws|klA4sNJ#2y1LJ18;m z&z{8fzHeSs#JIDhjSY z|E>Pz4z{FRQo?s{81T|%_si4aNJD|Gk!%bAyPLsEK|-sTmZeGQ-! za1XkCjXv{2$YC-1D^%h6;`N7SAw+u6l0x|D|JU7DM@9X0?;-*ssFZ+$gwl;t(ufFB zqA+wyhlDf?Eg>i%D%~PALrF>tC`jkfDLurHLkw}x;P<`veeb>N{&j!<{LWgeHO!pP z>Cf5w?6ddtbX*?S|E89XDjK4OqV-e@xwv2plm1UtBXf(K@7Htmh)WOZ5v&F|zOQJe z7WlRaJhH(1h$BWahZ_V4GMWKFMxS3Wv3+88jMP-6vcA-4jw)?sIWNYAW-6-&u5B80 zoRht2$RUUI(NXk(Qgh70bt z_2ZG<%&Q!RkbIMebc5^1b=!$z=O4e(2+SFzx)_CO&b__|HcpWGu+eWS{~8mZvZWnq zTC)y!N{?mHC3lv)4c>si+-|xYHZj%vi~F)U>l6j_KwUju&dw%74g()UrG6Z8uCp=? 
zBKJ-#4AZ39DS*Y3VbeD~&@hts-xTlIR=T-9UI(^63k^tiOfd z%{UnGkclYmVaV^N=t1OtTecp3dpB3ozZtt>;jBpf`#V6!o062TH;U1Xb&uVAq1(=D zkI92Tr3xPw9l1ag3HoxpoA9n)1%ysgcb~o9wtS~qVD%0~22U8hW%qIVKuD}rAK!#E zKkLr4MW{;e<-wBilW_-ISAFG&iP@<%w8=xWwk1KhWkTPhR-XF4L9Q8SjhY-QewQ$- zsx?cGssHHBX*TpcRh(CYsr13;G|?eF0;G8?Q5!fjfE3UJv;q^r_fZ1$lyZhSSxiFj zO&7Ow>U7O~(-CUyp&1e1v}eQr=rIT~7gT0_q99&C+C2BRv>~b8X2yQ178(x$RIsa` zOcFAF1Ych|^==fdnnb%La^h+D#-xevwKd^29r>3rDX?je=V34s zeFTXX#bTBr20p@Vc$0x4_v?Zj7x&ZEnYMB8s*EBC>xas;gAh97+<`Y|n0RLwqy80D z_@6Ib3=J>0;pyia!jLhAnz($uho(YWQunk|m*G>ds=sCS+m;)+tSG}BAJ<#Kch{3K zLc)T&*OR*E7mm7pesWj1t{1O3Ox>iWw!^>FK+tEo7DJj#FA4QbmWNjU{F*kbu5dpM zlYnX4-xB^HOnLR&_I(~nx*oj$+v*O1e%}I-{;L2%u_D^j!qU=q%s8z-(*e4{WEHKe z1^5oy%c>DWD_UjCDO0N5S}jYK9%ajA>v;(7WAk(y%Rl3ShEWm7B6fw(h^;YO0=f-* z0y;W6d!e_RFr{FbgdYpYtJ38MbzRcI)I54$IW%%c6+jR+_Tns{2rS(Hxp<8qh7nr^ z`2k?-i%?M}Bk*bPixs{0LEs!_`b~ciUF|-@$Qc8)m+Suh7kqVFK|RvvK0vzuo!351 z>}Jw)W&w;{uH-IH{Jp(v{>;wX+&**}%ICHYB;{iKh+BI4b)U2*|^ z@<8z+q-8Z&LBQj&Oh1^n+N!FNXXvU%j-2~VQb9#p7>m^5hLg(rp1%XbBUwyp=V6WCHsHaPD30>>Y{f;d({e~+8E3Z=LsHHpXhhu95 z%4uX8kgt0NIOBXNP~KZhBjW&dzX%1?F-|V7vS{y-CV+Gpe0I8->^3cZy)60*SKZNw zERUac{ta=vb*5?9N8tBrz1R`^(zsft7eDw7@*^%~UB$Cs;#``B6Xta9HWR;;&y6KL z@GUJ3W7cg0RXSPryIg7ly5e6A!-<6YeQT?{9XV#R7FF3yz~HiYveKWm-{_d0xhNT3{juU?fKy`*d{cd7LN=vTaCoP?H@JLqV#$mIHBP8;$97_eRBY7~G(nO_wS+tTu zG`*&qTLt>FpM$D939aZ`7H_fetDeS2=%h4%=jd&*k<)zv=&e1`(3xgUX-(LB=) z*T9P9JU5T7^woiV57xO(BPKJIm1jyQr(yQ#tO04rqqn7C&ofv#6Gw`7Q{=m)$falDG7Fb`=dc8!_Y zX~QCVYSm%UlSm+so#dZagST-Js0FcD%};4DDEg{~u;X+A;HOwV2JU+6S%qE9PNwR7 zsUmkZX4>lw%vjTwMTf-_L+7ify0I#k77tX_V!}PYsM5%oPVy|B0<7x>tZNg9RADy- z#3mj1!CyA~(+U-4c0zmmsb`8psthtdRcYi6(Ka4uYx!(W(Vm^+*Bq4Iv@t zoI0#?yIPOmv9zx076LG>s&#sJ9h@+Xd~uH+KD7ujI%{*MbJ$vMW3^eyQ%u^K>Wr0K z85I~WS}dd23=~Q$5ciWbIQM*&AA$Cs4qJZB#L5hRRex%lcCZ(`EJ>&T^4qCsFUGG8 z)1c-l1s2VtQ0xrx_T%9M3k~YOJUQGxom~WBaO)e3tB4Cq< z<3y3Up!>OmRPO~<>s<3B;{b}=V5!Q$$Q-AB<`qA5z!}8)3bB04%Z;%*d1XH#?dS## zdj^S^NMMc$7tz^z8%3MNF;g!))*v{?fU+?wltkN%esw#a7vFV>6^OYw77~IW34mam zhztgo(8$a%`T;tR$xILxgrVZo_-Q2@`ncz@cjMM$NzY!C8>&$LvN;@wWDR1lAS0F(QiIUiD|M{g?H*0BeaNomv0KM#2 zEaYDCbNj}c(}q)b*RDkC?sqo&h8NU(^jTM_kEc7SPR-tR(nEz}Uk%9{SAK{syswD9 z1?EVmwuGC!8MpJ^r44GE&U;4!jXPb-zrcA4>#MBa$&&g$@uo=U1n}BfYktQ!6g${Z z=xE>_u6``v8e13}3)MVy$Q-UaPLi5D%tT~0`}CwC)2{=NrlHFu5*c7;)uCjHiLi(v z^*)KGrYV}lKd{|dl|m}T*=aQ#kezihxVh>wpSNYJU_z)=91Gt+A0wJN>Z5(oFS94& zpncYO6eqGH?bth?bFM_3lRX9%ZfxURlb4jR`dD{c_LK1G%P6YCgB;?qD#^@bHu=qE zV7w%x<9-*6efI!5LBfv=uqO`hJOyrlgKGZA8e9?&$&7_sz@_TmIC5m~)0l@X(n5Ph z+Trw@(J?AsQ}df~e}tw1B)Vkuj`1(+-dxIKe}7LS>EkBVV{}!~P`%6YYHE;&>q=jy zhH#IXqGCV5^eP5WO8v+(()K1?(&HPQZa?YMlw7hTSFZ8K+E-k@8vQ!auPNyM!kx&n zFjajigcbTwXZ}IdRjT)7pIp79p$6(-qe&*_L!&GDZ&2M#Exc-WIEfYMxY*0Bw+~s( zpQ@@N05(!K7A_)8`XtkBzpXuP>;5O?Kl?p4wyYM_pbsunN@XLAPySF3yFefio$jn| zUtj%_ZRsi?o_xVhjb*uT?Ug)ONuD{$;jLYNum)&G&=Xo2Ceyp^(_u$OEUc_ZEXfi8 zUCGyXsQyLd@C+@9N7{94BNHkqcuKwAfS;>gZB;9|vofL|jxI4nYNAF+G8z5jEW39m zHg&$dQ*d)AiOrSD+tkda`)0PdQ`+z65En(FDkQA-LMQ}7O_@Ls8L@aP z0dEl?P8BFi$fhd{LjZ3@d&+Y@5;@b@h{&3A{j{*8bOzY&Ruiq#`t7TxasZ#sDA65< znpIXTriB*aH<#sScDFex#1gtCTDf@^lFOyXe)=jLru9lxd0pStYZwtyrtt;1v08+s z05QbS|8&2jr$_7AvnaovPw6Yw%V}jj2#F;~*TGL8M-_enDEbu_P`dsiP%~q#~X}i~bO|*`Jxb^JO;r;&3NLN;hJB{kSZMJ5cI)4AgqHhSgt~|GMmg3>*CVBwyzzROJ zsPtf1S5DcGzGmkmB4#rUN47ov5E_W4rVVhb8?FQ)b#`2Qd?Q%2ubLXMwX?tn#g4f6 zWr-}&MYWq^zL<#_Es${{g-s7AHFw(^MFKwx9z{tkMnzVk-5We}=?&KES-gLntKP1A zVKU(q8l%{VuRgU(cLSg+zeC7BH*O3uFVJV#066&p0Kd9&a<)XasmPT-NN|+~ut6 zs+0l6I~ipOiq;^k&7U!;*ixXA_%<*F4R0)IyFGsVX8(ApqT9a(AmEl-2rSz=tZ}oh z^HVG=?T1~v{RJ@kt%=VZMaf6gcYpN~2Zta*P55l)k)F{KhC6(?kK?>3xmZlwCW^+M 
z&-0P8)~kRXC6>kfdne7Vx-sBuv(X@67<<@sBM=ewh=-S@r{MdeK+u)JALXvMS} zwv|tvpJWS3Rc7waoqIu~PqRF3=X?@zm2zW3=BO$QTD^$rQfAv%9GH7E?LjWRZ*=Og z=P{iyC7;Oi{PbWSpgtG^#xJct6K|@7tG*gSq{v34(-yqJ_hS$p1Lt;@ znvj+4#0)8j`;ErnHcCi2VjFu&wdJzq-y8a;*mbrr}`oFjP&^zjm=$C;3b^FnhM_67;5U^Od)f9Y<)# z4`{|GIbTfWeq)K$ZcO1cuC)Ry>p2!l>fcWB%MWIHf!ml35COwl>guNZ_WbF}4IKnC z&Pol1@I;&@6{qBgV6!)PWeXp5k?dSt%4Tg*6~bi<)E*z7p7lR%@jP zzoCJg&C3_`hEd@^dov<1GanH_#O2CAktPW(uFm)ON3nwAHJ&f{IKa+E;N3VId&h zGnp3jnR^7kN{KAXpRV7)nrd+}1c1t3BC`mrco1w!9T(Os!;rs+4D5s0BqQ_Jxbk23 zxyE<#Wpv<&Md#|F$;nr!OAlj@kG-KcrBYB!T&QekC<@P%B36P~W9GXn?99do_9tt7 z{vlk+I0eo&T%EnH+tJ%y8Xek@m&g9AwRIedZD^mDHyXrh1e8UCPRShF<;9d0=z#Ni zpKo-LdUdDNw4wo&sKPsIa9SK1v?wv2tF$OSIn3_6VzPEIVnC(3kkfz8>SUs&4cV=% zZi*u`^R}YBYEb>LoP`z9sn?sF=>xY?c8e}sekkZ~p(uQetr%cWVVx$LJHL)~^!7SV z(d*{n?^@dIJP-4|(q8WptM#hj+kse-YqxCSNgoGl|G-MEBt=2S-Tw=@-G}@~r-gq_ z1B7?6)o)QPKKHSW%b7WHQDv}X_@vL`ND1$lZb{VEe1W7hkAx6N@?TU+C@SbGual7} z(C7DZrf&?wYY@Lm$Iw#mmAL6!Ioi-HZ76W&66NfzFK*OBUT9P;|HOh4X0t`_%*@JV zwzrt%r&&qB<{90tn|D$dEB2Cj^`(H|dC?t~iP{anXX%#loKl{;@L`9+qRT?lE^a@$ zyi7t_jifXN<*OuIOqS>7CST-@j8Iv-RBQ0|QizjFEZ;}cCix(4S{%*vtHWYl(w=7u z_4F4Qe}x~eE^-#_Nbn24NYw5fURMNhxwh`Gg?&oBicJblL?RnH#qs!|YMOJkvpn6w zPkJHFlT)9m6>lGkWa5CV1eYcx}v9tTrDdkgk3|fbCMCyDXNRzriAKO#bo!3ox`MvHb0I1=Vslf@19%g7BE>^k~~++N7z444UT5IvV&R z&{PuYpGMXA;?H%eYc35IPxPvPyEO2{BEO%FbyLDdZ3u2e(o-2VVO3k;sEWO@BNZhl zk2NKUMAFQZdqhSU8RR?LIY<}|`d9W%6yHU$T3|SdZ&2E{2t#)yL={{ls(RBFri2EH zQ&GMu$`vvJy{mK6HF|J&?*k3KuXx9nus@b5m zxg}L>b|~20zANQ8Z73kv1Do8MTz=w`K5vSnA8BhViF%plE9F&9LyZ!hG{~+{jyj=P*_oOfvXtU4sXZ8)Oo2{8)vDS23Tu;ae3CD$|0YI>;T14OfF$!4Yde!G@Q#;U z-8gx==HVHZuqvVPHQ)_*}Od{!WLITC+{St{Lq70AL+vY$%k@U61=^Q z>h|aQ+!3yg?eD7{-OplM_SBxD^Zb060m~D+ij!4*Z`;%Z-b-sgTw2=>N)*()Yknn3dlW4# zN@iNXi8Up?qpCQ6?D`NFJ*z^wdH2vURDfW#l$FAVTl2SsUc{F})Nk4)^Kn0H4UgmF zoY{V)eCwmsy_OpsufkVp#zyG1<6vKU=%Ms4&hka>yf)8%_v9~E`UP?nfR@+JvV~e- zTeUN8c1&pPRO9QLgf7`m2l?@(>ss;pV(o=qh! zVeH~Btaii~pT`<-wvW&()V#6!hG3M*KU@hNg5!+*`RjJO(jG?ZNKmMp*8_5FD#H;j{sjYA+67euo@#tLBky1TozEWh~wHgTzioS{t@c{sFihy~q(} zziq$xcyr_zDznG>H;m99!DGs@VY5FeiP#qPwK}r0WA+p)Z)epVxS%-VHI?Un_szyd z0KhwtNNusA?Paf_M&_4A61!D-!wF)i7&O5v` zJXreO7J;)`Wu`_K6bK=f7*AT*XH?hIAUPMQkQtEua|flnUm(#>*7NXPws&xB-H4f5l~!+*C`ARUa7*PxM(^+q{no$&|kg_0A}*5MFVc+%HY9^ zi>nmoT8W9Imik{>R!OL}v-b zn&QvU1>rOYZ$xWW?=}9E&DIlLzDlWQ)@#>uP?vMwEpVg=~1#=mU zppej5*4Es3{Ug*X@h22PqjD@(DaBlk9%T-vJ)Zkgm8*|V_zj)so(uUH!HBHhM0I2y zF1QG8CnfJO%+PWyizh>D;%H$X36|b#!%*A4J&4h>E1DH>r5*y8D`DiA9NUE_X0b`5 zG0X3@P%q*++2GMp!*@QG<;Y*xatpwOG) zJk&e)+2#BZVzihc-Mxr)dAexg%)-eoI(+A#e4PO*G4pP_X1zA;;7(39F>tAgZ+T>h zHP{FAP*YHt{nfXE@j1aAHE~-9K4Zf3YV6o$z%OjWy%+i=_KW`~0y#DVNdMpRd~h)K z8Z~1yuQuQn5)qkz;ylZ4XqcwIYtkdTxLgRSuJ}}Rdv}?gZESp1%-%>qp={TReUOGp z2b@hggoTH(_wS&8gJo7~P`nghpeg^8E>PTMf>(zNlO=KcC=YLAGvZ)bopIjsPu1@B zlen^g&mhS#n}q(|alk)#!J$2b#DcB_>~cfCF+~Ps#a|vnlj*4Nl~ z64Ac-5MI;^<(;pzg5A`to@o`c*|MVDC1d$P77ZqklunlRlb)6Iz2ug}t}2|jcqjYB zIAo#wv$fMpZXp9tM(oa-eHNulpYIka#-qg0Ngl*|FH-Bx1mo=x^L`W$|alSU0<#&GwEHgC0I)jM8Es8AoJdKY#T5(R(;Re2F508unZUzVjWdf4-npj& zxhc?Q=l}^N!1yO;5m-BQI?Fk<7e7%!Y!)uP{dNET+V}9&z5CQ@B@r1uZC7}rl^J3( zg~{YhYvj=tZL!&~t>VvHPkX8}D@J7B)b4u~RjXP~temt6BcuzKu89 z{900xfw70J>n&;E$pv1Tr`U)nvbV^Pr++?T-b(Jxb00mWM33%x1qzpo21I2f3ShG_ zs&7&AFNFh_O0xgD5)z%6??He zaf=`7Y$01=v7GiL9%TGwcgX)$XOr;{9MZrpW_AQ|(_K-fvI=8>p~rc!)@JhAe{x*E z$+-3HHjgr2Z2X4$+C4>jac!G4$MCN+ROR+7lk1SVT{y%++UCy_(WHb~y0zz-arNrs z=;}TP?(mDbc*_h#GHGT4;dSyBXV>iFYG=d1v{RRFDVBd4$9R@oExgei6l(k8^2_YG zRrO_F)oH$PSzZd2?{LVLZ6+P*!_RJGa@l%jd{%Sng_AaH#Pu$&r(L&`yrzD}Wqh%m zdQdp_sIHF;0t^Ho-Mi|Ke_1B@0Yzp`&?PJVQs<)d%q|_2Q8%XZvxrC3(j?%QQhui? 
zEm3bjZ;_tos(QYCEn6S(PG%Q#57j{>qLXW7;OkqAuLtaF&c4Xmy~;O$=Nj==d8i9* z2&2|o)1@^;RIJ0PA>a2bVn6WVeT7zSLM~q%LGNKbN9EWlxUoL&(1-BjC2RCab;qfE z4*AB1rkIC1x*AqCXX`dJefR=YLq6z%E*>2Cl} z)3fJ`S$KY2QDvi)R54Fh{CZ@8v-9Al+Y@i}uWs(>n*r<>v3y`Ll>WQkaFh_zajE>> z(b1gtrM}lWrgw#hpM_utCWCl?eQc2ak`+TNn2x!XdK@1~?y$cz1n4utS1R9SnDjEt zR7zxIu3(>dE7!$(NhVmJo;u+UzF27b1uKTkSF9h54#ijQ4P3m?bXsgi8D>CTDTdKUe0a228?KZ=U~#WOAl-$t*2CMo;$i6*dV&?rDLL7z zLuqoHzM!S@AFGKG__Frca;<3m!x%@g&M6-zaZ}q=Qpovn*ep4zR?4uy&b|URr;?ZQ zR5(s|DERpUiOMPI=(;Z?W~-`_m=N1uw4vd zpT`aeUe;eJyMP!OSp%*BX!?MnfyqhkstJ2v{N|9Owwkd#Eo^2|m97vqN$p6oraHM;{lttrn4?rfL9-8W-hJLcRkLQTwr z@HA3qF&78IruQr{q+QsYc}4)46_5+J5X`>X4!MeV@w4o8O=3bzU}sZ?@ZmJI0orEm zC`W;lwjU!ei~ejIwwpC1kv91=V5^mlJgRL4?`jh`JG~4?{-fUnmx7B@xY`}k0v%ca zoXzdq6vl_m4f2z@=o8|DqV)?DF4hyU0{*O{k_EK9&2fsJM~oLz4}^w?hlgj{Ub+k1 zIbU|W4vE~oP%XgVtz%gS@ELD{Wwf5SBr;XwR(9~W=z!TlaHq5;tIG(v_|c>Z??P^Y zku3Rxs~b$yj4y{*CP?1?Pg8h=w2zs!Z{4Ga@KQmB3Rnx&+_*WaXSAZJzg|0s$Jn~yK@NjXu zw8igxO9lV3=07=BUiJ6)54S#eO2`&W9Kb;FKu}n?W5azy;h*kD-3O&V56tN%R0t;i zaWz4{m(@ID?n1zTdqs<8pvxoxQ>ixikt0Eg0C?Uj5eO``{WE23ra{;Yk&Dq-85n?* z@BtgeVa=c16is?jfy@GbWMa2v0%;R9=V8<(tSx5+uYrNdc=n(Ax8@kwN$=qpfz9Wi s#K_2f_D}Jk1=zT^f6{-e&y1)33)w+ka^ch=*zPGlc_vpRYZCN70JI + +

+
+### Indexing data
+
+
+The first step is processing and indexing the corpus document(s). To do so, set the path to the embedder checkpoint, the corpus document(s), the index saving directory, and the relevant arguments, then run the following command. Below, we explain in more detail the steps run within the script.
+
+
+```
+python examples/nlp/rag/rag_indexing.py \
+    trainer.devices=1 \
+    trainer.precision='bf16-mixed' \
+    indexing.embedder.model_path='/path/to/checkpoints/embedder_model.nemo' \
+    indexing.embedder.embed_batch_size=128 \
+    indexing.data.data_path='/path/to/data' \
+    indexing.data.chunk_size=256 \
+    indexing.data.chunk_overlap=10 \
+    indexing.index_path='/path/to/index'
+```
+
+Inside the script, the following steps are run.
+
+First, the document is read into LlamaIndex's `SimpleDirectoryReader` object.
+
+```
+print("Loading documents.")
+documents = SimpleDirectoryReader(cfg.indexing.data.data_path).load_data()
+```
+
+We then set up how the corpus document(s) will be split into smaller chunks by setting the splitter type, chunk size, and chunk overlap values.
+
+```
+print("Setting text transformation.")
+Settings.text_splitter = SentenceSplitter()
+Settings.chunk_size = cfg.indexing.data.chunk_size
+Settings.chunk_overlap = cfg.indexing.data.chunk_overlap
+```
+
+We then load the trained NeMo embedder model. Currently, this script only supports `.nemo` checkpoints. The wrapper that adapts the NeMo embedder to the LlamaIndex interface is implemented at `nemo/collections/nlp/models/rag/custom_embedder.py`. The embedding batch size can be adjusted to trade off the number of samples embedded at once against embedding speed.
+
+```
+print("Loading embedding models.")
+model_path = cfg.indexing.embedder.model_path
+embed_batch_size = cfg.indexing.embedder.embed_batch_size
+embed_model = NeMoEmbeddings(model_path = model_path, cfg = cfg, embed_batch_size = embed_batch_size)
+Settings.embed_model = embed_model
+```
+
+Next, we index the corpus document(s) using the LlamaIndex `VectorStoreIndex.from_documents()` method. Under the hood, this method splits the corpus document(s) into smaller chunks of the pre-defined chunk size, batches them and feeds them to the embedder, then puts the output embeddings into an index. In this example, we use LlamaIndex's built-in in-memory vector store to save the index. External vector stores, such as Milvus or Qdrant, can also be used. See more at [LlamaIndex Vector Stores](https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores/).
+
+
+```
+print("Indexing data.")
+index = VectorStoreIndex.from_documents(documents, show_progress=True)
+```
+
+After indexing, we save the index to disk so that it can later be loaded and used with an LLM.
+
+```
+print("Saving index to disk.")
+index_path = cfg.indexing.index_path
+index.storage_context.persist(persist_dir=index_path)
+```
+
+
+### Generation
+
+After processing and indexing the document, a NeMo LLM can interact with the corpus document(s) through RAG, for example to answer questions about details within the documents. To do so, set the paths to the LLM checkpoint and the saved index, provide a query to ask, and run the following command. Below, we explain in more detail the steps run within the script.
+
+```
+python examples/nlp/rag/rag_eval.py \
+    trainer.devices=1 \
+    trainer.precision='bf16-mixed' \
+    indexing.embedder.model_path='/path/to/checkpoints/embedder_model.nemo' \
+    indexing.index_path='/path/to/index' \
+    generating.llm.model_path='/path/to/checkpoints/llm_model.nemo' \
+    generating.inference.greedy=False \
+    generating.inference.temperature=1.0 \
+    generating.query='Which art schools did I apply to?'
+```
+
+Inside the script, the following steps are run.
+
+
+First, the LLM is loaded from `generating.llm.model_path`. Currently, the script only works with `.nemo` checkpoints. The wrapper that adapts the NeMo LLM to the LlamaIndex interface is implemented at `nemo/collections/nlp/models/rag/custom_llm.py`.
+
+```
+print("Loading LLM.")
+model_path = cfg.generating.llm.model_path
+Settings.llm = NeMoLLM(model_path = model_path, cfg = cfg)
+```
+
+Then we load the index that was saved to disk in the previous indexing step. If a Milvus database is used, it can also be loaded at this step.
+```
+print("Loading index from disk.")
+index_path = cfg.indexing.index_path
+storage_context = StorageContext.from_defaults(persist_dir=index_path)
+index = load_index_from_storage(storage_context)
+```
+
+Finally, we retrieve the relevant contexts and generate an answer for the query using LlamaIndex's `query_engine.query()` method. Under the hood, this method automatically embeds the query with the defined embedder, retrieves the k most relevant contexts from the index, and adds those contexts to a predefined template along with the query before feeding them to the LLM for generation. The number of relevant contexts to retrieve can be set via the `similarity_top_k` argument.
+```
+print("Responding to query using relevant contexts.")
+query_engine = index.as_query_engine(similarity_top_k=3)
+response = query_engine.query(query)
+print(response)
+```
+
+Below is an example of the default LlamaIndex template used to feed a query and the relevant contexts to the LLM. This template can be modified by following LlamaIndex's documentation: [Prompts RAG](https://docs.llamaindex.ai/en/stable/examples/prompts/prompts_rag/).
+
+
+```
+Context information is below.
+---------------------
+{context_str 1}
+{context_str 2}
+...
+---------------------
+Given the context information and not prior knowledge, answer the query.
+Query: {query_str}
+Answer:
+```
\ No newline at end of file
diff --git a/examples/nlp/rag/rag_generating.py b/examples/nlp/rag/rag_generating.py
new file mode 100644
index 000000000000..952dc2532102
--- /dev/null
+++ b/examples/nlp/rag/rag_generating.py
@@ -0,0 +1,49 @@
+from llama_index.core import Settings, StorageContext, load_index_from_storage
+
+from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings
+from nemo.collections.nlp.models.rag.custom_gpt_llm import NeMoGPTLLM
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+
+
+@hydra_runner(config_path="conf", config_name="rag_generating")
+def main(cfg) -> None:
+
+    # load LLM
+    logging.info("Loading LLM.")
+    model_path = cfg.generating.llm.model_path
+    if cfg.generating.llm.model_type == "gpt":
+        Settings.llm = NeMoGPTLLM(model_path=model_path, cfg=cfg)
+    else:
+        assert cfg.generating.model_type in ["gpt"], "Currently RAG pipeline supports 'gpt' for LLM models."
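+    # Settings.llm registers the NeMo GPT wrapper as LlamaIndex's global LLM, so the query
+    # engine built below uses it for generation; only 'gpt'-type .nemo checkpoints are supported here.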
+ + # load embedder + logging.info("Loading embedder.") + model_path = cfg.indexing.embedder.model_path + if cfg.indexing.embedder.model_type == "bert": + embed_model = NeMoBertEmbeddings(model_path=model_path, cfg=cfg) + else: + assert cfg.indexing.model_type in ["bert"], "Currently RAG pipeline supports 'bert' for embeddings models." + embed_model = None + Settings.embed_model = embed_model + + # load index from disk + logging.info("Loading index from disk.") + index_path = cfg.indexing.index_path + storage_context = StorageContext.from_defaults(persist_dir=index_path) + index = load_index_from_storage(storage_context) + + # set query + logging.info("Setting query.") + query = cfg.generating.query + logging.info("Query: ", query) + + # query and print response + logging.info("Responding to query using relevant contexts.") + query_engine = index.as_query_engine(similarity_top_k=3) + response = query_engine.query(query) + logging.info(response) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/rag/rag_indexing.py b/examples/nlp/rag/rag_indexing.py new file mode 100644 index 000000000000..ab487c035228 --- /dev/null +++ b/examples/nlp/rag/rag_indexing.py @@ -0,0 +1,44 @@ +from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex +from llama_index.core.node_parser import SentenceSplitter + +from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@hydra_runner(config_path="conf", config_name="rag_indexing") +def main(cfg) -> None: + + # load data + logging.info("Loading documents.") + documents = SimpleDirectoryReader(cfg.indexing.data.data_path).load_data() + + # set text transformation + logging.info("Setting text transformation.") + Settings.text_splitter = SentenceSplitter() + Settings.chunk_size = cfg.indexing.data.chunk_size + Settings.chunk_overlap = cfg.indexing.data.chunk_overlap + + # load embedder + logging.info("Loading embedding models.") + model_path = cfg.indexing.embedder.model_path + embed_batch_size = cfg.indexing.embedder.embed_batch_size + if cfg.indexing.embedder.model_type == "bert": + embed_model = NeMoBertEmbeddings(model_path=model_path, cfg=cfg, embed_batch_size=embed_batch_size) + else: + assert cfg.indexing.model_type in ["bert"], "Currently RAG pipeline supports 'bert' for embeddings models." + embed_model = None + Settings.embed_model = embed_model + + # index data + logging.info("Indexing data.") + index = VectorStoreIndex.from_documents(documents, show_progress=True) + + # save index data to disk + logging.info("Saving index to disk.") + index_path = cfg.indexing.index_path + index.storage_context.persist(persist_dir=index_path) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/rag/__init__.py b/nemo/collections/nlp/models/rag/__init__.py new file mode 100644 index 000000000000..15434bc2e603 --- /dev/null +++ b/nemo/collections/nlp/models/rag/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings +from nemo.collections.nlp.models.rag.custom_gpt_llm import NeMoGPTLLM diff --git a/nemo/collections/nlp/models/rag/custom_bert_embedder.py b/nemo/collections/nlp/models/rag/custom_bert_embedder.py new file mode 100644 index 000000000000..e2f26fadf247 --- /dev/null +++ b/nemo/collections/nlp/models/rag/custom_bert_embedder.py @@ -0,0 +1,145 @@ +from typing import Any, List + +import torch +from llama_index.core.bridge.pydantic import PrivateAttr +from llama_index.core.embeddings import BaseEmbedding +from omegaconf import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + + +class NeMoBertEmbeddings(BaseEmbedding): + _model: MegatronBertEmbeddingModel = PrivateAttr() + _model_cfg: DictConfig = PrivateAttr() + + def __init__( + self, + model_path: str = None, + cfg: Any = None, + embed_batch_size: int = 16, + **kwargs: Any, + ) -> None: + + # set up trainer + trainer_config = { + "devices": cfg.trainer.devices, + "num_nodes": 1, + "accelerator": "gpu", + "logger": False, + "precision": cfg.trainer.precision, + } + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) + + # setup/override model config + model_cfg = MegatronBertEmbeddingModel.restore_from( + restore_path=model_path, trainer=trainer, return_config=True + ) + model_cfg.micro_batch_size = 1 + model_cfg.global_batch_size = cfg.trainer.devices + self._model_cfg = model_cfg + print("self._model_cfg: ", self._model_cfg) + + # restore model + model = MegatronBertEmbeddingModel.restore_from( + restore_path=model_path, trainer=trainer, override_config_path=model_cfg, strict=True + ) + model.freeze() + self._model = model + + super().__init__( + embed_batch_size=embed_batch_size, + **kwargs, + ) + + @classmethod + def class_name(cls) -> str: + return "nemo_bert_embeddings" + + async def _aget_query_embedding(self, query: str) -> List[float]: + return self._get_query_embedding(query) + + async def _aget_text_embedding(self, text: str) -> List[float]: + return self._get_text_embedding(text) + + def _construct_forward_input(self, texts: List[str]): + # this method construct model's forward input arguments from texts, following the constructing step in nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py + + # retrieve arguments from model_config + max_seq_length = self._model_cfg.encoder_seq_length + + # tokenize text + input_ids = [self._model.tokenizer.text_to_ids(text) for text in texts] + + # truncate input_ids + input_ids = [item[: (max_seq_length - 1)] for item in input_ids] + + # add bos and eos + input_ids = [([self._model.tokenizer.bos_id] + item + [self._model.tokenizer.eos_id]) for item in input_ids] + + # pad input_ids + def _ceil_to_nearest(n, m): + return (n + m - 1) // m * m + + lengths = [len(item) for item in input_ids] + max_length = min(max_seq_length, _ceil_to_nearest(max(lengths), 16)) + assert max_length <= max_seq_length + input_ids = [item + [self._model.tokenizer.pad_id] * (max_length - len(item)) for item in input_ids] + input_ids = torch.LongTensor(input_ids) + + # construct attention_mask + def _create_attention_mask2(max_length, item_lengh): + """Create `attention_mask`. 
+ Args: + input_ids: A 1D tensor that holds the indices of tokens. + """ + # seq_length = len(input_ids) + # `attention_mask` has the shape of [1, seq_length, seq_length] + attention_mask = torch.zeros(max_length) + attention_mask[:item_lengh] = 1 + return attention_mask + + attention_mask = [_create_attention_mask2(max_length, len) for len in lengths] + attention_mask = torch.stack(attention_mask) + + # construct token_type_ids + token_type_ids = torch.zeros_like(input_ids) + + processed_batch = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + } + + return processed_batch + + def _get_query_embedding(self, query: str) -> List[float]: + constructed_forward_input = self._construct_forward_input([query]) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings[0].tolist() + + def _get_text_embedding(self, text: str) -> List[float]: + constructed_forward_input = self._construct_forward_input([text]) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings[0].tolist() + + def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: + constructed_forward_input = self._construct_forward_input(texts) + for key in constructed_forward_input.keys(): + constructed_forward_input[key] = constructed_forward_input[key].to(self._model.device) + + embeddings = self._model.forward(**constructed_forward_input) + embeddings = embeddings.transpose(0, 1) # reshape tensor shape [hidden_dim, bs] to [bs, hidden_dim] + + return embeddings.tolist() diff --git a/nemo/collections/nlp/models/rag/custom_gpt_llm.py b/nemo/collections/nlp/models/rag/custom_gpt_llm.py new file mode 100644 index 000000000000..bcd52b3f9b16 --- /dev/null +++ b/nemo/collections/nlp/models/rag/custom_gpt_llm.py @@ -0,0 +1,130 @@ +from typing import Any + +from llama_index.core.bridge.pydantic import PrivateAttr +from llama_index.core.llms import CompletionResponse, CompletionResponseGen, CustomLLM, LLMMetadata +from llama_index.core.llms.callbacks import llm_completion_callback +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + + +class NeMoGPTLLM(CustomLLM): + context_window: int = 2048 + num_output: int = 256 + model_name: str = "NeMo LLM" + dummy_response: str = "My response" + + length_params: LengthParam = { + "max_length": 500, + "min_length": 0, + } + + sampling_params: SamplingParam = { + "use_greedy": True, + "temperature": 1.0, + "top_k": 0, + "top_p": 1.0, + "repetition_penalty": 1.0, + "add_BOS": True, + "all_probs": False, + "compute_logprob": False, + "end_strings": ["<|endoftext|>"], + } + + _model: Any = PrivateAttr() + _model_cfg: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + + def __init__( + self, + model_path: str = None, + cfg: Any = None, + **kwargs: 
Any, + ) -> None: + + # set up trainer + trainer_config = { + "devices": cfg.trainer.devices, + "num_nodes": 1, + "accelerator": "gpu", + "logger": False, + "precision": cfg.trainer.precision, + } + + tensor_model_parallel_size = 1 + pipeline_model_parallel_size = 1 + + # trainer required for restoring model parallel models + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) + assert ( + trainer_config["devices"] * trainer_config['num_nodes'] + == tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + # setup/override model config + model_cfg = MegatronGPTModel.restore_from(restore_path=model_path, trainer=trainer, return_config=True) + model_cfg.micro_batch_size = 1 + model_cfg.global_batch_size = cfg.trainer.devices + self._model_cfg = model_cfg + print("self._model_cfg: ", self._model_cfg) + + # restore model + model = MegatronGPTModel.restore_from( + restore_path=model_path, trainer=trainer, override_config_path=model_cfg, strict=True + ) + model.freeze() + self._model = model + super().__init__(**kwargs) + + # update LLM metadata + self.context_window = self._model_cfg.encoder_seq_length + + # update inference params + length_params: LengthParam = { + "max_length": cfg.generating.inference.tokens_to_generate, + "min_length": cfg.generating.inference.min_tokens_to_generate, + } + + sampling_params: SamplingParam = { + "use_greedy": cfg.generating.inference.greedy, + "temperature": cfg.generating.inference.temperature, + "top_k": cfg.generating.inference.top_k, + "top_p": cfg.generating.inference.top_p, + "repetition_penalty": cfg.generating.inference.repetition_penalty, + "add_BOS": cfg.generating.inference.add_BOS, + "all_probs": cfg.generating.inference.all_probs, + "compute_logprob": cfg.generating.inference.compute_logprob, + "end_strings": cfg.generating.inference.end_strings, + } + + @property + def metadata(self) -> LLMMetadata: + """Get LLM metadata.""" + return LLMMetadata( + context_window=self.context_window, + num_output=self.num_output, + model_name=self.model_name, + ) + + @llm_completion_callback() + def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: + llm_response = self._model.generate( + inputs=[prompt], length_params=self.length_params, sampling_params=self.sampling_params + ) + text_response = llm_response['sentences'][0] + + return CompletionResponse(text=text_response) + + @llm_completion_callback() + def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + llm_response = self._model.generate( + inputs=[prompt], length_params=self.length_params, sampling_params=self.sampling_params + ) + text_response = llm_response['sentences'][0] + + response = "" + for token in text_response: + response += token + yield CompletionResponse(text=response, delta=token) From 9ca10104777f14f79e626eacb0381a59e25b896e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 10:03:37 -0600 Subject: [PATCH 09/47] Pin transformers (#9261) (#9273) * update branch * pin --------- Signed-off-by: eharper Co-authored-by: Eric Harper --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 5ad2519cfd1a..cf996584da23 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 
+4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>=2.2.1 torchmetrics>=0.11.0 -transformers>=4.36.0 +transformers>=4.36.0,<=4.40.2 wandb webdataset>=0.2.86 From 286d38704dc934cdbbb37fa3b026d04e547ba71c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 22 May 2024 11:45:37 -0700 Subject: [PATCH 10/47] Mcore dist opt ckpt fix (#9156) * Mcore dist opt ckpt fix Signed-off-by: Alexandros Koumparoulis * pass dp_zero_gather_scatter to starded-state-dict Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * introduce dist_ckpt_parallel_save option Signed-off-by: Alexandros Koumparoulis * determine sharding type from dist_ckpt_parallel_save Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * read model.disk_ckpt_parallel_save from cfg and pass it to mcore dist ckpt Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Pass is_loading to mcore_optim.py's sharded_state_dict Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Update nemo/core/optim/mcore_optim.py Co-authored-by: mikolajblaz Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: mikolajblaz --- .../conf/megatron_gpt_config.yaml | 1 + .../nlp/parts/megatron_trainer_builder.py | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 17 +++++++++++++---- nemo/core/optim/mcore_optim.py | 11 +++++++++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 269aa8f55153..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -154,6 +154,7 @@ model: # Distributed checkpoint setup dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. 
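For reference, the new option added above can be toggled alongside the existing distributed-checkpoint settings from the command line. The command below is only an illustrative sketch, assuming the stock `megatron_gpt_pretraining.py` example script and the config file patched above; it is not part of this change:

    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        model.dist_ckpt_format=torch_dist \
        model.dist_ckpt_parallel_save=True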
diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 03cf5fb755bd..f6336f6bcc71 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -90,6 +90,7 @@ def _training_strategy(self) -> Union[NLPDDPStrategy, NLPFSDPStrategy]: find_unused_parameters=False, nccl_communicator_config_path=self.cfg.model.get('nccl_communicator_config_path', None), sharp=self.cfg.model.get('sharp', False), + dist_ckpt_parallel_save=self.cfg.model.get('dist_ckpt_parallel_save', False), ) def _grad_scaler(self) -> GradScaler: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e8f7009b791c..79937c265b09 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -78,6 +78,7 @@ from apex.transformer.pipeline_parallel.utils import get_num_microbatches from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam + from nemo.core.optim.mcore_optim import McoreDistributedOptimizer HAVE_APEX = True @@ -183,6 +184,7 @@ def __init__( no_ddp_communication_hook: bool = False, nccl_communicator_config_path: Optional[str] = None, sharp: bool = False, + dist_ckpt_parallel_save: bool = False, **kwargs: Union[Any, Dict[str, Any]], ) -> None: if not HAVE_APEX: @@ -199,6 +201,7 @@ def __init__( self.no_ddp_communication_hook = no_ddp_communication_hook self.nccl_communicator_config_path = nccl_communicator_config_path self.sharp = sharp + self._dist_ckpt_parallel_save = dist_ckpt_parallel_save def setup(self, trainer: "pl.Trainer") -> None: """ @@ -276,7 +279,7 @@ def configure_ddp(self): else: super().configure_ddp() - def optimizer_sharded_state_dict(self, unsharded_optim_state=None): + def optimizer_sharded_state_dict(self, unsharded_optim_state=None, is_loading=False): """ Sharded state dictionary for an MainParamsOptimizerWrapper. Used to save and load the optimizer state when training with distributed_checkpoint. @@ -294,8 +297,14 @@ def optimizer_sharded_state_dict(self, unsharded_optim_state=None): model_sharded_state_dict = { key: value for key, value in model_sharded_state_dict.items() if not key.endswith('_extra_state') } - - if isinstance(optimizer, MegatronDistributedFusedAdam): + if isinstance(optimizer, McoreDistributedOptimizer): + return optimizer.sharded_state_dict( + model_sharded_state_dict, + unsharded_optim_state, + is_loading=is_loading, + dist_ckpt_parallel_save=self._dist_ckpt_parallel_save, + ) + elif isinstance(optimizer, MegatronDistributedFusedAdam): return optimizer.sharded_state_dict(model_sharded_state_dict, unsharded_optim_state) elif not isinstance(optimizer, MainParamsOptimizerWrapper): # Regular optimizer, e.g. 
Adam or FusedAdam @@ -501,7 +510,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: # after dist_checkpointing.load, sharded tensors will be replaced with tensors checkpoint['state_dict'] = sharded_state_dict - checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] + checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict(is_loading=True)] if self._check_param_groups_mismatch(checkpoint_path, checkpoint): return self._fix_param_groups(checkpoint_path, checkpoint) diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py index 0d4b524049ca..234680f49249 100644 --- a/nemo/core/optim/mcore_optim.py +++ b/nemo/core/optim/mcore_optim.py @@ -55,8 +55,15 @@ def state_dict(self): def load_state_dict(self, state_dict): self.mcore_optimizer.load_state_dict(state_dict) - def sharded_state_dict(self, model_sharded_state_dict, is_loading: bool = False, **kwargs): - return self.mcore_optimizer.sharded_state_dict(model_sharded_state_dict, is_loading, **kwargs) + def sharded_state_dict( + self, model_sharded_state_dict, optimizer_state_dict=None, is_loading=False, dist_ckpt_parallel_save=False + ): + # TODO(@akoumparouli, @mikolajblaz): switch to sharding_type once support for fully_sharded_model_space merged in mcore. + # sharding_type = 'fully_sharded_model_space' if dist_ckpt_parallel_save else 'dp_zero_gather_scatter' + sharding_type = 'dp_zero_gather_scatter' + return self.mcore_optimizer.sharded_state_dict( + model_sharded_state_dict, is_loading=is_loading, sharding_type=sharding_type + ) def step(self, closure): """Clip gradients (if needed) and step the base optimizer. From 52364c1fe1d614d3a11e60fd68a428e1377e3d8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 15:29:50 -0400 Subject: [PATCH 11/47] Fix loading github raw images on notebook (#9282) (#9283) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao --- tutorials/asr/ASR_TTS_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb index 067c007ea3df..709f96d14ba5 100644 --- a/tutorials/asr/ASR_TTS_Tutorial.ipynb +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -38,7 +38,7 @@ "### Architecture\n", "\n", "\"ASR-TTS\n", "\n", From 0e744c9300ca99060696b3536978ff5629312071 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 13:51:38 -0700 Subject: [PATCH 12/47] Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward (#9278) * Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward (#9246) * Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward GreedyCTCInfer::forward already allowed for this, so they did not implement the exact same interface. Now, they do. Also warn about not passing in the decoder_lengths argument. It is likely an error on the user's part not to pass it in explicitly. Signed-off-by: Daniel Galvez * Apply isort and black reformatting Signed-off-by: titu1994 * Log warning only once for sanity. 
Signed-off-by: Daniel Galvez --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Co-authored-by: titu1994 Co-authored-by: Somshubra Majumdar * Apply isort and black reformatting Signed-off-by: nithinraok --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Signed-off-by: nithinraok Co-authored-by: Daniel Galvez Co-authored-by: titu1994 Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Co-authored-by: nithinraok --- .../parts/submodules/ctc_greedy_decoding.py | 50 +++++++++++++------ .../asr/decoding/test_ctc_decoding.py | 22 ++++++-- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 1ef26cd7adf3..c4e9a14f6e1d 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -25,7 +25,10 @@ from nemo.utils import logging -def pack_hypotheses(hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor,) -> List[rnnt_utils.Hypothesis]: +def pack_hypotheses( + hypotheses: List[rnnt_utils.Hypothesis], + logitlen: torch.Tensor, +) -> List[rnnt_utils.Hypothesis]: if logitlen is not None: if hasattr(logitlen, 'cpu'): @@ -55,6 +58,9 @@ def _states_to_device(dec_state, device='cpu'): return dec_state +_DECODER_LENGTHS_NONE_WARNING = "Passing in decoder_lengths=None for CTC decoding is likely to be an error, since it is unlikely that each element of your batch has exactly the same length. decoder_lengths will default to decoder_output.shape[0]." + + class GreedyCTCInfer(Typing, ConfidenceMethodMixin): """A greedy CTC decoder. @@ -108,8 +114,7 @@ class GreedyCTCInfer(Typing, ConfidenceMethodMixin): @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" # Input can be of dimension - # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] @@ -120,8 +125,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return {"predictions": [NeuralType(elements_type=HypothesisType())]} def __init__( @@ -145,7 +149,9 @@ def __init__( @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: Optional[torch.Tensor], ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -158,6 +164,15 @@ def forward( Returns: packed list containing batch number of sentences (Hypotheses). """ + + logging.warning( + "CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. 
Consider changing your strategy to 'greedy_batch' for a free performance improvement.", + mode=logging_mode.ONCE, + ) + + if decoder_lengths is None: + logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) + with torch.inference_mode(): hypotheses = [] # Process each sequence independently @@ -204,7 +219,7 @@ def forward( return (packed_result,) @torch.no_grad() - def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): + def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tensor]): # x: [T, D] # out_len: [seq_len] @@ -234,7 +249,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): return hypothesis @torch.no_grad() - def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor): + def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor]): # x: [T] # out_len: [seq_len] @@ -324,8 +339,7 @@ class GreedyBatchedCTCInfer(Typing, ConfidenceMethodMixin): @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" # Input can be of dimension - # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] @@ -336,8 +350,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return {"predictions": [NeuralType(elements_type=HypothesisType())]} def __init__( @@ -361,7 +374,9 @@ def __init__( @typecheck() def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, + self, + decoder_output: torch.Tensor, + decoder_lengths: Optional[torch.Tensor], ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. Output token is generated auto-repressively. @@ -374,11 +389,18 @@ def forward( Returns: packed list containing batch number of sentences (Hypotheses). 
""" + + input_decoder_lengths = decoder_lengths + + if decoder_lengths is None: + logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) + decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) + if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) else: hypotheses = self._greedy_decode_logprobs_batched(decoder_output, decoder_lengths) - packed_result = pack_hypotheses(hypotheses, decoder_lengths) + packed_result = pack_hypotheses(hypotheses, input_decoder_lengths) return (packed_result,) @torch.no_grad() diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 8eceb822fd38..a42d61f051ad 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -90,7 +90,9 @@ def test_constructor_subword(self, tmp_tokenizer): assert decoding is not None @pytest.mark.unit - def test_char_decoding_greedy_forward(self,): + def test_char_decoding_greedy_forward( + self, + ): cfg = CTCDecodingConfig(strategy='greedy') vocab = char_vocabulary() decoding = CTCDecoding(decoding_cfg=cfg, vocabulary=vocab) @@ -197,7 +199,10 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('alignments', [False, True]) @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) - def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence): + @pytest.mark.parametrize('length_is_none', [False, True]) + def test_batched_decoding_logprobs( + self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none + ): cfg = CTCBPEDecodingConfig( strategy='greedy', preserve_alignments=alignments, @@ -217,7 +222,10 @@ def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, # that we always handle at least a few blanks. input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 input_signal[:, 1, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 - length = torch.randint(low=1, high=T, size=[B]) + if length_is_none: + length = None + else: + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -240,7 +248,8 @@ def test_batched_decoding_logprobs(self, tmp_tokenizer, alignments, timestamps, @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps): + @pytest.mark.parametrize('length_is_none', [False, True]) + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -254,7 +263,10 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps): # at least a few blanks. 
input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size input_labels[:, 1] = unbatched_decoding.tokenizer.tokenizer.vocab_size - length = torch.randint(low=1, high=T, size=[B]) + if length_is_none: + length = None + else: + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From 0f2874b270f476405f11aeb09d38a709118c67b5 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Wed, 22 May 2024 20:10:25 -0500 Subject: [PATCH 13/47] Alit/bert convert fix (#9285) * fix extra state and post process * move to args * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Co-authored-by: JRD971000 --- .../convert_bert_hf_to_nemo.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 278f7b879b28..a81fd33f47a2 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -19,6 +19,7 @@ --input_name_or_path "thenlper/gte-large" \ --output_path /path/to/output/nemo/file.nemo \ --mcore True \ + --post_process False \ --precision 32 ``` """ @@ -62,6 +63,9 @@ def get_args(): help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--post_process", type=bool, default=False, required=False, help="Whether to have the postprocessing modules" + ) parser.add_argument( "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) @@ -81,6 +85,14 @@ def convert(args): trainer = MegatronTrainerBuilder(nemo_config).create_trainer() model = MegatronBertModel(nemo_config.model, trainer) + if not args.post_process: + model.model.lm_head, model.model.encoder.final_layernorm, model.model.binary_head, model.model.output_layer = ( + None, + None, + None, + None, + ) + nemo_state_dict = {} hf_config = hf_model.config.to_dict() hidden_size = hf_config["hidden_size"] @@ -184,6 +196,19 @@ def convert(args): nemo_state_dict[LayerNorm2_weight_base_name] = param_to_weights(LayerNorm2_weight) nemo_state_dict[LayerNorm2_bias_base_name] = param_to_weights(LayerNorm2_bias) + nemo_state_dict[f'model.encoder.layers.{l}.self_attention.linear_proj._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.self_attention.linear_proj._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.self_attention.linear_qkv._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.self_attention.linear_qkv._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.mlp.linear_fc1._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.mlp.linear_fc1._extra_state' + ] + nemo_state_dict[f'model.encoder.layers.{l}.mlp.linear_fc2._extra_state'] = model.state_dict()[ + f'model.encoder.layers.{l}.mlp.linear_fc2._extra_state' + ] + # Non-layer dependent keys word_embeddings_weight = hf_model.state_dict()['embeddings.word_embeddings.weight'] position_embeddings_weight = hf_model.state_dict()['embeddings.position_embeddings.weight'] From 9d6e4724edacb76a82767dcdd37963b7a55fe83e Mon Sep 17 
00:00:00 2001 From: mikolajblaz Date: Thu, 23 May 2024 18:06:12 +0200 Subject: [PATCH 14/47] Remove .nemo instead of renaming (#9281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove .nemo instead of renaming Signed-off-by: Mikołaj Błaż * add ignore_errors=True flag Signed-off-by: dimapihtar * Revert "Remove .nemo instead of renaming" This reverts commit b836410a2d369aeb231f00b651d9b0f22b355929. Signed-off-by: Mikołaj Błaż * Remove backup .nemo after success Signed-off-by: Mikołaj Błaż * Update tests Signed-off-by: Mikołaj Błaż * Backup .nemo imediately before save_to Signed-off-by: Mikołaj Błaż * Apply isort and black reformatting Signed-off-by: mikolajblaz * Fix CTC import Signed-off-by: Mikołaj Błaż --------- Signed-off-by: Mikołaj Błaż Signed-off-by: dimapihtar Signed-off-by: mikolajblaz Co-authored-by: dimapihtar --- .../parts/submodules/ctc_greedy_decoding.py | 2 +- nemo/utils/callbacks/nemo_model_checkpoint.py | 33 +++++++--- tests/core/test_exp_manager.py | 65 ++++++++++++++----- 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index c4e9a14f6e1d..a7f57c82279a 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -22,7 +22,7 @@ from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin from nemo.core.classes import Typing, typecheck from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType -from nemo.utils import logging +from nemo.utils import logging, logging_mode def pack_hypotheses( diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index 15e8a4e21f55..e1d1f2e94586 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -22,6 +22,8 @@ import pytorch_lightning import torch from _weakref import proxy + +from lightning_fabric.utilities.cloud_io import get_filesystem from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint, _is_local_file_protocol from pytorch_lightning.utilities import rank_zero_info @@ -198,7 +200,6 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: logging.warning(f'always_save_nemo will slow down training for model_parallel > 1.') # since we are creating tarfile artifacts we need to update .nemo path - self._backup_existing_nemo_ckpt(trainer) app_state.model_restore_path = self._format_nemo_checkpoint_name() if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: maybe_injected_best_model_path = inject_model_parallel_rank(self.best_model_path) @@ -222,14 +223,19 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): pl_module.load_state_dict(checkpoint, strict=True) if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New best .nemo model saved to: {app_state.model_restore_path}") pl_module.load_state_dict(old_state_dict, strict=True) else: if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) 
pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New .nemo model saved to: {app_state.model_restore_path}") + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) return output def on_train_end(self, trainer, pl_module): @@ -268,16 +274,25 @@ def on_train_end(self, trainer, pl_module): trainer._checkpoint_connector.restore(self.best_model_path) if self.save_nemo_on_train_end: - self._backup_existing_nemo_ckpt(trainer) + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=self._format_nemo_checkpoint_name()) + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) - def _backup_existing_nemo_ckpt(self, trainer) -> str: + def _backup_existing_nemo_ckpt(self, trainer) -> Optional[str]: """Search for an available name with version infix and rename existing checkpoint. NOTE: this behavior is slightly different from regular checkpoints. PTL creates new regular checkpoint with the first available name. Here, for backward compatibility, we create .nemo checkpoint as before and create a backup under the first available name. + + Args: + trainer (Trainer): trainer instance. + + Returns: + Path to the backup checkpoint or None, if no backup was created """ base_path = self._format_nemo_checkpoint_name() available_path = base_path @@ -286,11 +301,13 @@ def _backup_existing_nemo_ckpt(self, trainer) -> str: while self.file_exists(available_path, trainer, check_dist_ckpt=False): available_path = self._format_nemo_checkpoint_name(version_cnt) version_cnt += 1 - if available_path != base_path: - if trainer.is_global_zero: - logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') - shutil.move(base_path, available_path) - trainer.strategy.barrier() + if available_path == base_path: + # no existing ckpt, no need to backup + return None + if trainer.is_global_zero: + logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') + shutil.move(base_path, available_path) + trainer.strategy.barrier() return available_path def _format_nemo_checkpoint_name(self, ver: Optional[int] = None) -> str: diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 8c6b33022dac..2d9bd03f0203 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -151,7 +151,7 @@ def test_omegaconf(self): @pytest.mark.unit def test_trainer_loggers(self, tmp_path): - """ Test that a trainer with logger errors out with a number of arguments. Test that it works with + """Test that a trainer with logger errors out with a number of arguments. Test that it works with create_tensorboard_logger set to False """ test_trainer = pl.Trainer(accelerator='cpu') # Should create logger and modelcheckpoint @@ -235,7 +235,7 @@ def test_trainer_neptune_logger(self, tmp_path): @pytest.mark.unit def test_checkpoint_configurations(self): - """ Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but + """Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but is error free if only one is asked to do so. 
""" disable_tb_logger = {"create_tensorboard_logger": False} @@ -297,7 +297,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): @pytest.mark.unit def test_resume(self, tmp_path): - """ Tests the resume capabilities of exp_manager""" + """Tests the resume capabilities of exp_manager""" test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) # Error because explicit_log_dir does not exist @@ -428,7 +428,8 @@ def test_nemo_checkpoint_save_best_model_1(self, tmp_path): def test_nemo_checkpoint_save_best_model_2(self, tmp_path): test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) exp_manager( - test_trainer, {"explicit_log_dir": str(tmp_path / "test")}, + test_trainer, + {"explicit_log_dir": str(tmp_path / "test")}, ) model = ExampleModel() test_trainer.fit(model) @@ -456,6 +457,27 @@ def test_nemo_checkpoint_always_save_nemo(self, tmp_path): model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit + def test_nemo_checkpoint_doesnt_produce_too_many_nemo_ckpts(self, tmp_path): + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": {"save_best_model": True, "always_save_nemo": True, "save_top_k": 2}, + "explicit_log_dir": str(tmp_path / "test"), + }, + ) + model = ExampleModel() + test_trainer.fit(model) + + assert Path(str(tmp_path / "test" / "checkpoints" / "default.nemo")).exists() + assert ( + len(list((tmp_path / "test" / "checkpoints").glob("default*.nemo"))) == 1 + ) # check number of `.nemo` checkpoints + + model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) + assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit def test_nemo_checkpoint_make_checkpoint_dir(self, tmp_path): test_trainer = pl.Trainer( @@ -511,8 +533,8 @@ def test_nemo_checkpoint_restore_model(self, tmp_path): @pytest.mark.run_only_on('GPU') @pytest.mark.parametrize('test_dist_ckpt', [False, True]) - def test_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): - """ Simulates already existing checkpoints in the ckpt directory and tests ckpt versioning """ + def test_base_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): + """Simulates already existing checkpoints in the ckpt directory and tests non-nemo ckpt versioning""" strategy = NLPDDPStrategy() if test_dist_ckpt else 'auto' test_trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4, strategy=strategy @@ -563,7 +585,8 @@ def _get_versioned_name(ckpt_name: Path, nemo: bool = False): assert _get_versioned_name(ckpt_1).exists(), all_checkpoints assert not _get_versioned_name(ckpt_2).exists(), all_checkpoints # ckpt2 didn't exist before - assert _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints + # .nemo checkpoints are not versioned: + assert not _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints @pytest.mark.unit def test_last_checkpoint_saved(self, tmp_path): @@ -592,6 +615,7 @@ def train_dataloader(self): model_path = checkpoint_dir / "val_loss=0.0300-epoch=1-step=64-last.ckpt" last_saved_checkpoint = torch.load(model_path) assert max_steps == last_saved_checkpoint['global_step'] + # restart training, ensure global step starts correctly class 
AssertCallback(Callback): def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: @@ -681,8 +705,7 @@ def test_warning_validation_skipping_when_custom_epoch_loop(self, tmp_path): """ tmp_path = tmp_path / "test_3" - class CustomLoop(_TrainingEpochLoop): - ... + class CustomLoop(_TrainingEpochLoop): ... trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1, val_check_interval=0.33 @@ -759,7 +782,8 @@ def test_skipped_unfinished_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -803,7 +827,8 @@ def test_skipped_unfinished_dist_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -850,13 +875,17 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): # unfinished checkpoint with EMA part, both parts should be removed self._write_fake_checkpoint( - checkpoints_dir / "incomplete01-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete01-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) self._write_fake_checkpoint(checkpoints_dir / "incomplete01.ckpt", isdir=False, add_unfinished_marker=True) # just EMA part - should be removed. 
NOTE marker path is the same for base part and for EMA part self._write_fake_checkpoint( - checkpoints_dir / "incomplete02-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete02-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) (checkpoints_dir / f"incomplete02{NeMoModelCheckpoint.UNFINISHED_CHECKPOINT_SUFFIX}").touch() @@ -864,7 +893,10 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() @@ -909,7 +941,10 @@ def test_incomplete_dist_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() From a589828b7268dfb7aff505ba2a49ab151c5d5ee4 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 23 May 2024 12:13:31 -0400 Subject: [PATCH 15/47] Refactor Sequence Packing Script (#9271) * refactor pack seq script Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * update doc Signed-off-by: Chen Cui * minor Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .../features/throughput_optimizations.rst | 9 +- nemo/utils/sequence_packing_utils.py | 232 ++++++++++++++++++ .../prepare_packed_ft_dataset.py | 206 ++++++---------- 3 files changed, 306 insertions(+), 141 deletions(-) create mode 100644 nemo/utils/sequence_packing_utils.py diff --git a/docs/source/features/throughput_optimizations.rst b/docs/source/features/throughput_optimizations.rst index 3f3ded01b1a2..dfd8b6cf9310 100644 --- a/docs/source/features/throughput_optimizations.rst +++ b/docs/source/features/throughput_optimizations.rst @@ -71,8 +71,8 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model \ + +output_dir=/path/to/output_folder \ +pack_sizes=[2048,4096,8192] \ [ +packing_algorithm=first_fit_shuffle \ ] [ +seed=0 ] @@ -86,10 +86,7 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, and can be determined by examining the distribution of sequence lengths in the dataset. - Note 3. Currently, we require a full nemo model file for simplicity and readability of code, but in theory only a - tokenizer file is needed. This part can be improved in a future iteration of the script. - - Note 4. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for + Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. 
This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py new file mode 100644 index 000000000000..2a5a14f83823 --- /dev/null +++ b/nemo/utils/sequence_packing_utils.py @@ -0,0 +1,232 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from typing import Dict, List + +import numpy as np +from tqdm import tqdm + +from nemo.utils import logging + +PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle'] + + +def find_first_bin_that_fits(bins: List[List[int]], s: int, bin_size: int) -> int: + """ + Finds the first bin in a list of bins that has enough space to fit a sequence of size 's'. + + Args: + bins: A list of lists, where each inner list represents a bin and contains the current elements in that bin. + s: The size of the sequence to be placed in a bin. + bin_size: The maximum capacity of each bin. + + Returns: + The index of the first bin that can fit the sequence 's', or -1 if no such bin exists. + """ + for i, abin in enumerate(bins): + if sum(abin) + s <= bin_size: + return i + return -1 + + +def first_fit(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit algorithm. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. + pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, where each inner list represents a bin and contains the indices of the sequences assigned to that bin. + """ + res = [] + for s in seqlens: + first_bin = find_first_bin_that_fits(res, s, pack_size) + if first_bin == -1: # open a new bin + res.append([s]) + else: + res[first_bin].append(s) + return res + + +def first_fit_decreasing(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit Decreasing algorithm. + + This is a variation of the First-Fit algorithm where the sequences are sorted by decreasing length before packing. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. + pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, similar to the output of the 'first_fit' function. + """ + sorted_seqlens = sorted(seqlens, reverse=True) + return first_fit(sorted_seqlens, pack_size) + + +def first_fit_shuffle(seqlens: List[int], pack_size: int) -> List[List[int]]: + """ + Packs sequences of varying lengths into bins using the First-Fit with Shuffling algorithm. + + This variation shuffles the order of the sequences before applying the First-Fit algorithm. + + Args: + seqlens: A list of integers, representing the lengths of the sequences to be packed. 
+ pack_size: The maximum capacity of each bin. + + Returns: + A list of lists, similar to the output of the 'first_fit' function. + """ + shuffled_seqlens = seqlens[:] + np.random.shuffle(shuffled_seqlens) + return first_fit(shuffled_seqlens, pack_size) + + +def create_hist(dataset: np.array, truncate_seq_len: int): + """ + Creates a histogram of sequence lengths from a tokenized dataset. + + This function analyzes the tokenized dataset and creates a histogram showing the distribution of sequence lengths. + + Args: + dataset: A NumPy array containing the tokenized sequences. Each element is a dictionary that contains at minimum + the key `input_ids`. + truncate_seq_len: The maximum sequence length to consider in the histogram. + + Returns: + sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences from the dataset. + histogram: A list representing the histogram data (number of sequences for each length). + """ + logging.info("Creating histogram from tokenized dataset...") + + sequences = collections.defaultdict(list) + counts = [0] * truncate_seq_len + + for item_dict in dataset: + seq_len = len(item_dict['input_ids']) - 1 + sequences[seq_len].append(item_dict) + counts[seq_len] += 1 + + logging.debug("Histogram of sequence lengths") + logging.debug(counts) + + histogram = [] + for seq_len in range(truncate_seq_len): + histogram.append(len(sequences[seq_len])) + + return sequences, histogram + + +def create_packing_strategy( + histogram: List[int], pack_size: int, packing_algorithm: str = 'first_fit' +) -> List[List[int]]: + """ + Packs sequences into bins using the specified packing algorithm. + + This function takes the histogram of sequence lengths, desired pack size, and a string representing the packing + algorithm to use. It then calls the corresponding function (e.g., 'first_fit_decreasing') and performs the + packing process using only sequence lengths as input (without the actual sequences). + + Args: + histogram: A list representing the histogram data (number of sequences for each length). + pack_size: The maximum capacity of each bin. + packing_algorithm: One of the supported packing algorithms from ['first_fit_decreasing', 'first_fit_shuffle'] + + Returns: + assignments: A list of lists, where each inner list represents a bin and contains the indices of the + sequence lengths assigned to that bin. + """ + + logging.info(f"Packing sequences to length {pack_size}...") + + all_seq_lens = [] + for i, count in enumerate(histogram): + all_seq_lens.extend([i] * count) + + packing_fn = globals()[packing_algorithm] + assignments = packing_fn(all_seq_lens, pack_size) + packed_seq_lens = [sum(x) for x in assignments] + packing_factor = len(all_seq_lens) / len(packed_seq_lens) + + logging.debug("Packed sequence lengths:") + logging.debug(packed_seq_lens) + logging.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient") + logging.info( + f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<" + ) + return assignments + + +def fill_packing_strategy( + assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int +) -> List[Dict]: + """ + Fills the packing strategy with actual sequence data based on assignments and sequence information. + + This function takes the assignments generated by the packing algorithm (containing sequence length indices), + the original sequences data, and the pack size. 
It iterates through the assignments, retrieves the corresponding + sequences from the sequences dictionary, and constructs the final output data structure with input IDs, loss masks + (if available), and starting indices for each sequence in a packed sequence. + + Args: + assignments: A list of lists, where each inner list represents a bin and contains the indices of the + sequence lengths assigned to that bin (output of 'create_packing_strategy'). + sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences + from the dataset (output of 'create_hist'). + pack_size: The maximum capacity of each bin. + + Returns: + output_data: A list of dictionaries, where each dictionary represents a packed sequence with its input IDs, + loss mask (if available), and starting indices. + """ + ifile_handles = dict() + for seq_len in tqdm(range(pack_size + 1)): + per_seq_data = sequences[seq_len] + if len(per_seq_data) > 0: + perm = np.random.permutation(len(per_seq_data)) + input_ids = np.array([x['input_ids'] for x in per_seq_data])[perm].tolist() + try: + loss_mask = np.array( + [[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data] + )[perm].tolist() + except KeyError: + loss_mask = None + ifile_handles[seq_len] = (input_ids, loss_mask) + + input_ids, loss_mask, seq_start_id = {}, {}, {} + + for oindex, assignment in tqdm(enumerate(assignments), total=len(assignments)): + _input_ids, _loss_mask, _seq_start_id = [], [], [0] + + for seq_length in assignment: + _input_ids.extend(ifile_handles[seq_length][0].pop()) + _loss_mask.extend(ifile_handles[seq_length][1].pop()) + _seq_start_id.append(len(_input_ids)) + + input_ids[oindex] = _input_ids + loss_mask[oindex] = _loss_mask + seq_start_id[oindex] = _seq_start_id[:-1] + + output_data = [] + for i in range(len(input_ids)): + item_dict = {'input_ids': input_ids[i], 'loss_mask': loss_mask[i], 'seq_start_id': seq_start_id[i]} + output_data.append(item_dict) + + assert all(not seq[0] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" + assert all(not seq[1] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" + return output_data diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py index f01aa54fc265..b3251e75c84e 100644 --- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py +++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py @@ -12,19 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import collections import os from dataclasses import dataclass -from typing import Tuple +from typing import TYPE_CHECKING, Tuple import numpy as np -from tqdm import tqdm -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.core.config import hydra_runner from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager +from nemo.utils.sequence_packing_utils import create_hist, create_packing_strategy, fill_packing_strategy + +if TYPE_CHECKING: + from omegaconf import DictConfig """ Script to prepare packed dataset from a SFT/PEFT dataset in the jsonl format. @@ -45,146 +46,71 @@ python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model + +output_dir=/path/to/output_folder +pack_sizes=[2048,4096,8192] Note: -- pack_sizes can take in a list -- model.data.train_ds.max_seq_length is the length to truncate long sequences before packing, and is different from the packing sizes -- currenlty, we require a full nemo model file for simplicity and readability of code, but in theory only a tokenizer file is needed. - This part can be improved in a future iteration of the script. + - If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will + need to pass in the same configs to ``model.data.train_ds`` as you would for training with unpacked dataset. + + - ``model.data.train_ds.max_seq_length`` is the length to truncate each sequence before packing multiple sequences + to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, + and can be determined by examining the distribution of sequence lengths in the dataset. + + - ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for + each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. + This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length + can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in + the unpacked case. """ -PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle'] - - -def find_first_bin_that_fits(bins, s, bin_size): - for i, abin in enumerate(bins): - if sum(abin) + s <= bin_size: - return i - return -1 - - -def first_fit(seqlens, pack_size): - res = [] - for s in seqlens: - first_bin = find_first_bin_that_fits(res, s, pack_size) - if first_bin == -1: # open a new bin - res.append([s]) - else: - res[first_bin].append(s) - return res - - -def first_fit_decreasing(seqlens, pack_size): - sorted_seqlens = sorted(seqlens, reverse=True) - return first_fit(sorted_seqlens, pack_size) - -def first_fit_shuffle(seqlens, pack_size): - shuffled_seqlens = seqlens[:] - np.random.shuffle(shuffled_seqlens) - return first_fit(shuffled_seqlens, pack_size) +def tokenize_dataset(cfg: 'DictConfig'): + """ + Tokenizes a dataset using the same configuration file as finetuninng with GPTSFTDataset. 
+ This function reads a dataset and tokenizes it using SentencePiece tokenizer based on the provided configuration. -def create_assignment(output_path, assignments, ifile_handles): - n_samples_in_this_shard = len(assignments) - input_ids, loss_mask, seq_start_id = {}, {}, {} + Args: + cfg: A Hydra configuration object containing parameters for tokenization. - for oindex, assignment in tqdm(enumerate(assignments), total=n_samples_in_this_shard): - _input_ids, _loss_mask, _seq_start_id = [], [], [0] + Returns: + A NumPy array containing the tokenized sequences from the dataset. + """ - for seq_length in assignment: - _input_ids.extend(ifile_handles[seq_length][0].pop()) - _loss_mask.extend(ifile_handles[seq_length][1].pop()) - _seq_start_id.append(len(_input_ids)) - - input_ids[oindex] = _input_ids - loss_mask[oindex] = _loss_mask - seq_start_id[oindex] = _seq_start_id[:-1] - - output_data = [] - for i in range(len(input_ids)): - item_dict = {'input_ids': input_ids[i], 'loss_mask': loss_mask[i], 'seq_start_id': seq_start_id[i]} - output_data.append(item_dict) - - assert all(not seq[0] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" - assert all(not seq[1] for seq in ifile_handles.values()), "Error: There are items left over from the assignment" - np.save(output_path, output_data) - logging.info(f"Done, output written to {output_path}") - - -def tokenize_dataset(cfg): logging.info("Tokenizing dataset...") # using the same template as SFT/PEFT script. This may be overkill but guarantees the preprocess settings # are identical to normal SFT training - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - # we set is_train=False to turn off samples mapping and get the actual length of train dataset - train_ds = model._build_dataset(cfg.model.data.train_ds, is_train=False)[0] - return np.array([train_ds[i] for i in range(len(train_ds))]) - - -def create_hist(dataset, truncate_seq_len): - logging.info("Creating histogram from tokenized dataset...") - - sequences = collections.defaultdict(list) - counts = [0] * truncate_seq_len - - for item_dict in dataset: - seq_len = len(item_dict['input_ids']) - 1 - sequences[seq_len].append(item_dict) - counts[seq_len] += 1 - - logging.info("Histogram of sequence lengths") - logging.info(counts) - - histogram = [] - for seq_len in range(truncate_seq_len): - histogram.append(len(sequences[seq_len])) - - return sequences, histogram - - -def run_packing(sequences, histogram, output_dir, pack_size, packing_algorithm, seed=0): - logging.info(f"Packing sequences to length {pack_size}...") - - all_seq_lens = [] - for i, count in enumerate(histogram): - all_seq_lens.extend([i] * count) - - packing_fn = globals()[packing_algorithm] - assignments = packing_fn(all_seq_lens, pack_size) - packed_seq_lens = [sum(x) for x in assignments] - packing_factor = len(all_seq_lens) / len(packed_seq_lens) - - logging.info("Packed sequence lengths:") - logging.info(packed_seq_lens) - logging.info( - f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor} <<<<<" + data_cfg = cfg.model.data.train_ds + dataset = GPTSFTDataset( + file_path=data_cfg.file_names[0], + tokenizer=get_nmt_tokenizer(library="sentencepiece", tokenizer_model=cfg.tokenizer_path), + 
max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + pad_seq_length_to_mult=16, # adds padding in collate_fn so this value is irrelevant here + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + add_sep=data_cfg.get('add_sep', False), + sep_id=cfg.get('sep_id', 49704), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + label_key=data_cfg.get('label_key', 'answer'), + answer_only_loss=cfg.get('answer_only_loss', True), + truncation_field=data_cfg.get('truncation_field', 'text'), + pad_to_max_length=data_cfg.get('pad_to_max_length', False), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + prompt_template=data_cfg.get('prompt_template', None), + virtual_tokens=0, + tokens_to_generate=data_cfg.get('tokens_to_generate', 0), + memmap_workers=data_cfg.get('memmap_workers', None), + hf_dataset=data_cfg.get('hf_dataset', False), + truncation_method=data_cfg.get('truncation_method', 'right'), + special_tokens=data_cfg.get('chat_prompt_tokens', None), + is_test=True, ) - ifile_handles = {} - for seq_len in tqdm(range(pack_size + 1)): - per_seq_data = sequences[seq_len] - if len(per_seq_data) > 0: - input_ids = np.array([x['input_ids'] for x in per_seq_data]) - loss_mask = np.array( - [[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data] - ) - perm = np.random.permutation(len(input_ids)) - ifile_handles[seq_len] = (input_ids[perm].tolist(), loss_mask[perm].tolist()) - else: - ifile_handles[seq_len] = [], [] - - os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, f'packed_{pack_size}_seed{seed}.npy') - create_assignment(output_path, assignments, ifile_handles) + return np.array([dataset[i] for i in range(len(dataset))]) @dataclass @@ -194,7 +120,7 @@ class PackingArgs: packing_algorithm: str = "first_fit_shuffle" seed: int = 0 - def from_config(self, cfg): + def from_config(self, cfg: 'DictConfig'): for required_arg in ('output_dir', 'pack_sizes'): assert cfg.get(required_arg, None), f"Please specify +{required_arg}=..." self.output_dir = cfg.output_dir @@ -207,12 +133,20 @@ def from_config(self, cfg): @hydra_runner( config_path="../../examples/nlp/language_modeling/tuning/conf", config_name="megatron_gpt_finetuning_config" ) -def main(cfg) -> None: +def main(cfg: 'DictConfig') -> None: args = PackingArgs().from_config(cfg) dataset = tokenize_dataset(cfg) sequences, histogram = create_hist(dataset, cfg.model.data.train_ds.max_seq_length) for pack_size in args.pack_sizes: - run_packing(sequences, histogram, args.output_dir, pack_size, args.packing_algorithm, args.seed) + assignments = create_packing_strategy(histogram, pack_size, args.packing_algorithm) + output_data = fill_packing_strategy(assignments, sequences, pack_size) + + # save output data + os.makedirs(args.output_dir, exist_ok=True) + output_path = os.path.join(args.output_dir, f'packed_{pack_size}_seed{args.seed}.npy') + np.save(output_path, output_data) + logging.info(f"Done, output written to {output_path}") + logging.info( f""" ✅ Packed datasets with pack sizes {args.pack_sizes} are prepared successfully. @@ -221,7 +155,9 @@ def main(cfg) -> None: > +model.data.train_ds.packed_sequence=True 2. Use the new dataset file instead of the original jsonl file > model.data.train_ds.file_names=/path/to/packed_dataset.npy -3. Adjust the batch sizes. +3. Specify the packed sequence length. This should be one of the ``pack_sizes`` you specified during data preparation. 
+ > model.data.train_ds.max_seq_length= +4. Adjust the batch sizes. Micro batch size has to be set to 1 as a nominal constraint. This is because batches are now concatenated in the preprocessing step. You can increase the pack_size to achieve the same purpose of increasing micro batch size. Global batch size has to be reduced by the average number of sequences per pack `n`, From dddc125227413ce9f84f83515d5b99c82b2279fa Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Fri, 24 May 2024 02:16:23 +0200 Subject: [PATCH 16/47] [Nemo-UX] Move code to collections + fix some small bugs (#9277) * Move io & llm * Run linting * Fix 2 bugs in megatron-strategy * Use teardown inside mistral hf-importer * Fix bug inside HF import * Apply isort and black reformatting Signed-off-by: marcromeyn * Port LLM api * Apply isort and black reformatting Signed-off-by: marcromeyn * fix imports Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: marcromeyn Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: marcromeyn Co-authored-by: Chen Cui Co-authored-by: cuichenx --- nemo/collections/llm/__init__.py | 43 +++++ nemo/collections/llm/api.py | 161 ++++++++++++++++++ nemo/{ => collections}/llm/gpt/__init__.py | 0 nemo/collections/llm/gpt/data/__init__.py | 7 + nemo/{ => collections}/llm/gpt/data/core.py | 4 +- nemo/{ => collections}/llm/gpt/data/dolly.py | 4 +- .../llm/gpt/data/fine_tuning.py | 12 +- nemo/{ => collections}/llm/gpt/data/mock.py | 9 +- .../llm/gpt/data/pre_training.py | 6 +- nemo/{ => collections}/llm/gpt/data/squad.py | 4 +- .../llm/gpt/model}/__init__.py | 15 +- nemo/{ => collections}/llm/gpt/model/base.py | 3 +- .../llm/gpt/model/mistral_7b.py | 12 +- nemo/collections/llm/utils.py | 16 ++ nemo/io/__init__.py | 25 --- nemo/lightning/__init__.py | 6 + nemo/lightning/base.py | 30 +--- nemo/lightning/data.py | 9 +- nemo/lightning/io/__init__.py | 25 +++ nemo/{ => lightning}/io/api.py | 20 +-- nemo/{ => lightning}/io/capture.py | 6 +- nemo/{ => lightning}/io/connector.py | 30 +++- nemo/{ => lightning}/io/mixin.py | 16 +- nemo/{ => lightning}/io/pl.py | 10 +- nemo/{ => lightning}/io/state.py | 20 ++- nemo/lightning/megatron_parallel.py | 55 +++--- .../pytorch/plugins/mixed_precision.py | 15 +- nemo/lightning/pytorch/strategies.py | 21 ++- nemo/lightning/pytorch/trainer.py | 2 +- nemo/llm/gpt/data/__init__.py | 7 - nemo/llm/gpt/model/__init__.py | 12 -- tests/{ => lightning}/io/__init__.py | 0 tests/{ => lightning}/io/test_api.py | 11 +- tests/{ => lightning}/io/test_mixin.py | 2 +- tests/{ => lightning}/io/test_state.py | 3 +- tests/lightning/test_data.py | 28 ++- tests/lightning/test_megatron_parallel.py | 2 +- 37 files changed, 454 insertions(+), 197 deletions(-) create mode 100644 nemo/collections/llm/__init__.py create mode 100644 nemo/collections/llm/api.py rename nemo/{ => collections}/llm/gpt/__init__.py (100%) create mode 100644 nemo/collections/llm/gpt/data/__init__.py rename nemo/{ => collections}/llm/gpt/data/core.py (98%) rename nemo/{ => collections}/llm/gpt/data/dolly.py (97%) rename nemo/{ => collections}/llm/gpt/data/fine_tuning.py (93%) rename nemo/{ => collections}/llm/gpt/data/mock.py (97%) rename nemo/{ => collections}/llm/gpt/data/pre_training.py (97%) rename nemo/{ => collections}/llm/gpt/data/squad.py (97%) rename nemo/{llm => collections/llm/gpt/model}/__init__.py (65%) rename nemo/{ => collections}/llm/gpt/model/base.py (99%) rename nemo/{ => collections}/llm/gpt/model/mistral_7b.py (96%) create mode 100644 
nemo/collections/llm/utils.py delete mode 100644 nemo/io/__init__.py create mode 100644 nemo/lightning/io/__init__.py rename nemo/{ => lightning}/io/api.py (96%) rename nemo/{ => lightning}/io/capture.py (96%) rename nemo/{ => lightning}/io/connector.py (92%) rename nemo/{ => lightning}/io/mixin.py (98%) rename nemo/{ => lightning}/io/pl.py (98%) rename nemo/{ => lightning}/io/state.py (97%) delete mode 100644 nemo/llm/gpt/data/__init__.py delete mode 100644 nemo/llm/gpt/model/__init__.py rename tests/{ => lightning}/io/__init__.py (100%) rename tests/{ => lightning}/io/test_api.py (65%) rename tests/{ => lightning}/io/test_mixin.py (91%) rename tests/{ => lightning}/io/test_state.py (99%) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py new file mode 100644 index 000000000000..0f60fd7438b9 --- /dev/null +++ b/nemo/collections/llm/__init__.py @@ -0,0 +1,43 @@ +# This is here to import it once, which improves the speed of launch when in debug-mode +try: + import transformer_engine # noqa +except ImportError: + pass + +from nemo.collections.llm.api import export_ckpt, import_ckpt, pretrain, train, validate +from nemo.collections.llm.gpt.data import ( + DollyDataModule, + FineTuningDataModule, + MockDataModule, + PreTrainingDataModule, + SquadDataModule, +) +from nemo.collections.llm.gpt.model import ( + GPTConfig, + GPTModel, + MaskedTokenLossReduction, + Mistral7BConfig, + Mistral7BModel, + gpt_data_step, + gpt_forward_step, +) + +__all__ = [ + "MockDataModule", + "GPTModel", + "GPTConfig", + "gpt_data_step", + "gpt_forward_step", + "MaskedTokenLossReduction", + "Mistral7BConfig", + "Mistral7BModel", + "PreTrainingDataModule", + "FineTuningDataModule", + "SquadDataModule", + "DollyDataModule", + "train", + "import_ckpt", + "export_ckpt", + "pretrain", + "validate", +] diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py new file mode 100644 index 000000000000..824d84ffb461 --- /dev/null +++ b/nemo/collections/llm/api.py @@ -0,0 +1,161 @@ +from pathlib import Path +from typing import Callable, Optional + +import pytorch_lightning as pl + +from nemo.collections.llm.utils import task +from nemo.lightning import MegatronStrategy, Trainer, io, teardown + + +@task(namespace="llm") +def train( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + tokenizer: Optional[str] = None, + source: Optional[str] = None, + export: Optional[str] = None, +) -> Path: + """ + Trains a model using the specified data and trainer, with optional tokenizer, source, and export. + + Args: + model (pl.LightningModule): The model to be trained. + data (pl.LightningDataModule): The data module containing training data. + trainer (Trainer): The trainer instance configured with a MegatronStrategy. + tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'. + source (Optional[str]): Path to a checkpoint from which to continue training. + export (Optional[str]): Filename to save the exported checkpoint after training. + + Returns + ------- + Path: The directory path where training artifacts are saved. + + Raises + ------ + ValueError: If the trainer's strategy is not MegatronStrategy. 
+ + Examples + -------- + >>> model = MyModel() + >>> data = MyDataModule() + >>> trainer = Trainer(strategy=MegatronStrategy()) + >>> train(model, data, trainer, tokenizer='data', source='path/to/ckpt.ckpt', export='final.ckpt') + PosixPath('/path/to/log_dir') + """ + if not isinstance(trainer.strategy, MegatronStrategy): + raise ValueError("Only MegatronStrategy is supported") + + fit_kwargs = {} + run_dir = Path(trainer.logger.log_dir) + export_dir = run_dir / "export" + + if hasattr(train, "__io__"): + _save_config_img(run_dir, train.__io__) + + if tokenizer: # TODO: Improve this + _use_tokenizer(model, data, tokenizer) + if source: + _add_ckpt_path(source, model, fit_kwargs) + + trainer.fit(model, data, **fit_kwargs) + + print(f"Saving checkpoint to: {export_dir}") + trainer.save_checkpoint(export_dir) + + if export and trainer.strategy.is_global_zero: + teardown(trainer, model=model) + print(f"Exporting checkpoint to: {export_dir / export}") + export_ckpt(export_dir, export) + + return run_dir + + +@task(namespace="llm") +def pretrain( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + source: Optional[str] = None, + # export: Optional[str] = None +) -> Path: + return train(model=model, data=data, trainer=trainer, tokenizer="data", source=source) + + +@task(namespace="llm") +def validate( + model: pl.LightningModule, + data: pl.LightningDataModule, + trainer: Trainer, + tokenizer: Optional[str] = None, + source: Optional[str] = None, + export: Optional[str] = None, +) -> Path: + if not isinstance(trainer.strategy, MegatronStrategy): + raise ValueError("Only MegatronStrategy is supported") + + validate_kwargs = {} + run_dir = Path(trainer.logger.log_dir) + export_dir = run_dir / "export" + + if tokenizer: # TODO: Improve this + _use_tokenizer(model, data, tokenizer) + if source: + _add_ckpt_path(source, model, validate_kwargs) + + trainer.validate(model, data, **validate_kwargs) + trainer.save_checkpoint(export_dir) + if export: + teardown(trainer) + del trainer, model, data + export_ckpt(export_dir, export) + + return run_dir + + +@task(name="import", namespace="llm") +def import_ckpt( + model: pl.LightningModule, + source: str, + output_path: Optional[Path] = None, + overwrite: bool = False, +) -> Path: + return io.import_ckpt(model=model, source=source, output_path=output_path, overwrite=overwrite) + + +def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnector: + return io.load_ckpt(path).model.exporter(target, path) + + +@task(name="export", namespace="llm") +def export_ckpt( + path: Path, + target: str, + output_path: Optional[Path] = None, + overwrite: bool = False, + load_connector: Callable[[Path, str], io.ModelConnector] = load_connector_from_trainer_ckpt, +) -> Path: + return io.export_ckpt(path, target, output_path, overwrite, load_connector) + + +def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: str) -> None: + if tokenizer == "data": + model.tokenizer = data.tokenizer + elif tokenizer == "model": + data.tokenizer = model.tokenizer + + +def _add_ckpt_path(source, model, kwargs) -> None: + if io.is_distributed_ckpt(source): + kwargs["ckpt_path"] = source + else: + kwargs["ckpt_path"] = model.import_ckpt(source) + + +def _save_config_img(*args, **kwargs): + try: + from nemo_sdk.utils import save_config_img + + save_config_img(*args, **kwargs) + except ImportError: + pass diff --git a/nemo/llm/gpt/__init__.py b/nemo/collections/llm/gpt/__init__.py similarity index 100% rename from 
nemo/llm/gpt/__init__.py rename to nemo/collections/llm/gpt/__init__.py diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py new file mode 100644 index 000000000000..f83da73c987b --- /dev/null +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -0,0 +1,7 @@ +from nemo.collections.llm.gpt.data.dolly import DollyDataModule +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule + +__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] diff --git a/nemo/llm/gpt/data/core.py b/nemo/collections/llm/gpt/data/core.py similarity index 98% rename from nemo/llm/gpt/data/core.py rename to nemo/collections/llm/gpt/data/core.py index c8ce328c1e0b..8d99583016a4 100644 --- a/nemo/llm/gpt/data/core.py +++ b/nemo/collections/llm/gpt/data/core.py @@ -32,7 +32,7 @@ def create_sft_dataset( truncation_method: str = 'right', memmap_workers: int = 2, hf_dataset: bool = False, - **kwargs + **kwargs, ) -> "GPTSFTDataset": from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset @@ -53,5 +53,5 @@ def create_sft_dataset( index_mapping_dir=index_mapping_dir, prompt_template=prompt_template, truncation_method=truncation_method, - **kwargs + **kwargs, ) diff --git a/nemo/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py similarity index 97% rename from nemo/llm/gpt/data/dolly.py rename to nemo/collections/llm/gpt/data/dolly.py index 2e3dcaffbf0a..9632a142eb35 100644 --- a/nemo/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -5,8 +5,8 @@ import numpy as np from datasets import load_dataset -from nemo.llm.gpt.data.core import get_dataset_root -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.core import get_dataset_root +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule from nemo.utils import logging if TYPE_CHECKING: diff --git a/nemo/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py similarity index 93% rename from nemo/llm/gpt/data/fine_tuning.py rename to nemo/collections/llm/gpt/data/fine_tuning.py index 1e4ab0432847..1be5c41e4919 100644 --- a/nemo/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -5,8 +5,8 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler -from nemo.llm.gpt.data.core import create_sft_dataset if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec @@ -74,7 +74,13 @@ def val_dataloader(self) -> DataLoader: return self._create_dataloader(self._create_dataset(str(self.validation_path))) def test_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.test_path), tokens_to_generate=32, is_test=True,)) + return self._create_dataloader( + self._create_dataset( + str(self.test_path), + tokens_to_generate=32, + is_test=True, + ) + ) @lru_cache def _create_dataset(self, path, **kwargs): @@ -89,7 +95,7 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: pin_memory=self.pin_memory, persistent_workers=self.persistent_workers, collate_fn=dataset.collate_fn, - 
**kwargs + **kwargs, ) @property diff --git a/nemo/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py similarity index 97% rename from nemo/llm/gpt/data/mock.py rename to nemo/collections/llm/gpt/data/mock.py index ff035a78453d..ccc1acfd6a2a 100644 --- a/nemo/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -74,7 +74,12 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: class _MockGPTDataset(Dataset): def __init__( - self, tokenizer: "TokenizerSpec", name: str, num_samples: int, seq_length: int, seed: int = 42, + self, + tokenizer: "TokenizerSpec", + name: str, + num_samples: int, + seq_length: int, + seed: int = 42, ) -> None: super().__init__() self.name = name @@ -118,7 +123,7 @@ def _collate_fn(self, batch): def collate_fn(self, batch): """Method that user pass as functor to DataLoader. - + The method optionally performs neural type checking and add types to the outputs. Please note, subclasses of Dataset should not implement `input_types`. diff --git a/nemo/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py similarity index 97% rename from nemo/llm/gpt/data/pre_training.py rename to nemo/collections/llm/gpt/data/pre_training.py index d5d05955078b..80e099290b1d 100644 --- a/nemo/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from megatron.core.datasets.gpt_dataset import GPTDatasetConfig + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -82,7 +83,10 @@ def setup(self, stage: str = "") -> None: train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples] self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTDataset, train_valid_test_num_samples, is_built_on_rank=lambda: True, config=self.gpt_dataset_config, + GPTDataset, + train_valid_test_num_samples, + is_built_on_rank=lambda: True, + config=self.gpt_dataset_config, ).build() # uncomment once fabric API is merged diff --git a/nemo/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py similarity index 97% rename from nemo/llm/gpt/data/squad.py rename to nemo/collections/llm/gpt/data/squad.py index c5235905b4ed..77d48da98a0e 100644 --- a/nemo/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -4,8 +4,8 @@ from datasets import DatasetDict, load_dataset -from nemo.llm.gpt.data.core import get_dataset_root -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.core import get_dataset_root +from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule from nemo.utils import logging if TYPE_CHECKING: diff --git a/nemo/llm/__init__.py b/nemo/collections/llm/gpt/model/__init__.py similarity index 65% rename from nemo/llm/__init__.py rename to nemo/collections/llm/gpt/model/__init__.py index a05c96f60944..fcb78d6cd397 100644 --- a/nemo/llm/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -1,21 +1,18 @@ -from nemo.llm.gpt.data import MockDataModule -from nemo.llm.gpt.model import ( +from nemo.collections.llm.gpt.model.base import ( GPTConfig, GPTModel, MaskedTokenLossReduction, - Mistral7BConfig, - Mistral7BModel, gpt_data_step, gpt_forward_step, ) +from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel __all__ = [ - "MockDataModule", - "GPTModel", "GPTConfig", - "gpt_data_step", - "gpt_forward_step", - "MaskedTokenLossReduction", + "GPTModel", "Mistral7BConfig", "Mistral7BModel", + 
"MaskedTokenLossReduction", + "gpt_data_step", + "gpt_forward_step", ] diff --git a/nemo/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py similarity index 99% rename from nemo/llm/gpt/model/base.py rename to nemo/collections/llm/gpt/model/base.py index 7aaac96fdc4f..c6db9b8cbd80 100644 --- a/nemo/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -7,8 +7,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from torch.optim import Optimizer -from nemo import io -from nemo.lightning import get_vocab_size +from nemo.lightning import get_vocab_size, io from nemo.lightning.base import ModelConfig from nemo.lightning.megatron_parallel import MaskedTokenLossReduction diff --git a/nemo/llm/gpt/model/mistral_7b.py b/nemo/collections/llm/gpt/model/mistral_7b.py similarity index 96% rename from nemo/llm/gpt/model/mistral_7b.py rename to nemo/collections/llm/gpt/model/mistral_7b.py index 83d3b3412a39..e0035a086fbe 100644 --- a/nemo/llm/gpt/model/mistral_7b.py +++ b/nemo/collections/llm/gpt/model/mistral_7b.py @@ -5,8 +5,8 @@ import torch import torch.nn.functional as F -from nemo import io -from nemo.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.lightning import io, teardown if TYPE_CHECKING: from transformers import MistralConfig, MistralForCausalLM @@ -21,7 +21,7 @@ class Mistral7BConfig(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = True + apply_query_key_layer_scaling: bool = False # TODO: Should this be True? num_layers: int = 32 hidden_size: int = 4096 @@ -56,6 +56,9 @@ def apply(self, output_path: Path) -> Path: self.convert_state(source, target) self.nemo_save(output_path, trainer) + teardown(trainer, target) + del trainer, target + return output_path def convert_state(self, source, target): @@ -90,11 +93,12 @@ def make_vocab_size_divisible_by(mistral_vocab_size): return base output = Mistral7BConfig( - seq_length=source.max_position_embeddings, + seq_length=source.sliding_window, num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.max_position_embeddings, init_method_std=source.initializer_range, layernorm_epsilon=source.rms_norm_eps, num_query_groups=source.num_key_value_heads, diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py new file mode 100644 index 000000000000..848a83f5dc08 --- /dev/null +++ b/nemo/collections/llm/utils.py @@ -0,0 +1,16 @@ +from typing import Any, Callable, TypeVar + +T = TypeVar('T', bound=Callable[..., Any]) + + +def task(*args: Any, **kwargs: Any) -> Callable[[T], T]: + try: + import nemo_sdk as sdk + + return sdk.task(*args, **kwargs) + except ImportError: + # Return a no-op function + def noop_decorator(func: T) -> T: + return func + + return noop_decorator diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py deleted file mode 100644 index 1b541ff7ba34..000000000000 --- a/nemo/io/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from nemo.io.api import export_ckpt, import_ckpt, load, load_ckpt, model_exporter, model_importer -from nemo.io.capture import reinit -from nemo.io.connector import Connector, ModelConnector -from nemo.io.mixin import ConnectorMixin, IOMixin -from nemo.io.pl import TrainerCheckpoint, is_distributed_ckpt -from nemo.io.state import 
TransformCTX, apply_transforms, state_transform - -__all__ = [ - "apply_transforms", - "Connector", - "ConnectorMixin", - "IOMixin", - "import_ckpt", - "is_distributed_ckpt", - "export_ckpt", - "load", - "load_ckpt", - "ModelConnector", - "model_importer", - "model_exporter", - 'reinit', - "state_transform", - "TrainerCheckpoint", - "TransformCTX", -] diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index afbdb39f42d4..e54f223f91cc 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -3,6 +3,12 @@ from lightning_fabric.plugins.environments import slurm from pytorch_lightning import plugins as _pl_plugins +# This is here to import it once, which improves the speed of launch when in debug-mode +try: + import transformer_engine # noqa +except ImportError: + pass + from nemo.lightning.base import get_vocab_size, teardown from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py index 65bc1310f426..9cf2d9a44f35 100644 --- a/nemo/lightning/base.py +++ b/nemo/lightning/base.py @@ -9,7 +9,7 @@ from pytorch_lightning import LightningModule, Trainer from torch import nn -from nemo import io +from nemo.lightning import io DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo" NEMO_CACHE_HOME = Path(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME)) @@ -18,27 +18,7 @@ DEFAULT_NEMO_MODELS_CACHE = NEMO_CACHE_HOME / "models" NEMO_MODELS_CACHE = Path(os.getenv("NEMO_MODELS_CACHE", DEFAULT_NEMO_MODELS_CACHE)) -# -# @dataclass -# class DataConfig: -# seq_length: int -# micro_batch_size: int = 4 -# global_batch_size: int = 8 -# rampup_batch_size: Optional[List[int]] = None -# train_drop_last: bool = True -# val_drop_last: bool = True -# test_drop_last: bool = True -# num_workers: int = 8 -# pin_memory: bool = True -# persistent_workers: bool = False -# -# @property -# def num_microbatches(self) -> int: -# from apex.transformer.pipeline_parallel.utils import get_num_microbatches -# -# return get_num_microbatches() -# -# + ModelT = TypeVar("ModelT", bound=LightningModule) @@ -66,7 +46,11 @@ def init(self, *args, data=None, cpu: bool = False, **kwargs) -> ModelT: return model -def get_vocab_size(config, vocab_size: int, make_vocab_size_divisible_by: int = 128,) -> int: +def get_vocab_size( + config, + vocab_size: int, + make_vocab_size_divisible_by: int = 128, +) -> int: from nemo.utils import logging after = vocab_size diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 794300db72f0..88e2f3436699 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -20,7 +20,10 @@ def create_dataloader( def setup_microbatch_calculator( - global_rank: int, micro_batch_size: int, global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, + global_rank: int, + micro_batch_size: int, + global_batch_size: int, + rampup_batch_size: Optional[List[int]] = None, ) -> None: """ Initializes the data for distributed training by setting up the microbatch calculator @@ -41,7 +44,6 @@ def setup_microbatch_calculator( """ from nemo.lightning._strategy_lib import NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE - from nemo.utils import AppState app_state = AppState() @@ -189,8 +191,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... 
class MegatronPretrainingSampler(BaseMegatronSampler): diff --git a/nemo/lightning/io/__init__.py b/nemo/lightning/io/__init__.py new file mode 100644 index 000000000000..d1a193c5e728 --- /dev/null +++ b/nemo/lightning/io/__init__.py @@ -0,0 +1,25 @@ +from nemo.lightning.io.api import export_ckpt, import_ckpt, load, load_ckpt, model_exporter, model_importer +from nemo.lightning.io.capture import reinit +from nemo.lightning.io.connector import Connector, ModelConnector +from nemo.lightning.io.mixin import ConnectorMixin, IOMixin +from nemo.lightning.io.pl import TrainerCheckpoint, is_distributed_ckpt +from nemo.lightning.io.state import TransformCTX, apply_transforms, state_transform + +__all__ = [ + "apply_transforms", + "Connector", + "ConnectorMixin", + "IOMixin", + "import_ckpt", + "is_distributed_ckpt", + "export_ckpt", + "load", + "load_ckpt", + "ModelConnector", + "model_importer", + "model_exporter", + 'reinit', + "state_transform", + "TrainerCheckpoint", + "TransformCTX", +] diff --git a/nemo/io/api.py b/nemo/lightning/io/api.py similarity index 96% rename from nemo/io/api.py rename to nemo/lightning/io/api.py index c8fe3c04a811..9af1d3d2a9d6 100644 --- a/nemo/io/api.py +++ b/nemo/lightning/io/api.py @@ -5,8 +5,8 @@ import fiddle as fdl import pytorch_lightning as pl -from nemo.io.mixin import ConnectorMixin, ConnT, ModelConnector -from nemo.io.pl import TrainerCheckpoint +from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector +from nemo.lightning.io.pl import TrainerCheckpoint CkptType = TypeVar("CkptType") @@ -128,14 +128,14 @@ def import_ckpt( path for the imported checkpoint; if not provided, the importer's default path will be used. The 'overwrite' parameter enables the replacement of existing data at the output path, which is useful when updating models with new data and discarding old checkpoint files. - - For instance, using `import_ckpt(Mistral7BModel(), "hf")` initiates the import process - by searching for a registered model importer tagged with "hf". In NeMo, `HFMistral7BImporter` + + For instance, using `import_ckpt(Mistral7BModel(), "hf")` initiates the import process + by searching for a registered model importer tagged with "hf". In NeMo, `HFMistral7BImporter` is registered under this tag via: - `@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1")`. - This links `Mistral7BModel` to `HFMistral7BImporter`, designed for HuggingFace checkpoints. - The importer then processes and integrates these checkpoints into `Mistral7BModel` for further - fine-tuning. + `@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1")`. + This links `Mistral7BModel` to `HFMistral7BImporter`, designed for HuggingFace checkpoints. + The importer then processes and integrates these checkpoints into `Mistral7BModel` for further + fine-tuning. Args: model (pl.LightningModule): The model into which the checkpoint will be imported. @@ -188,7 +188,7 @@ def export_ckpt( ) -> Path: """ Exports a checkpoint from a model using the model's associated exporter, typically for - the purpose of sharing a model that has been fine-tuned or customized within NeMo. + the purpose of sharing a model that has been fine-tuned or customized within NeMo. This function leverages the ConnectorMixin interface to seamlessly integrate the model's state into an external checkpoint format. 
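Pieced together from the ``api.py`` docstrings above, the intended import flow looks roughly like the sketch below. This is illustrative only: it assumes the registered "hf" importer and its default path (``mistralai/Mistral-7B-v0.1``) are reachable, and the location of the resulting checkpoint depends on the importer's cache settings.

.. code-block:: python

    # Sketch based on the docstrings in this diff, not a verbatim example from the patch.
    from nemo.collections import llm

    # Task-level API: convert a Hugging Face checkpoint into NeMo format and
    # return the path of the imported checkpoint.
    ckpt_path = llm.import_ckpt(model=llm.Mistral7BModel(), source="hf")

    # ConnectorMixin route from the mixin docstring: returns a model initialized
    # through the same registered "hf" importer.
    model = llm.Mistral7BModel.import_from("hf")

``export_ckpt`` is the mirror operation: it takes a NeMo checkpoint path plus a target tag and hands off to the exporter registered for that tag.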
diff --git a/nemo/io/capture.py b/nemo/lightning/io/capture.py similarity index 96% rename from nemo/io/capture.py rename to nemo/lightning/io/capture.py index 2a65d18c15e3..910506f13147 100644 --- a/nemo/io/capture.py +++ b/nemo/lightning/io/capture.py @@ -42,14 +42,12 @@ def wrapper(*args, **kwargs): @runtime_checkable class IOProtocol(Protocol, Generic[SelfT]): @property - def __io__(self) -> fdl.Config[SelfT]: - ... + def __io__(self) -> fdl.Config[SelfT]: ... @runtime_checkable class ReInitProtocol(Protocol, Generic[SelfT]): - def reinit(self) -> SelfT: - ... + def reinit(self) -> SelfT: ... def reinit(configurable: IOProtocol[SelfT]) -> SelfT: diff --git a/nemo/io/connector.py b/nemo/lightning/io/connector.py similarity index 92% rename from nemo/io/connector.py rename to nemo/lightning/io/connector.py index bf5f88f95992..cd77abf9dc1c 100644 --- a/nemo/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -29,19 +29,19 @@ class Connector(BasePath, Generic[SourceT, TargetT]): ------- init() -> TargetT: Should be implemented to initialize the target type from the source type. - + apply(output_path: Path) -> Path: Should be implemented to apply the transformation and save the result at the output path. - + __new__(cls, *args, **kwargs) -> 'Connector': Creates a new instance of the connector, using default_path if no path is provided. - + __call__(output_path: Optional[Path] = None, overwrite: bool = False) -> Path: Processes the transformation and handles file operations like overwriting. - + local_path(base_path: Optional[Path] = None) -> Path: Computes the local path for storage based on a base path or a default cache home. - + is_in_cache(base_path: Optional[Path] = None) -> bool: Checks if the transformed data is already cached at the specified base path. """ @@ -96,10 +96,10 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): ------- nemo_setup(model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: Sets up the model and trainer using a specified strategy, preparing it for training or inference. - + nemo_save(output_path: Path, trainer: pl.Trainer): Saves the model's state to the specified path using the trainer's current strategy. - + nemo_load(path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True) -> Tuple[Any, pl.Trainer]: Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. """ @@ -118,7 +118,9 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = """ from nemo.lightning import MegatronStrategy, Trainer - _trainer = trainer or Trainer(devices=1, accelerator="cpu", strategy=MegatronStrategy()) + _trainer = trainer or Trainer( + devices=1, accelerator="cpu", strategy=MegatronStrategy(store_optimizer_states=False) + ) _trainer.strategy.connect(model) _trainer.strategy.setup_environment() @@ -156,8 +158,8 @@ def nemo_load( ------- Tuple[pl.LightningModule, pl.Trainer]: The loaded model and the trainer configured with the model. 
""" - from nemo.io.api import load_ckpt from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib + from nemo.lightning.io.api import load_ckpt model = load_ckpt(path).model _trainer = trainer or Trainer(devices=1, accelerator="cpu" if cpu else "gpu", strategy=MegatronStrategy()) @@ -177,3 +179,13 @@ def nemo_load( _trainer.strategy.load_checkpoint(path) return model, _trainer + + def local_path(self, base_path: Optional[Path] = None) -> Path: + if base_path: + _base = base_path + else: + from nemo.lightning.base import NEMO_MODELS_CACHE + + _base = Path(NEMO_MODELS_CACHE) + + return _base / str(self).replace("://", "/") diff --git a/nemo/io/mixin.py b/nemo/lightning/io/mixin.py similarity index 98% rename from nemo/io/mixin.py rename to nemo/lightning/io/mixin.py index bba6677b452b..b5ee76a2fe03 100644 --- a/nemo/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -8,8 +8,8 @@ from cloudpickle import dump from typing_extensions import Self -from nemo.io.capture import IOProtocol -from nemo.io.connector import ModelConnector +from nemo.lightning.io.capture import IOProtocol +from nemo.lightning.io.connector import ModelConnector ConnT = TypeVar('ConnT', bound=ModelConnector) @@ -35,8 +35,8 @@ class IOMixin: Examples -------- - from nemo import io - + from nemo.lightning import io + class ExampleClass(io.IOMixin): def __init__(self, param1, param2): super().__init__() @@ -46,7 +46,7 @@ def __init__(self, param1, param2): # Creating an instance of ExampleClass example = ExampleClass('value1', 'value2') example_copy = io.reinit(example) - + Note: For more information on `fdl.Config`, refer to the Fiddle library documentation at @@ -168,9 +168,9 @@ def import_from(cls, path: str) -> Self: Args: path (str): The path to the model file to be imported. - + Example: - from nemo import llm + from nemo.collections import llm model = llm.Mistral7BModel.import_from("hf") Returns @@ -285,7 +285,7 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa @classmethod def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: """ - Retrieves the appropriate model connector based on the file extension and path, + Retrieves the appropriate model connector based on the file extension and path, distinguishing between importers and exporters. Args: diff --git a/nemo/io/pl.py b/nemo/lightning/io/pl.py similarity index 98% rename from nemo/io/pl.py rename to nemo/lightning/io/pl.py index ba9b5be72cab..fba94f5e3a55 100644 --- a/nemo/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -11,8 +11,8 @@ from torch import nn from typing_extensions import Self, override -from nemo.io.capture import IOProtocol -from nemo.io.mixin import IOMixin +from nemo.lightning.io.capture import IOProtocol +from nemo.lightning.io.mixin import IOMixin if TYPE_CHECKING: from nemo.lightning.pytorch.strategies import MegatronStrategy @@ -53,11 +53,9 @@ def construct_extra(cls, strategy: "MegatronStrategy") -> Dict[str, Any]: class TrainerCkptProtocol(Protocol): @classmethod - def from_strategy(cls, strategy: "MegatronStrategy") -> Self: - ... + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: ... - def io_dump(self, output: Path): - ... + def io_dump(self, output: Path): ... 
class MegatronCheckpointIO(CheckpointIO): diff --git a/nemo/io/state.py b/nemo/lightning/io/state.py similarity index 97% rename from nemo/io/state.py rename to nemo/lightning/io/state.py index d978cd0ade8e..ed481cfcfe08 100644 --- a/nemo/io/state.py +++ b/nemo/lightning/io/state.py @@ -26,11 +26,11 @@ def apply_transforms( transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = None, ) -> TargetModuleT: """ - Applies a series of transformations to adapt the state dictionary of a source module to + Applies a series of transformations to adapt the state dictionary of a source module to match the structure of a target module's state dictionary. This function renames keys according to a provided mapping and modifies values using a list - of transformation functions. Each transformation function typically is decorated + of transformation functions. Each transformation function typically is decorated with `io.state_transform`. Args: @@ -91,7 +91,12 @@ def scale_weights(ctx): _target = target.module target_state = _target.state_dict() - ctx = TransformCTX(source=_source, source_state=_source.state_dict(), target=_target, target_state=target_state,) + ctx = TransformCTX( + source=_source, + source_state=_source.state_dict(), + target=_target, + target_state=target_state, + ) for key, val in mapping.items(): ctx = StateDictTransform(key, val)(ctx) @@ -349,16 +354,15 @@ def _match_keys(keys: List[str], pattern: str) -> np.ndarray: @overload def state_transform( - source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], -) -> Callable[[F], StateDictTransform[F]]: - ... + source_key: Union[str, Tuple[str, ...], Dict[str, str]], + target_key: Union[str, Tuple[str, ...]], +) -> Callable[[F], StateDictTransform[F]]: ... @overload def state_transform( source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], fn: F -) -> StateDictTransform[F]: - ... +) -> StateDictTransform[F]: ... def state_transform( diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 899f2fb2c06c..8106b83a41d1 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -31,11 +31,9 @@ @runtime_checkable class PrecisionPluginProtocol(Protocol[DataT]): - def convert_input(self, data: DataT) -> DataT: - ... + def convert_input(self, data: DataT) -> DataT: ... - def convert_output(self, output: torch.Tensor) -> torch.Tensor: - ... + def convert_output(self, output: torch.Tensor) -> torch.Tensor: ... 
def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT: @@ -122,7 +120,7 @@ def __init__( if vp_size is not None: if len(_pipeline) == 1 and parallel_state.get_pipeline_model_parallel_world_size() > 1: - from nemo import io + from nemo.lightning import io parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size) for i in range(1, vp_size): @@ -212,7 +210,10 @@ def forward( if wrap_forward_step: _data_step = data_step or self.data_step forward_step_func = self.wrapped_forward_step( - _forward_step, data_step=_data_step, loss_reduction=loss_reduction, context=context, + _forward_step, + data_step=_data_step, + loss_reduction=_loss_reduction, + context=context, ) else: forward_step_func = _forward_step @@ -259,7 +260,11 @@ def forward( return loss_mean def wrapped_forward_step( - self, forward_step, loss_reduction, context, data_step, + self, + forward_step, + loss_reduction, + context, + data_step, ) -> Callable[[nn.Module, DataT], Tuple[torch.Tensor, "MegatronCallbackProtocol"]]: """The method wraps the forward step function and returns a callable. @@ -309,7 +314,11 @@ def wrapped_forward_step_func(dataloader_iter, model): # callback self._setup_module( - forward_callback, batch=batch, model=self, forward_module=model, tensor=output_tensor, + forward_callback, + batch=batch, + model=self, + forward_module=model, + tensor=output_tensor, ) if self.precision_plugin and parallel_state.is_pipeline_last_stage(): @@ -728,29 +737,21 @@ def __contains__(self, callback_object) -> bool: class CallbackMethods: - def on_megatron_step_start(self, *args, **kwargs) -> None: - ... + def on_megatron_step_start(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_start(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_start(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_callback(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_callback(self, *args, **kwargs) -> None: ... - def on_megatron_microbatch_end(self, *args, **kwargs) -> None: - ... + def on_megatron_microbatch_end(self, *args, **kwargs) -> None: ... - def on_megatron_reduce_microbatches_start(self, *args, **kwargs) -> None: - ... + def on_megatron_reduce_microbatches_start(self, *args, **kwargs) -> None: ... - def on_megatron_reduce_microbatches_end(self, *args, **kwargs) -> None: - ... + def on_megatron_reduce_microbatches_end(self, *args, **kwargs) -> None: ... - def on_megatron_log_step_end(self, *args, **kwargs) -> None: - ... + def on_megatron_log_step_end(self, *args, **kwargs) -> None: ... - def on_megatron_step_end(self, *args, **kwargs) -> None: - ... + def on_megatron_step_end(self, *args, **kwargs) -> None: ... ReductionT = TypeVar("ReductionT") @@ -778,8 +779,7 @@ def reduce(self, losses_reduced_per_micro_batch: Sequence[ReductionT]) -> torch. @runtime_checkable class MegatronCallbackProtocol(Protocol): - def __call__(self, tensor: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - ... + def __call__(self, tensor: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: ... @runtime_checkable @@ -796,8 +796,7 @@ def __call__( decoder_seq_length: Optional[int] = None, forward_only: bool = False, collect_non_loss_data: bool = False, - ) -> list: - ... + ) -> list: ... 
def _calc_number_of_params(model: List[nn.Module]) -> int: diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index af7054526957..6c3d556816d2 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -27,11 +27,16 @@ class MegatronMixedPrecision(MixedPrecision): - def __init__(self, precision: Literal["16-mixed", "bf16-mixed"], amp_O2: bool = True, device="cuda",) -> None: + def __init__( + self, + precision: Literal["16-mixed", "bf16-mixed"], + amp_O2: bool = False, + device="cuda", + ) -> None: if precision == "bf16-mixed": scaler = None else: - scaler = GradScaler(init_scale=2 ** 32, growth_interval=1000, hysteresis=2) + scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2) super().__init__(precision, device, scaler) @@ -94,7 +99,11 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2: return optimizer - return MainParamsOptimizerWrapper(optimizer, fp32_grad_accum=True, contiguous_grad_bucket=True,) + return MainParamsOptimizerWrapper( + optimizer, + fp32_grad_accum=True, + contiguous_grad_bucket=True, + ) def convert_input(self, data: AnyT) -> AnyT: """Convert model inputs (forward) to the floating point precision type of this plugin. diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 65986b2a4855..c002ecf7fd68 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -27,9 +27,8 @@ from torch.utils.data import DataLoader from typing_extensions import override -from nemo import io -from nemo.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol -from nemo.lightning import _strategy_lib +from nemo.lightning import _strategy_lib, io +from nemo.lightning.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import MegatronProgressBar @@ -63,6 +62,7 @@ def __init__( find_unused_parameters: bool = False, enable_nemo_ckpt_io: bool = True, ckpt_type: TrainerCkptProtocol = TrainerCheckpoint, + ckpt_include_optimizer: bool = False, lazy_init: bool = False, **kwargs, ) -> None: @@ -83,6 +83,7 @@ def __init__( self.enable_nemo_ckpt_io = enable_nemo_ckpt_io self.ckpt_type = ckpt_type self.lazy_init = lazy_init + self.ckpt_include_optimizer = ckpt_include_optimizer # used in NVIDIA NGC PyTorch containers _strategy_lib.enable_nvidia_optimizations() @@ -174,6 +175,7 @@ def setup_distributed(self) -> None: super().setup_distributed() from megatron.core import parallel_state + from nemo.utils import AppState # init model parallel if needed @@ -227,6 +229,7 @@ def configure_ddp(self) -> None: def _setup_model(self, model: nn.Module) -> DistributedDataParallel: """Only called when we need to wrap the model for pytorch's ddp.""" from megatron.core import parallel_state + from nemo.utils import AppState app_state = AppState() @@ -345,10 +348,10 @@ def optimizer_sharded_state_dict(self): def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: - checkpoint['state_dict'] = OrderedDict([]) # remove device state_dict - checkpoint['sharded_state_dict'] = self.megatron_parallel.sharded_state_dict() + checkpoint["state_dict"] = OrderedDict([]) # remove 
device state_dict + checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() if self.trainer.state.fn == TrainerFn.FITTING: - checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] + checkpoint["optimizer_states"] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) if self.enable_nemo_ckpt_io and self.is_global_zero and self.ckpt_type: @@ -367,9 +370,9 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: sharded_state_dict = {} sharded_state_dict["state_dict"] = self.megatron_parallel.sharded_state_dict() - # if self.trainer.state.fn == TrainerFn.FITTING: - # if self.lightning_module.optimizers(use_pl_optimizer=False): - # sharded_state_dict["optimizer_states"] = [self.optimizer_sharded_state_dict()] + if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING: + if self.lightning_module.optimizers(use_pl_optimizer=False): + sharded_state_dict["optimizer_states"] = [self.optimizer_sharded_state_dict()] checkpoint = self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=sharded_state_dict) diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index da04a93eef05..b4483d4af4b9 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -4,7 +4,7 @@ import pytorch_lightning as pl from typing_extensions import Self -from nemo.io.mixin import IOMixin +from nemo.lightning.io.mixin import IOMixin class Trainer(pl.Trainer, IOMixin): diff --git a/nemo/llm/gpt/data/__init__.py b/nemo/llm/gpt/data/__init__.py deleted file mode 100644 index 1c1c9ce5d525..000000000000 --- a/nemo/llm/gpt/data/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from nemo.llm.gpt.data.dolly import DollyDataModule -from nemo.llm.gpt.data.fine_tuning import FineTuningDataModule -from nemo.llm.gpt.data.mock import MockDataModule -from nemo.llm.gpt.data.pre_training import PreTrainingDataModule -from nemo.llm.gpt.data.squad import SquadDataModule - -__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] diff --git a/nemo/llm/gpt/model/__init__.py b/nemo/llm/gpt/model/__init__.py deleted file mode 100644 index 05c3e9928fab..000000000000 --- a/nemo/llm/gpt/model/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from nemo.llm.gpt.model.base import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step -from nemo.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel - -__all__ = [ - "GPTConfig", - "GPTModel", - "Mistral7BConfig", - "Mistral7BModel", - "MaskedTokenLossReduction", - "gpt_data_step", - "gpt_forward_step", -] diff --git a/tests/io/__init__.py b/tests/lightning/io/__init__.py similarity index 100% rename from tests/io/__init__.py rename to tests/lightning/io/__init__.py diff --git a/tests/io/test_api.py b/tests/lightning/io/test_api.py similarity index 65% rename from tests/io/test_api.py rename to tests/lightning/io/test_api.py index d4c317bf2e9f..9872d0860193 100644 --- a/tests/io/test_api.py +++ b/tests/lightning/io/test_api.py @@ -1,6 +1,6 @@ -from nemo import io from nemo import lightning as nl -from nemo import llm +from nemo.collections import llm +from nemo.lightning import io class TestLoad: @@ -8,7 +8,12 @@ def test_reload_ckpt(self, tmpdir): trainer = nl.Trainer(devices=1, accelerator="cpu", strategy=nl.MegatronStrategy()) # model = llm.Mistral7BModel() model = llm.GPTModel( - 
llm.GPTConfig(num_layers=2, hidden_size=1024, ffn_hidden_size=4096, num_attention_heads=8,) + llm.GPTConfig( + num_layers=2, + hidden_size=1024, + ffn_hidden_size=4096, + num_attention_heads=8, + ) ) ckpt = io.TrainerCheckpoint(model, trainer) diff --git a/tests/io/test_mixin.py b/tests/lightning/io/test_mixin.py similarity index 91% rename from tests/io/test_mixin.py rename to tests/lightning/io/test_mixin.py index ed898d435609..824608db6bf0 100644 --- a/tests/io/test_mixin.py +++ b/tests/lightning/io/test_mixin.py @@ -1,4 +1,4 @@ -from nemo import io +from nemo.lightning import io class DummyClass(io.IOMixin): diff --git a/tests/io/test_state.py b/tests/lightning/io/test_state.py similarity index 99% rename from tests/io/test_state.py rename to tests/lightning/io/test_state.py index bb5dc4a9af3d..f368f3ce02ce 100644 --- a/tests/io/test_state.py +++ b/tests/lightning/io/test_state.py @@ -1,7 +1,7 @@ import pytest from torch import nn -from nemo.io.state import StateDictTransform, TransformCTX, state_transform +from nemo.lightning.io.state import StateDictTransform, TransformCTX, state_transform class TestStateDictTransform: @@ -141,6 +141,7 @@ def test_transform_with_tuple_target_key_and_multiple_outputs(self, mock_multi_t Test transformation where the target_key is a tuple and the transform function returns multiple values that are then unrolled to these target keys. """ + # Define a transformation that splits the input into two parts def split_transform(ctx, x): return x - 1, x + 1 diff --git a/tests/lightning/test_data.py b/tests/lightning/test_data.py index e3143b6da03c..7acdcc91b486 100644 --- a/tests/lightning/test_data.py +++ b/tests/lightning/test_data.py @@ -6,11 +6,15 @@ 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_finetuning_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import FineTuningDataModule + from nemo.collections.llm.gpt.data import FineTuningDataModule dataset_root = 'random_root' datamodule = FineTuningDataModule( - dataset_root, seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234, + dataset_root, + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, ) datamodule.train_dataloader() @@ -21,9 +25,14 @@ def test_finetuning_module(mock_gpt_sft_dataset) -> None: 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_dolly_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import DollyDataModule + from nemo.collections.llm.gpt.data import DollyDataModule - datamodule = DollyDataModule(seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234,) + datamodule = DollyDataModule( + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, + ) datamodule.train_dataloader() mock_gpt_sft_dataset.assert_called_once() @@ -33,9 +42,14 @@ def test_dolly_module(mock_gpt_sft_dataset) -> None: 'nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset.__init__', return_value=None ) def test_squad_module(mock_gpt_sft_dataset) -> None: - from nemo.llm.gpt.data import SquadDataModule + from nemo.collections.llm.gpt.data import SquadDataModule - datamodule = SquadDataModule(seq_length=2048, micro_batch_size=4, global_batch_size=8, seed=1234,) + datamodule = SquadDataModule( + seq_length=2048, + micro_batch_size=4, + global_batch_size=8, + seed=1234, + ) datamodule.train_dataloader() mock_gpt_sft_dataset.assert_called_once() @@ 
-45,7 +59,7 @@ def test_squad_module(mock_gpt_sft_dataset) -> None: # @patch('megatron.core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder') # @patch('nemo.lightning.pytorch.trainer.Trainer') # def test_pretraining_module(mock_pretraining_dataset_builder, mock_trainer) -> None: -# from nemo.llm.gpt.data import PreTrainingDataModule +# from nemo.collections.llm.gpt.data import PreTrainingDataModule # # datamodule = PreTrainingDataModule( # path=Path('random_path'), diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 877e6a39a976..31d20170c0b6 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -103,7 +103,7 @@ def test_init_with_virtual_pipeline(self, mocker, mock_pipeline): mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=True) mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size') mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank') - mocker.patch('nemo.io.reinit', return_value=mock_pipeline) + mocker.patch('nemo.lightning.io.reinit', return_value=mock_pipeline) megatron_parallel = mp.MegatronParallel(mock_pipeline, vp_size=2, cpu=True) From a2a75c5da06b21a24e83328cb55e7cb017d9faa4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 19:06:24 -0700 Subject: [PATCH 17/47] Fix typo in HF tutorial (#9302) (#9304) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index 1771d65c5e50..73a8ebc29ee3 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -916,7 +916,7 @@ { "cell_type": "code", "source": [ - "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"v2\")" + "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"_v2\")" ], "metadata": { "id": "WDgwrr2aQyUS" From cde0b2b226fb519798008f96b0b70d271e503d49 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Fri, 24 May 2024 13:28:16 -0700 Subject: [PATCH 18/47] Expand documentation for data parallelism and distributed optimizer (#9227) * Add distributed optimizer to docs Signed-off-by: Tim Moon * Debug RST table Signed-off-by: Tim Moon * Review suggestions from @jgerh Signed-off-by: Tim Moon * Copyedits and formatting changes Signed-off-by: Tim Moon Co-authored-by: Tim Moon Co-authored-by: Jennifer Gerhold --------- Signed-off-by: Tim Moon Co-authored-by: Jennifer Gerhold --- docs/source/features/parallelisms.rst | 112 ++++++++++++++++++++------ 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index d5e86e46a49d..4cc493f40024 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,22 +3,87 @@ Parallelisms ------------ -NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitrarily): +NeMo Megatron supports five types of parallelism (which can be mixed together arbitrarily). 
+ +Data Parallelism +^^^^^^^^^^^^^^^^ + +Data Parallelism (DP) creates identical copies of the model across +multiple GPUs. Data batches are distributed between GPUs so that the +GPUs can process them independently. While compute is efficiently +distributed between GPUs, communication is required in order to keep +the model copies consistent with each other. Distributed Data Parallelism -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Distributed Data Parallelism (DDP) creates idential copies of the model across multiple GPUs. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Distributed Data Parallelism (DDP) keeps model copies consistent by +synchronizing parameter gradients before each optimization step. More +specifically, it sums gradients over all model copies using an +all-reduce communication collective. .. image:: ../nlp/nemo_megatron/images/ddp.gif :align: center :width: 800px :alt: Distributed Data Parallel +Distributed Optimizer (ZeRO-1) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ZeRO-1 algorithm keeps model copies consistent by sharding the +optimizer state between GPUs. During each optimization step, the +parameter gradients are first summed and sharded (with a +reduce-scatter collective), each GPU applies an optimization to its +local shard of the parameters, and the updated parameter shards are +broadcast to update all of the model copies (with an all-gather +collective). This approach is attractive for large models since +sharding the optimizer state can significantly reduce its memory +footprint on individual GPUs. It also has, in theory, the same +communication volume as DDP and its communication pattern has more +opportunities for overlapping with compute. + +Enable Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~ + +DDP is the default parallelism scheme when NeMo is run on multiple +GPUs. Enabling other parallelism schemes in the model configuration +will decrease the size of the DP group, that is the number of +identical model copies. + +To enable the distributed optimizer, set +``model.optim.name=distributed_fused_adam`` in the model +configuration. It can be configured with the following options: + +=========================== ========= ================================================================================================================================== +Option Default Description +=========================== ========= ================================================================================================================================== +``dtype`` fp32 Optimizer state datatype +``grad_sync_dtype`` ``dtype`` Gradient reduce-scatter datatype +``overlap_grad_sync`` True Overlap gradient reduce-scatter with compute +``overlap_param_sync`` False Overlap parameter all-gather with compute +``bucket_cap_mb`` 100 Buffer size (in MiB) for internal state and workspaces. Larger buckets have lower runtime overheads but may increase memory usage. +``contiguous_param_buffer`` False Allocate parameters as views into a large buffer. Helps avoid some data copies. +``contiguous_grad_buffer`` True Allocate parameter gradients as views into a large buffer. Helps avoid some data copies. +=========================== ========= ================================================================================================================================== + +See the keyword arguments in `Apex DistributedFusedAdam `_ and `NeMo MegatronDistributedFusedAdam `_ for a full list of distributed optimizer options. 
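For orientation, a minimal sketch of what these overrides could look like when applied programmatically with OmegaConf is shown below; the same keys live under ``model.optim`` in the training YAML. The option names come from the table above, the values are illustrative, and the usual optimizer fields (learning rate, weight decay, and so on) are omitted.

.. code-block:: python

    from omegaconf import OmegaConf

    # Illustrative override of the optimizer section only; merge it into a full
    # NeMo training config before launching training.
    optim_overrides = OmegaConf.create(
        {
            "model": {
                "optim": {
                    "name": "distributed_fused_adam",
                    "overlap_grad_sync": True,   # overlap gradient reduce-scatter with compute
                    "overlap_param_sync": True,  # overlap parameter all-gather with compute
                    "bucket_cap_mb": 100,        # buffer size in MiB
                }
            }
        }
    )
    # cfg = OmegaConf.merge(cfg, optim_overrides)  # cfg: the loaded training config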
+ +Implement Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DDP in NeMo either uses PyTorch +`DistributedDataParallel `_ +(default) or a custom implementation (if custom multi-precision +training is enabled with ``megatron_amp_O2``). + +The distributed optimizer in NeMo is built on top of +`DistributedFusedAdam `_ +from Apex. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ -**Tensor Parallelism (TP)** is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. +Tensor Parallelism (TP) is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. .. image:: ../nlp/nemo_megatron/images/tp.gif :align: center @@ -31,7 +96,8 @@ Enable Tensor Parallelism To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. **For Tensor Parallelism**: - - Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. + +Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. .. code-block:: yaml @@ -49,7 +115,7 @@ For detailed API usage and additional configurations, consult the `Megatron Core Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ -**Pipeline Parallelism (PP)** is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. +Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. .. image:: ../nlp/nemo_megatron/images/pp.gif :align: center @@ -63,7 +129,8 @@ Enable Pipeline Parallelism To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. **For Pipeline Parallelism**: - - Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. + +Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. .. code-block:: yaml @@ -74,7 +141,7 @@ Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. +The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ -**Sequence Parallelism** extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. 
+Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. .. image:: ../nlp/nemo_megatron/images/sp.gif :align: center @@ -113,12 +180,12 @@ For further information on configuration, refer to the following documentation: Implement Sequence Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo's implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. +The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. Context Parallelism ^^^^^^^^^^^^^^^^^^^ -**Context Parallelism (CP)** is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. +Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. Enable Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -126,7 +193,8 @@ Enable Context Parallelism To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. **For Context Parallelism**: - - Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. + +Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. .. code-block:: yaml @@ -137,18 +205,16 @@ The configuration can be found and modified here: `NeMo Megatron Core Context Co Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo leverages functionalities from both Megatron Core and transformer-engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. - -Additionally, NeMo's CP supports integration with various forms of model parallelism such as TP (Tensor Parallelism), PP (Pipeline Parallelism), and DP (Data Parallelism), ensuring broad usability and flexibility in large-scale model training environments. +NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. 
During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. Visit our source code for more insights into the implementation: -- Megatron Core transformer engine: `Megatron Core `_ -- Transformer Engine repository: `Transformer Engine Code `_ +- `Megatron Core wrappers for Transformer Engine `_ +- `Transformer Engine attention modules `_ Expert Parallelism ^^^^^^^^^^^^^^^^^^ -**Expert Parallelism (EP)** is a type of model parallelism that distributes experts of an MoE across GPUs. +Expert Parallelism (EP) is a type of model parallelism that distributes experts of an MoE across GPUs. .. image:: ../nlp/nemo_megatron/images/ep.png :align: center @@ -158,9 +224,7 @@ Expert Parallelism Enable Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable it users can pass ``model.expert_model_parallel_size=k``, where k is an integer with the desired -expert parallelism level, for example if the model has three experts (i.e. ``model.num_moe_experts=3``), we can specify -k=3 (i.e. via CLI using ``model.expert_model_parallel_size=3``). The number of experts should be exactly divisible by the ``expert_model_parallel_size``. +To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. .. code-block:: yaml @@ -172,13 +236,13 @@ For further information on configuration, refer to the following documentation: Implement Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo's expert parallelism functionality is provided by Megatron-LM repository, please consult the corresponding `Moe-layer `_ for more moe implementation details. +The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. Parallelism nomenclature ^^^^^^^^^^^^^^^^^^^^^^^^ -When reading and modifying NeMo Megatron code you will encounter the following terms. +The following figure illustrates some terms that you may encounter in the NeMo Megatron codebase. .. 
image:: ../nlp/nemo_megatron/images/pnom.gif
    :align: center
    :width: 800px

From c3f19e928bb040351b58f66b5642030a5aea14df Mon Sep 17 00:00:00 2001
From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Date: Fri, 24 May 2024 13:28:31 -0700
Subject: [PATCH 19/47] Update flash attention section in memory_optimizations.rst (#9188)

* Update flash attention section in memory_optimizations.rst

Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com>

* update changes based on comments

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

---------

Signed-off-by: cyanguwa <8636796+cyanguwa@users.noreply.github.com>
Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
---
 docs/source/features/memory_optimizations.rst | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst
index d87cb1e191ca..4d363670fedf 100644
--- a/docs/source/features/memory_optimizations.rst
+++ b/docs/source/features/memory_optimizations.rst
@@ -11,14 +11,26 @@ Flash Attention
 Overview
 ^^^^^^^^
 
-Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. Flash Attention is an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high-bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms.
+Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences.
+
+Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency.
+
+The tiling technique decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step.
+
+The recomputation technique stores the softmax normalization factors (linear to sequence length), instead of the softmax results (quadratic to sequence length), and uses these normalization factors to recompute the attention scores. This reduces the amount of data written to global memory, lowering both the memory requirement and the I/O traffic between global memory and shared memory.
+
+Flash attention lowers the memory footprint and computational complexity from quadratic to linear, and greatly extends the range of sequence lengths allowed in large language models.
+
+The flash attention algorithm was first proposed `here `_. Two of its implementations are `flash-attention `_ by Tri Dao *et al.*, and `fused flash attention `_ by NVIDIA cuDNN.
 
 Turn Flash Attention On and Off
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-In the NeMo Framework, Flash Attention is supported through the Transformer Engine with the inclusion of Flash Attention 2.
By default, Flash Attention is enabled, but the Transformer Engine may switch to a different kernel if the tensor dimensions are not optimal for Flash Attention. Users can completely disable Flash Attention by setting the environment variable ``NVTE_FLASH_ATTN=0``. +In the NeMo framework, flash attention is supported through `Transformer Engine `_, including both of the implementations mentioned above. Transformer Engine selects the appropriate implementation based on input information such as sequence length, number of heads and head dimension. When both implementations are applicable, Transformer Engine prefers cuDNN flash attention on Hopper+ architectures and Tri Dao flash attention on Ampere architectures. + +To disable Tri Dao flash attention, set the environment variable ``NVTE_FLASH_ATTN=0``. To disable cuDNN flash attention, set ``NVTE_FUSED_ATTN=0``. -For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. +For more details on the Dot Product Attention backends supported in Transformer Engine, please refer to the source code at `Transformer Engine's Attention Mechanism `_. Activation Recomputation ------------------------ @@ -28,15 +40,15 @@ Overview Full Activation Recomputation """"""""""""""""""""""""""""" -This method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. +The full activation recomputation method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. Partial Activation Recomputation """""""""""""""""""""""""""""""" -This method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. +The partial activation recomputation method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. Selective Activation Recomputation """""""""""""""""""""""""""""""""" -This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. +The selective activation recomputation method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198. 
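As a rough illustration of how these recomputation modes are usually selected in a NeMo megatron model configuration, the sketch below uses the ``activations_checkpoint_*`` keys found in the example GPT configs. Treat the exact key names and values as assumptions to check against your NeMo version rather than settings taken from this document.

.. code-block:: yaml

    model:
      # Selective recomputation: checkpoint only the memory-heavy attention
      # activations and recompute them in the backward pass.
      activations_checkpoint_granularity: selective

      # Alternative: full recomputation of a number of transformer layers.
      # activations_checkpoint_granularity: full
      # activations_checkpoint_method: block    # or 'uniform'
      # activations_checkpoint_num_layers: 4    # layers recomputed per pipeline stage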
From 251cf66910a24a76ed39ac8e66c9387e5ebfa7fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 25 May 2024 00:31:25 +0200 Subject: [PATCH 20/47] Install alerting (#9311) * ci: Send Slack alerts on CI failure Signed-off-by: Oliver Koenig * chore: Set live Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/scripts/slackHelper.sh | 23 +++++++++++++++++++++++ .github/workflows/cicd-main.yml | 21 +++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 .github/scripts/slackHelper.sh diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh new file mode 100644 index 000000000000..4696cebcf13b --- /dev/null +++ b/.github/scripts/slackHelper.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +function sendSlackMessage() { + + WEBHOOK_URL="$1" + PIPELINE_URL="$2" + + curl -X POST -H "Content-type: application/json" --data "{ + \"blocks\": [ + { + \"type\": \"section\", + \"text\": { + \"type\": \"mrkdwn\", + \"text\": \"\ +🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: + +\" + } + } + ] + }" $WEBHOOK_URL + +} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index dbc7d907580a..53e92e976240 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -6482,9 +6482,8 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - Nemo_CICD_Test: - needs: + needs: - L0_Unit_Tests_GPU - L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Llama @@ -6598,8 +6597,22 @@ jobs: - L2_TTS_Fast_dev_runs_1_Mixer-TTS - L2_TTS_Fast_dev_runs_1_Hifigan - Speech_Checkpoints_tests - + if: always() runs-on: ubuntu-latest steps: # This should depend on all the tests so we block/unblock based on all tests passing - - run: exit 0 + - if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - if: ${{ contains(needs.*.result, 'failure') }} + name: Checkout repository + uses: actions/checkout@v4 + + - if: ${{ contains(needs.*.result, 'failure') }} + run: | + source .github/scripts/slackHelper.sh + + WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }} + PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + + sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" From 1fa961ba03ab5f8c91b278640e29807079373372 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 25 May 2024 00:18:43 -0400 Subject: [PATCH 21/47] typos (#9314) (#9315) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao --- tutorials/00_NeMo_Primer.ipynb | 2 +- tutorials/asr/ASR_Confidence_Estimation.ipynb | 4 ++-- tutorials/asr/ASR_Context_Biasing.ipynb | 2 +- tutorials/asr/Speech_Commands.ipynb | 4 ++-- tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 50aa60260b35..07d7f6b46539 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -588,7 +588,7 @@ "id": "U7Eezf_sAVS0" }, "source": [ - "You might wonder why we didnt explicitly set `citrinet.cfg.optim = cfg.optim`. \n", + "You might wonder why we didn't explicitly set `citrinet.cfg.optim = cfg.optim`. \n", "\n", "This is because the `setup_optimization()` method does it for you! You can still update the config manually." 
] diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb index eb8cd7b11688..9b925adbd777 100644 --- a/tutorials/asr/ASR_Confidence_Estimation.ipynb +++ b/tutorials/asr/ASR_Confidence_Estimation.ipynb @@ -284,7 +284,7 @@ " eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n", " ):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", @@ -307,7 +307,7 @@ " current_word_line = \"\"\n", " for word, score in zip(transcript_list, confidence_scores):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index dd2e8176ad33..7171510f4e0d 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -361,7 +361,7 @@ "source": [ "## Create a context-biasing list\n", "\n", - "Now, we need to select the words, recognition of wich we want to improve by CTC-WS context-biasing.\n", + "Now, we need to select the words, recognition of which we want to improve by CTC-WS context-biasing.\n", "Usually, we select only nontrivial words with the lowest recognition accuracy.\n", "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n", "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index 58b719a867fa..438533f0f03a 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -1431,10 +1431,10 @@ "# Lets change the scheduler\n", "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n", "\n", - "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n", + "# \"power\" isn't applicable to CosineAnnealing so let's remove it\n", "optim_sched_cfg.sched.pop('power')\n", "\n", - "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n", + "# \"hold_ratio\" isn't applicable to CosineAnnealing, so let's remove it\n", "optim_sched_cfg.sched.pop('hold_ratio')\n", "\n", "# Set \"min_lr\" to lower value\n", diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index 675fdfd5351c..608685254a0d 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -749,7 +749,7 @@ "source": [ "### Optimizing Threshold\n", "\n", - "As mentioned above, when classifiying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. 
\n", + "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n", "\n", "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." ] From c39204d67c5b28f63bc5b9eed30a4c93002c1584 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 28 May 2024 01:28:03 -0700 Subject: [PATCH 22/47] call set_expert_model_parallel_world_size instead of set_cpu_expert_model_parallel_world_size (#9275) Signed-off-by: Alexandros Koumparoulis --- scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index dbcbb80a7fda..ca9e44f82922 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -233,7 +233,7 @@ def convert(in_file, precision=None) -> None: if __name__ == '__main__': args = get_args() - parallel_state.set_cpu_expert_model_parallel_world_size(1) + parallel_state.set_expert_model_parallel_world_size(1) hf_state_dict, nemo_config = convert(args.input_name_or_path, args.precision) config = load_config(args.hf_model_name, nemo_config) From 7a8da171ed072433db9d615cc0eca132bc8351ca Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 28 May 2024 12:57:08 -0500 Subject: [PATCH 23/47] conv1d stable version (#9330) --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 9fd75ad8a95a..494a9ab6d672 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -1,6 +1,6 @@ accelerated-scan boto3 -causal-conv1d>=1.2.0 +causal-conv1d==1.2.0.post2 einops faiss-cpu fasttext From 5f7b0304f23cbcb64d92d9f511a22bffb7a5cb28 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 28 May 2024 14:05:20 -0700 Subject: [PATCH 24/47] FP8 feature documentation (#9265) * Create fp8.rst Signed-off-by: Kirthi Shankar Sivamani * Update fp8.rst Signed-off-by: Kirthi Shankar Sivamani * add fp8_params Signed-off-by: Kirthi Shankar Sivamani * Update mixed_precision.rst Signed-off-by: Kirthi Shankar Sivamani * review comments Signed-off-by: Kirthi Shankar Sivamani * rm file Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/source/features/mixed_precision.rst | 42 ++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/docs/source/features/mixed_precision.rst 
b/docs/source/features/mixed_precision.rst
index d193752e5475..ba0dfb4e945b 100644
--- a/docs/source/features/mixed_precision.rst
+++ b/docs/source/features/mixed_precision.rst
@@ -4,3 +4,45 @@ Mixed Precision Training
 ------------------------
 
 Mixed precision training significantly enhances computational efficiency by conducting operations in half-precision and fp8 formats, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly.
+
+
+FP8 usage
+=========
+
+Overview
+^^^^^^^^
+
+The NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8-related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_.
+
+.. list-table:: FP8 arguments
+   :widths: 25 75
+   :header-rows: 1
+
+   * - Argument
+     - Description
+   * - transformer_engine
+     - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored.
+   * - fp8
+     - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support.
+   * - fp8_e4m3
+     - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format.
+   * - fp8_hybrid
+     - Training recipe format for FP8. Activations and weight tensors use the E4M3 format, whereas gradients use the E5M2 format to satisfy the additional dynamic range requirement for backward tensors. This is the default setting.
+   * - fp8_margin
+     - The scaling factor for FP8 tensors can be shifted by a factor of ``2^margin`` using this argument.
+   * - fp8_amax_history_len
+     - Window size for amax history. The window size determines how many instances of the most recent absolute max values (amaxes) are stored per tensor.
+   * - fp8_amax_compute_algo
+     - The choice between “max” and “most_recent” specifies how to select an amax value from the given history.
+   * - reduce_amax
+     - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations.
+   * - fp8_params
+     - Indicates whether or not to store module-level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE.
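To make the table above concrete, a minimal sketch of how these arguments might appear in a NeMo model configuration is shown below. The argument names come from the table; their placement under ``model`` and the specific values are illustrative assumptions rather than recommended settings.

.. code-block:: yaml

    model:
      transformer_engine: True      # use Transformer Engine modules
      fp8: True                     # run supported matrix multiplications in FP8
      fp8_e4m3: False               # pure E4M3 recipe disabled in favor of the hybrid recipe
      fp8_hybrid: True              # E4M3 for activations/weights, E5M2 for gradients
      fp8_margin: 0                 # shift scaling factors by 2^margin
      fp8_amax_history_len: 1024    # window size for amax history (illustrative)
      fp8_amax_compute_algo: max    # or most_recent
      reduce_amax: True             # keep scaling factors synchronized across GPUs
      fp8_params: True              # store module-level parameters in FP8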
+ +Resources +^^^^^^^^^ + +- `TE documentation `_ +- `Intro to FP8, floating point formats, and mixed precision training `_ +- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE +- `TE installation `_ From 8a8c45319ef9e2a0e803918c6bb09745341d2647 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 28 May 2024 15:17:43 -0700 Subject: [PATCH 25/47] comment out flaky tests (#9333) --- .github/workflows/cicd-main.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 53e92e976240..1e977a7e717d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -133,7 +133,7 @@ jobs: # chmod -R 777 . - L0_Unit_Tests_GPU: + OPTIONAL_L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] runs-on: self-hosted-azure container: @@ -325,7 +325,7 @@ jobs: # this test is using a 7B model which is too large for GitHub CI # replace the model in this test with a toy model or move the test # to the nightly CI - # L2_Community_LLM_Checkpoints_tests_Baichuan2: + # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: # needs: [cicd-test-container-setup] # runs-on: self-hosted-azure # container: @@ -6484,12 +6484,12 @@ jobs: Nemo_CICD_Test: needs: - - L0_Unit_Tests_GPU + #- OPTIONAL_L0_Unit_Tests_GPU - L0_Unit_Tests_CPU - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon - #- L2_Community_LLM_Checkpoints_tests_Baichuan2 + #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2 - ASR_dev_run_Speech_to_Text - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet - ASR_dev_run_Speech_Pre-training_-_CitriNet From 136aeee276568122745f29a1d58de30c207df0a1 Mon Sep 17 00:00:00 2001 From: Eduardo Vellasques Date: Wed, 29 May 2024 04:10:31 +0200 Subject: [PATCH 26/47] fix typos in convert_mixtral_nemo_to_hf.py and convert_starcoder2_nemo_to_hf.py (#9325) Signed-off-by: evellasques --- scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py | 3 ++- scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index ca9e44f82922..58311d0324c2 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -61,7 +61,7 @@ def load_config(hf_model_name, nemo_config): hf_config.num_key_value_heads = nemo_config.num_query_groups hf_config.num_local_experts = nemo_config.num_moe_experts assert hf_config.num_local_experts > 0, "num_experts must be greater than zero." - hf_config.num_experts_per_tok = nemo_config.num_experts_per_token + hf_config.num_experts_per_tok = nemo_config.moe_router_topk assert hf_config.num_experts_per_tok > 0, "num_experts_per_token must be greater than zero." 
if nemo_config.activation == 'fast-swiglu': hf_config.activation = 'silu' @@ -122,6 +122,7 @@ def convert(in_file, precision=None) -> None: embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' state_dict[hf_embed_weight_name] = param_to_weights(ckpt[embed_weights_base_name]) + head_num = model.cfg.num_attention_heads if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: num_query_groups = head_num else: diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py index b7b85ee826a8..043d1fd35261 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py @@ -266,7 +266,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: config = load_config(args.hf_model_name, nemo_config) model = AutoModelForCausalLM.from_config(config) model.load_state_dict(hf_state_dict, strict=True) - model.save_pretrained(args.out_file) + model.save_pretrained(args.output_path) hf_tokenizer = AutoTokenizer.from_pretrained('bigcode/starcoder2-tokenizer') hf_tokenizer.save_pretrained(args.output_path) logging.info(f'HF checkpoint saved to: {args.output_path}') From a1173eb1884969812a20d58d5be4ccf73b09b036 Mon Sep 17 00:00:00 2001 From: Deva Kumar Gajulamandyam <37027138+gdevakumar@users.noreply.github.com> Date: Wed, 29 May 2024 01:25:48 -0700 Subject: [PATCH 27/47] typos fixed in READMe.rst (#9322) Signed-off-by: Deva Kumar Gajulamandyam --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 0b05bd0390f8..a2c595d62137 100644 --- a/README.rst +++ b/README.rst @@ -108,7 +108,7 @@ Latest News Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and pytorch developers +NVIDIA NeMo Framework is a generative AI framework built for researchers and PyTorch developers working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), and text-to-speech synthesis (TTS). The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia @@ -219,8 +219,8 @@ The NeMo Framework can be installed in a variety of ways, depending on your need * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for isntallation instructions. - * It's higly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for installation instructions. + * It's highly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` Conda ~~~~~ @@ -452,9 +452,9 @@ Megatron Core ~~~~~~~~~~~~~ The NeMo LLM Multimodal Domains require that NVIDIA Megatron Core to be installed. -Megatron core is a library for scaling large transfromer base models. +Megatron core is a library for scaling large transformer base models. NeMo LLM and Multimodal models leverage Megatron Core for model parallelism, -transformer architectures, and optimized pytorch datasets. +transformer architectures, and optimized PyTorch datasets. 
NeMo LLM and Multimodal may need Megatron Core to be updated to a recent version. From cff6b95e74f9048409584092f9891e8de2f455d5 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 29 May 2024 09:36:00 -0700 Subject: [PATCH 28/47] Fix trainer builder when exp_manager is not in config (#9293) * fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * rollback changes Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../modules/stable_diffusion/attention.py | 8 +- .../diffusionmodules/model.py | 11 +- .../diffusionmodules/openaimodel.py | 124 ++++++++++++------ .../stable_diffusion/diffusionmodules/util.py | 19 ++- .../nlp/parts/megatron_trainer_builder.py | 4 +- 5 files changed, 117 insertions(+), 49 deletions(-) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index c70b59d39481..2eeed97db781 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -122,7 +122,11 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, use_te=Fal if use_te: activation = 'gelu' if not glu else 'geglu' # TODO: more parameters to be confirmed, dropout, seq_length - self.net = LayerNormMLP(hidden_size=dim, ffn_hidden_size=inner_dim, activation=activation,) + self.net = LayerNormMLP( + hidden_size=dim, + ffn_hidden_size=inner_dim, + activation=activation, + ) else: norm = nn.LayerNorm(dim) project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) @@ -264,7 +268,7 @@ def __init__( self.query_dim = query_dim self.dim_head = dim_head - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 self.heads = heads self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py index 644efafaf06a..5b874f5f10ad 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py @@ -233,7 +233,10 @@ def __init__( # timestep embedding self.temb = nn.Module() self.temb.dense = nn.ModuleList( - [torch.nn.Linear(self.ch, self.temb_ch), torch.nn.Linear(self.temb_ch, self.temb_ch),] + [ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ] ) # downsampling @@ -669,7 +672,11 @@ def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): ] ) - self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1,) + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) def forward(self, x): x = self.conv_in(x) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 3e301f0b8fc1..30ff0e1a9ff3 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -115,10 +115,14 @@ class AttentionPool2d(nn.Module): """ def __init__( - self, spacial_dim: int, embed_dim: int, 
num_heads_channels: int, output_dim: int = None, + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, ): super().__init__() - self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) + self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) self.num_heads = embed_dim // num_heads_channels @@ -332,7 +336,10 @@ def __init__( self.emb_layers = None self.exchange_temb_dims = False else: - self.emb_layers = nn.Sequential(nn.SiLU(), linear(emb_channels, self.emb_out_channels),) + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(emb_channels, self.emb_out_channels), + ) self.out_layers = nn.Sequential( normalization(self.out_channels, act="silu", gn_groups=resblock_gn_groups), nn.Dropout(p=dropout), @@ -400,7 +407,12 @@ class AttentionBlock(nn.Module): """ def __init__( - self, channels, num_heads=1, num_head_channels=-1, use_checkpoint=False, use_new_attention_order=False, + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, ): super().__init__() self.channels = channels @@ -451,7 +463,7 @@ def count_flops_attn(model, _x, y): # We perform two matmuls with the same number of ops. # The first computes the weight matrix, the second computes # the combination of the value vectors. - matmul_ops = 2 * b * (num_spatial ** 2) * c + matmul_ops = 2 * b * (num_spatial**2) * c model.total_ops += th.DoubleTensor([matmul_ops]) @@ -653,7 +665,10 @@ def __init__( if num_attention_blocks is not None: assert len(num_attention_blocks) == len(self.num_res_blocks) assert all( - map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)),) + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], + range(len(num_attention_blocks)), + ) ) logging.info( f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
" @@ -674,7 +689,9 @@ def __init__( self.predict_codebook_ids = n_embed is not None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.time_embeddings = torch.Tensor(build_timestep_embedding(model_channels, timesteps)) @@ -691,7 +708,9 @@ def __init__( self.label_emb = nn.Sequential( Timestep(model_channels), nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ), ) elif self.num_classes == "sequential": @@ -699,7 +718,9 @@ def __init__( self.adm_in_channels = adm_in_channels self.label_emb = nn.Sequential( nn.Sequential( - linear(adm_in_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(adm_in_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) ) else: @@ -810,26 +831,28 @@ def __init__( use_scale_shift_norm=use_scale_shift_norm, resblock_gn_groups=resblock_gn_groups, ), - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) - if not use_spatial_transformer - else SpatialTransformer( - ch, - num_heads, - dim_head, - depth=transformer_depth_middle, - context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, - use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint, - use_flash_attention=use_flash_attention, - use_te=self.use_te_fp8, - lora_network_alpha=lora_network_alpha, + ( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth_middle, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint, + use_flash_attention=use_flash_attention, + use_te=self.use_te_fp8, + lora_network_alpha=lora_network_alpha, + ) ), ResBlock( ch, @@ -1123,9 +1146,15 @@ def te_fp8_key_mapping(self, unet_dict): # norm_to_q.layer_norm_{weight|bias} -> norm.{weight|bias} # norm_to_q.weight -> to_q.weight new_key = key.replace('attn1.norm.', 'attn1.norm_to_q.layer_norm_') - new_key = new_key.replace('attn1.to_q.weight', 'attn1.norm_to_q.weight',) + new_key = new_key.replace( + 'attn1.to_q.weight', + 'attn1.norm_to_q.weight', + ) new_key = new_key.replace('attn2.norm.', 'attn2.norm_to_q.layer_norm_') - new_key = new_key.replace('attn2.to_q.weight', 'attn2.norm_to_q.weight',) + new_key = new_key.replace( + 'attn2.to_q.weight', + 'attn2.norm_to_q.weight', + ) ### LayerNormMLP # ff.net.layer_norm_{weight|bias} -> ff.net.0.{weight|bias} @@ -1214,7 +1243,10 @@ def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from unexpected_keys = list(set(loaded_keys) - set(expected_keys)) def _find_mismatched_keys( - state_dict, model_state_dict, loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, ): mismatched_keys = [] if ignore_mismatched_sizes: @@ -1234,7 +1266,10 @@ def _find_mismatched_keys( if state_dict is not None: # Whole checkpoint mismatched_keys = 
_find_mismatched_keys( - state_dict, model_state_dict, original_loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, ) error_msgs = self._load_state_dict_into_model(state_dict) return missing_keys, unexpected_keys, mismatched_keys, error_msgs @@ -1329,9 +1364,14 @@ def _forward(self, x, timesteps=None, context=None, y=None, **kwargs): return self.out(h) def forward(self, x, timesteps=None, context=None, y=None, **kwargs): - with transformer_engine.pytorch.fp8_autocast( - enabled=self.use_te_fp8, fp8_recipe=self.fp8_recipe, - ) if self.use_te_fp8 else nullcontext(): + with ( + transformer_engine.pytorch.fp8_autocast( + enabled=self.use_te_fp8, + fp8_recipe=self.fp8_recipe, + ) + if self.use_te_fp8 + else nullcontext() + ): out = self._forward(x, timesteps, context, y, **kwargs) return out @@ -1387,7 +1427,9 @@ def __init__( time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.input_blocks = nn.ModuleList( @@ -1489,11 +1531,15 @@ def __init__( elif pool == "attention": assert num_head_channels != -1 self.out = nn.Sequential( - normalization(ch), nn.SiLU(), AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), ) elif pool == "spatial": self.out = nn.Sequential( - nn.Linear(self._feature_size, 2048), nn.ReLU(), nn.Linear(2048, self.out_channels), + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), ) elif pool == "spatial_v2": self.out = nn.Sequential( diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py index 53f9669a0b8f..69700a43614e 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py @@ -44,7 +44,7 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): if schedule == "linear": - betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 + betas = torch.linspace(linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64) ** 2 elif schedule == "cosine": timesteps = torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s @@ -169,7 +169,10 @@ def backward(ctx, *output_grads): shallow_copies = [x.view_as(x) for x in ctx.input_tensors] output_tensors = ctx.run_function(*shallow_copies) input_grads = torch.autograd.grad( - output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True, + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, ) del ctx.input_tensors del ctx.input_params @@ -319,7 +322,11 @@ def interpolate_fn(x, xp, yp): start_idx = torch.where( torch.eq(x_idx, 0), torch.tensor(1, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) start_x = torch.gather(sorted_all_x, dim=2, 
index=start_idx.unsqueeze(2)).squeeze(2) @@ -327,7 +334,11 @@ def interpolate_fn(x, xp, yp): start_idx2 = torch.where( torch.eq(x_idx, 0), torch.tensor(0, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index f6336f6bcc71..194168008dc4 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -146,7 +146,7 @@ def _plugins(self) -> list: use_dist_ckpt = not self.cfg.model.get('fsdp', False) and ( self.cfg.model.get('mcore_gpt', False) or self.cfg.model.get('mcore_bert', False) ) - async_save = self.cfg.exp_manager.get('checkpoint_callback_params', {}).get('async_save', False) + async_save = self.cfg.get('exp_manager', {}).get('checkpoint_callback_params', {}).get('async_save', False) if use_dist_ckpt: checkpoint_io = DistributedCheckpointIO.from_config(self.cfg.model, async_save) if async_save: @@ -171,7 +171,7 @@ def _callbacks(self, callbacks: Optional[list]) -> list: if 'enable_progress_bar' not in self.cfg.trainer or self.cfg.trainer.enable_progress_bar: callbacks.append(CustomProgressBar()) - if self.cfg.exp_manager.get('checkpoint_callback_params', {}).get('async_save', False): + if self.cfg.get('exp_manager', {}).get('checkpoint_callback_params', {}).get('async_save', False): callbacks.append(AsyncFinalizerCallback()) return callbacks From 962b846be205562b047c3c4842cbb3db3757677e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Wei=C3=9F?= Date: Wed, 29 May 2024 18:56:47 +0200 Subject: [PATCH 29/47] Update README.rst to clarify installation via Conda (#9323) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Simon Weiß --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index a2c595d62137..121c82b8590f 100644 --- a/README.rst +++ b/README.rst @@ -240,6 +240,8 @@ Install PyTorch using their `configurator Date: Wed, 29 May 2024 11:00:00 -0700 Subject: [PATCH 30/47] [Nemo CICD] update flaky test (#9339) * comment out flaky tests * optional test should not cancel workflow --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1e977a7e717d..b924cf975b18 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -152,8 +152,8 @@ jobs: - name: "L0: Unit Tests GPU" run: | NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" L0_Unit_Tests_CPU: From da720ae38ba2b47d10f365c6760357d504fd9039 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 29 May 2024 14:08:50 -0700 Subject: [PATCH 31/47] Fix peft weights loading (#9341) Signed-off-by: yaoyu-33 --- .../collections/nlp/parts/mixins/multimodal_adapter_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 1a5321065fa9..00552cb7f96e 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -133,10 +133,10 @@ def load_adapters( state_dict = torch.load(filepath, map_location)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") - if self.cfg.megatron_amp_O2: - state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} if not self.ptuning_only_and_non_first_stage: assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} missing_keys, unexpected_keys = NLPModel.load_state_dict(self, state_dict, strict=False) From 4aba557bd23c27b6eeca7cf0da91845a4532178c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 15:11:55 -0700 Subject: [PATCH 32/47] fix lora and ptuning and isort/black (#9290) (#9295) * fix lora and ptuning and isort/black * remove raise error when multiple config files * Apply isort and black reformatting * fix script issues --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia --- nemo/deploy/deploy_pytriton.py | 4 -- nemo/deploy/nlp/query_llm.py | 3 +- nemo/export/tensorrt_llm.py | 18 +++++- nemo/export/trt_llm/decoder/decoder.py | 6 +- nemo/export/trt_llm/decoder/falcon.py | 29 ++++++++-- nemo/export/trt_llm/decoder/gemma.py | 29 ++++++++-- nemo/export/trt_llm/decoder/gpt.py | 28 ++++++++-- nemo/export/trt_llm/decoder/gptj.py | 18 +++++- nemo/export/trt_llm/decoder/llama.py | 29 ++++++++-- nemo/export/trt_llm/model_config.py | 55 +++++++++++++----- nemo/export/trt_llm/nemo/nemo.py | 10 ++-- nemo/export/trt_llm/tensorrt_llm_model.py | 7 ++- nemo/export/trt_llm/tensorrt_llm_run.py | 68 +++++++++++++++++------ scripts/deploy/nlp/deploy_triton.py | 40 +++++-------- scripts/export/export_to_trt_llm.py | 10 ++-- 15 files changed, 252 insertions(+), 102 deletions(-) diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py index 22dea8ac47cd..25e09cf3eacc 100644 --- a/nemo/deploy/deploy_pytriton.py +++ b/nemo/deploy/deploy_pytriton.py @@ -24,7 +24,6 @@ class DeployPyTriton(DeployBase): - """ Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy. @@ -102,7 +101,6 @@ def __init__( ) def deploy(self): - """ Deploys any models to Triton Inference Server. """ @@ -148,7 +146,6 @@ def deploy(self): print(e) def serve(self): - """ Starts serving the model and waits for the requests """ @@ -163,7 +160,6 @@ def serve(self): print(e) def run(self): - """ Starts serving the model asynchronously. 
""" diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 6a4337024eeb..c8387914c2e9 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -71,7 +71,8 @@ class NemoQueryLLM(NemoQueryLLMBase): def __init__(self, url, model_name): super().__init__( - url=url, model_name=model_name, + url=url, + model_name=model_name, ) def query_llm( diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index cad7b821b3b4..b030165a3d45 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -82,15 +82,24 @@ class TensorRTLLM(ITritonDeployable): """ - def __init__(self, model_dir: str, lora_ckpt_list: List[str] = None, load_model: bool = True): + def __init__( + self, + model_dir: str, + lora_ckpt_list: List[str] = None, + load_model: bool = True, + use_python_runtime: bool = True, + ): """ Args: model_dir (str): path for storing the TensorRT-LLM model files. + lora_ckpt_list (List[str]): lora checkpoint paths. load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir. + use_python_runtime (bool): whether to use python or c++ runtime. """ self.model_dir = model_dir self.lora_ckpt_list = lora_ckpt_list + self.use_python_runtime = use_python_runtime self.model = None self.tokenizer = None self.n_gpus = None @@ -623,7 +632,7 @@ def _prep_ptuning_table(self): if len(vtokens_embeddings) > 0: self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size) - max_prompt_embedding_table_size = self.config['builder_config']['max_prompt_embedding_table_size'] + max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size'] actual_prompt_table_size = self.p_table.shape[0] if actual_prompt_table_size > max_prompt_embedding_table_size: @@ -754,7 +763,10 @@ def _load(self): self._load_config_file() self.tokenizer = get_tokenzier(Path(os.path.join(self.model_dir))) self.model = load( - tokenizer=self.tokenizer, engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list + tokenizer=self.tokenizer, + engine_dir=self.model_dir, + lora_ckpt_list=self.lora_ckpt_list, + use_python_runtime=self.use_python_runtime, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/decoder/decoder.py b/nemo/export/trt_llm/decoder/decoder.py index b3c0e2257e9f..2d1993fd74c0 100644 --- a/nemo/export/trt_llm/decoder/decoder.py +++ b/nemo/export/trt_llm/decoder/decoder.py @@ -90,7 +90,11 @@ def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: pass def __init__( - self, decoder_type: str, dtype: trt.DataType = trt.float16, rank: int = 0, tensor_parallel: int = 1, + self, + decoder_type: str, + dtype: trt.DataType = trt.float16, + rank: int = 0, + tensor_parallel: int = 1, ): """Initializes the DecoderLayerConfigBuilder.""" self.decoder_type = decoder_type diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py index 91edc7794607..e05979fa75a0 100644 --- a/nemo/export/trt_llm/decoder/falcon.py +++ b/nemo/export/trt_llm/decoder/falcon.py @@ -69,7 +69,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -78,13 +82,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> 
MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -130,4 +146,7 @@ def build_decoder(self, layer): config.set_if_not_exist('bias', False) config.set_if_not_exist('moe_num_experts', 0) - return FalconDecoderLayer(config=config, layer_idx=self.layer_id,) + return FalconDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py index 10301c7a47d7..37f843dcf0ca 100644 --- a/nemo/export/trt_llm/decoder/gemma.py +++ b/nemo/export/trt_llm/decoder/gemma.py @@ -64,7 +64,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -73,13 +77,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -128,4 +144,7 @@ def build_decoder(self, layer): config.set_if_not_exist('dense_context_fmha', False) config.set_if_not_exist('moe_num_experts', 0) - return GemmaDecoderLayer(config=config, layer_idx=self.layer_id,) + return GemmaDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py index 8af4e4ef01e4..a405aabbbd48 100644 --- a/nemo/export/trt_llm/decoder/gpt.py +++ b/nemo/export/trt_llm/decoder/gpt.py @@ -54,11 +54,18 @@ def build_input_layernorm(self, layer) -> LayernormConfig: def build_attention(self, layer) -> AttentionConfig: config = AttentionConfig() config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.c_attn], rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + [layer.attn.c_attn], + rank=self.rank, + tensor_parallel=self.tensor_parallel, + 
dtype=self.dtype, ) config.dense = LinearConfig.from_nn_module( - layer.attn.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -67,10 +74,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.c_fc, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_fc, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -126,4 +141,7 @@ def build_decoder(self, layer): config.set_if_not_exist('rotary_pct', rotary_pct) config.set_if_not_exist('moe_num_experts', 0) - return GPTDecoderLayer(config=config, layer_idx=self.layer_id,) + return GPTDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gptj.py b/nemo/export/trt_llm/decoder/gptj.py index aa65ca385a47..327a31fdd35c 100644 --- a/nemo/export/trt_llm/decoder/gptj.py +++ b/nemo/export/trt_llm/decoder/gptj.py @@ -60,7 +60,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.attn.out_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.out_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.rotary_dim = layer.attn.rotary_dim @@ -71,10 +75,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.fc_in, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_in, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.fc_out, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_out, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config diff --git a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py index 873c0306375b..b37d62e214de 100644 --- a/nemo/export/trt_llm/decoder/llama.py +++ b/nemo/export/trt_llm/decoder/llama.py @@ -66,7 +66,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -75,13 +79,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - 
layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -147,4 +163,7 @@ def build_decoder(self, layer): config.moe_tp_mode = layer.moe_tp_mode config.moe_normalization_mode = layer.moe_renorm_mode - return LLaMADecoderLayer(config=config, layer_idx=self.layer_id,) + return LLaMADecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/model_config.py b/nemo/export/trt_llm/model_config.py index dd360afd6b8a..0f120dc56153 100644 --- a/nemo/export/trt_llm/model_config.py +++ b/nemo/export/trt_llm/model_config.py @@ -122,7 +122,11 @@ def from_nn_module(module: nn.Module, linear_type: str, rank=0, tensor_parallel= if hasattr(module, "bias") and module.bias is not None: if linear_type == LINEAR_COLUMN: config.bias = np.ascontiguousarray( - split(torch_to_numpy_with_dtype(module.bias, dtype), tensor_parallel, rank,) + split( + torch_to_numpy_with_dtype(module.bias, dtype), + tensor_parallel, + rank, + ) ) else: config.bias = torch_to_numpy_with_dtype(module.bias, dtype) @@ -234,7 +238,9 @@ class AttentionConfig: @staticmethod def from_nemo( - weights_dict: Dict[str, np.ndarray], layer_id: int, rank: int = 0, + weights_dict: Dict[str, np.ndarray], + layer_id: int, + rank: int = 0, ): """Converts the nemo weights and config to `AttentionConfig`.""" attention = AttentionConfig() @@ -243,12 +249,16 @@ def from_nemo( weights_dict, f"layers.{layer_id}.attention.query_key_value.weight.{rank}" ) attention.qkv.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.attention.query_key_value.bias.{rank}", + weights_dict, + f"layers.{layer_id}.attention.query_key_value.bias.{rank}", ) attention.dense = LinearConfig(linear_type=LINEAR_ROW) attention.dense.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.weight.{rank}") - attention.dense.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.bias",) + attention.dense.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.attention.dense.bias", + ) return attention @@ -276,7 +286,10 @@ def from_nemo( # print("********** mlp.fc.weight : ", mlp.fc.weight ) - mlp.fc.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}",) + mlp.fc.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}", + ) gated = is_gated_activation(mlp.hidden_act) is_fast_glu = mlp.hidden_act in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] @@ -287,9 +300,13 @@ def from_nemo( if isinstance(llm_config, LlamaConfig) and not is_mcore and not is_fast_glu else f"layers.{layer_id}.mlp.dense_h_to_4h.gate.weight.{rank}" ) - mlp.gate.weight = get_tensor_from_dict(weights_dict, layer_name,) + mlp.gate.weight = get_tensor_from_dict( + weights_dict, + layer_name, + ) mlp.gate.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", ) mlp.proj = LinearConfig(linear_type=LINEAR_ROW) @@ -382,19 +399,23 @@ def from_nemo( LAYERNORM_RMS if isinstance(llm_config, 
LlamaConfig) else LAYERNORM_DEFAULT ) layer_config.input_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.weight", + weights_dict, + f"layers.{layer_id}.input_layernorm.weight", ) layer_config.input_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.bias", + weights_dict, + f"layers.{layer_id}.input_layernorm.bias", ) layer_config.mlp_layernorm = LayernormConfig() layer_config.mlp_layernorm.layernorm_type = LAYERNORM_DEFAULT # Falcon uses default layernorm layer_config.mlp_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.weight", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.weight", ) layer_config.mlp_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.bias", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.bias", ) layer_config.post_layernorm = LayernormConfig() @@ -403,10 +424,12 @@ def from_nemo( ) layer_config.post_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.post_attention_layernorm.weight", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.weight", ) layer_config.post_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.post_attention_layernorm.bias", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.bias", ) if layer_config.post_layernorm.weight is None: # Falcon doesn't have post layernorm @@ -415,7 +438,11 @@ def from_nemo( if layer_config.mlp_layernorm.weight is None: layer_config.mlp_layernorm = None - layer_config.attention = AttentionConfig.from_nemo(weights_dict, layer_id, rank,) + layer_config.attention = AttentionConfig.from_nemo( + weights_dict, + layer_id, + rank, + ) moe = False if llm_config.moe_num_experts is not None: diff --git a/nemo/export/trt_llm/nemo/nemo.py b/nemo/export/trt_llm/nemo/nemo.py index 9026cd9cfba9..c3564f1c4e8e 100644 --- a/nemo/export/trt_llm/nemo/nemo.py +++ b/nemo/export/trt_llm/nemo/nemo.py @@ -106,7 +106,9 @@ def extract_layers_with_prefix(model_, prefix): class UnpackedNemoCheckpointDir: def __init__( - self, checkpoints_dir: typing.Union[pathlib.Path, TarPath], load_checkpoints_to_cpu: bool = False, + self, + checkpoints_dir: typing.Union[pathlib.Path, TarPath], + load_checkpoints_to_cpu: bool = False, ): assert isinstance(checkpoints_dir, (pathlib.Path, TarPath)) self._checkpoints_dir = checkpoints_dir @@ -121,11 +123,7 @@ def model_config(self): model_configs_paths = list(self._checkpoints_dir.rglob(model_config_filename)) if model_configs_paths: if len(model_configs_paths) > 1: - raise RuntimeError( - f"There are more than single {model_config_filename} in" - f" {self._checkpoints_dir}:" - f" {', '.join(map(lambda p: p.as_posix(), model_configs_paths))}" - ) + LOGGER.debug(f"There are more than single {model_config_filename} in" f" {self._checkpoints_dir}") model_config_path = model_configs_paths[0] LOGGER.debug("Loading model config from %s", model_config_path) with model_config_path.open("r") as model_config_file: diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index 736d6180807e..f4b44552af63 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -144,7 +144,12 @@ def forward( if attention_mask is not None: attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - for layer_idx, (layer, past) in enumerate(zip(self.layers, kv_cache_params.past_key_value,)): + for 
layer_idx, (layer, past) in enumerate( + zip( + self.layers, + kv_cache_params.past_key_value, + ) + ): decoder_params = { "hidden_states": hidden_states, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 92fc36272f7c..fe0189b10628 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -16,17 +16,19 @@ import json import logging import os +import tempfile from dataclasses import dataclass from pathlib import Path from typing import List, Optional +import numpy as np import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import ModelConfig, ModelRunnerCpp, SamplingConfig +from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -55,7 +57,7 @@ class TensorrtLLMHostContext: class TensorrtLLMWorkerContext: """The MPI worker side context for TRT LLM inference.""" - decoder: ModelRunnerCpp = None + decoder: ModelRunner = None sampling_config: SamplingConfig = None lora_manager: LoraManager = None max_batch_size: int = 0 @@ -128,7 +130,13 @@ def _read_config(config_path: Path): return model_config, world_size, tensor_parallel_size, pipeline_parallel_size, dtype, max_input_len, max_batch_size -def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_beams=1): +def _load( + tokenizer: PreTrainedTokenizer, + engine_dir, + lora_ckpt_list=None, + num_beams=1, + use_python_runtime: bool = True, +): """The impl of `load` API for on a single GPU worker.""" try: tensorrt_llm.logger.set_level("info") @@ -147,17 +155,26 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b runtime_rank = tensorrt_llm.mpi_rank() - decoder = ModelRunnerCpp.from_dir( - engine_dir=engine_dir, - lora_dir=lora_ckpt_list, - lora_ckpt_source="nemo", - rank=runtime_rank, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_beam_width=max_beam_width, - debug_mode=False, - ) + if use_python_runtime: + decoder = ModelRunner.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + debug_mode=False, + ) + else: + decoder = ModelRunnerCpp.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_beam_width=max_beam_width, + debug_mode=False, + ) sampling_config = SamplingConfig( end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams @@ -218,6 +235,13 @@ def _forward( with torch.no_grad(): prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids) + if prompt_table is not None: + prompt_table = prompt_table.reshape(1, *prompt_table.shape) + tmp_dir = tempfile.TemporaryDirectory() + prompt_table_path = os.path.join(tmp_dir.name, 'prompt_table.npy') + np.save(prompt_table_path, prompt_table.cpu().float().numpy()) + prompt_table = prompt_table_path + outputs = decoder.generate( input_tensors, max_new_tokens=max_output_len, @@ -230,6 +254,7 @@ def _forward( stop_words_list=stop_words_list, bad_words_list=bad_words_list, lora_uids=lora_uids, + 
prompt_table_path=prompt_table, prompt_table=prompt_table, prompt_tasks=prompt_tasks, streaming=streaming, @@ -239,6 +264,9 @@ def _forward( torch.cuda.synchronize() + if prompt_table is not None: + tmp_dir.cleanup() + runtime_rank = tensorrt_llm.mpi_rank() if runtime_rank == 0 or multiprocessed_env: return outputs @@ -251,7 +279,11 @@ def _forward( def load( - tokenizer: PreTrainedTokenizer, engine_dir: str, lora_ckpt_list: List[str] = None, num_beams: int = 1 + tokenizer: PreTrainedTokenizer, + engine_dir: str, + lora_ckpt_list: List[str] = None, + num_beams: int = 1, + use_python_runtime: bool = True, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. @@ -263,17 +295,17 @@ def load( config = json.load(f) world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None elif tensorrt_llm.mpi_world_size() > 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] for _ in range(world_size): - future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams) + future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) futures.append(future) for future in futures: future.result() diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 7370731ec996..0a9604a73cdc 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -80,7 +80,7 @@ def get_args(argv): "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." 
) parser.add_argument( "-drip", @@ -133,6 +133,13 @@ def get_args(argv): parser.add_argument( "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights" ) + parser.add_argument( + "-ucr", + '--use_cpp_runtime', + default=False, + action='store_true', + help='Use TensorRT LLM C++ runtime', + ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) @@ -206,32 +213,13 @@ def nemo_deploy(argv): ) return - trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + lora_ckpt_list=args.lora_ckpt, + use_python_runtime=(not args.use_cpp_runtime), + ) if args.nemo_checkpoint is not None: - - trt_llm_exporter.export( - nemo_checkpoint_path=args.nemo_checkpoint, - model_type=args.model_type, - n_gpus=args.num_gpus, - tensor_parallel_size=args.num_gpus, - pipeline_parallel_size=1, - max_input_token=args.max_input_len, - max_output_token=args.max_output_len, - max_batch_size=args.max_batch_size, - max_num_tokens=args.max_num_tokens, - opt_num_tokens=args.opt_num_tokens, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, - remove_input_padding=(not args.disable_remove_input_padding), - dtype=args.dtype, - enable_multi_block_mode=args.multi_block_mode, - use_lora_plugin=args.use_lora_plugin, - lora_target_modules=args.lora_target_modules, - max_lora_rank=args.max_lora_rank, - save_nemo_model_config=True, - ) - try: LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") trt_llm_exporter.export( @@ -246,7 +234,7 @@ def nemo_deploy(argv): max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, + paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index e9741516cf00..ce9ef6a1e132 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -45,8 +45,8 @@ def get_args(argv): parser.add_argument( "-dt", "--dtype", - choices=["bf16", "fp16", "fp8", "int8"], - default="bf16", + choices=["bfloat16", "float16", "fp8", "int8"], + default="bfloat16", type=str, help="dtype of the model on TensorRT-LLM", ) @@ -59,7 +59,7 @@ def get_args(argv): "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) parser.add_argument( - "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." ) parser.add_argument( "-drip", @@ -123,7 +123,7 @@ def nemo_export_trt_llm(argv): LOGGER.info("Logging level set to {}".format(loglevel)) LOGGER.info(args) - if args.dtype != "bf16": + if args.dtype != "bfloat16": LOGGER.error( "Only bf16 is currently supported for the optimized deployment with TensorRT-LLM. " "Support for the other precisions will be added in the coming releases." 
@@ -146,7 +146,7 @@ def nemo_export_trt_llm(argv): max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, + paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, From deb613adc7b7ad0a540f5cc1f0bc5032ddb345ff Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 29 May 2024 20:03:32 -0700 Subject: [PATCH 33/47] Skip sequence_parallel allreduce when using Mcore DistOpt (#9344) Signed-off-by: Alexandros Koumparoulis --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3660a5145b10..b3e3c231de52 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -843,9 +843,11 @@ def training_step(self, dataloader_iter): # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): - self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) - self.allreduce_sequence_parallel_gradients() - self.megatron_timer_stop('allreduce_sequence_parallel_gradients') + # Mcore DistOpt handles this, so we don't have to + if not self.use_mcore_dist_optim: + self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) + self.allreduce_sequence_parallel_gradients() + self.megatron_timer_stop('allreduce_sequence_parallel_gradients') self.megatron_timer_start('gradient_allreduce', log_level=1) if self.use_fsdp: From 2e396060f6f95b6a848f4f260b5bbafa8ed52107 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 30 May 2024 07:45:56 +0200 Subject: [PATCH 34/47] Fix FSDP gradient reduction with orig params (#9335) The `param.grad is not None` check also fixes gradient reduction in the case of parameters not having acquired gradients (as parameters could become empty tensors in FSDP). Thanks to @ofivite for suggesting that `use_orig_params=True` could be the cause of the issue, which greatly helped with analysis. 
Signed-off-by: janEbert --- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b3e3c231de52..a5b4450c7b44 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1001,8 +1001,8 @@ def allreduce_fsdp_sharding_omitted_gradients(self): """All-reduce gradients of FSDP-sharding-omitted parameters in sharding domain (data-parallel domain).""" assert isinstance(self.model, torch.nn.Module) grads = [] - for param in self.model.parameters(): - if not isinstance(param, torch.distributed.fsdp.FlatParameter) and param.requires_grad: + for param in self.model._ignored_params: + if param.requires_grad and param.grad is not None: grad = param.grad grads.append(grad.data) if len(grads) > 0: From b6595cbae2226ff553b44ff2b66527738ea4bdf2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 07:25:12 -0700 Subject: [PATCH 35/47] Fix P-tuning for Llama based models (#9300) * Fix P-tuning for Llama based models (#9297) * Added the BOS token for Llama, Mistral and Mixtral. Signed-off-by: Alexey Panteleev * Don't load an existing TRT-LLM model before export to speed up the export process and avoid possible contamination from previous runs. Signed-off-by: Alexey Panteleev * Apply isort and black reformatting Signed-off-by: apanteleev --------- Signed-off-by: Alexey Panteleev Signed-off-by: apanteleev Co-authored-by: apanteleev Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> * Fix the export test --------- Signed-off-by: Alexey Panteleev Signed-off-by: apanteleev Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: Alexey Panteleev Co-authored-by: apanteleev Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> --- nemo/export/trt_llm/tensorrt_llm_run.py | 8 +++++++- scripts/deploy/nlp/deploy_triton.py | 1 + scripts/export/export_to_trt_llm.py | 2 +- tests/export/test_nemo_export.py | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index fe0189b10628..1bdfd5237caf 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -312,7 +312,13 @@ def load( max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False + architectures_that_need_bos_token = [ + "GemmaForCausalLM", + "LLaMAForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + ] + add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token return TensorrtLLMHostContext( executor=executor, diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 0a9604a73cdc..5a2440b0fa2f 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -216,6 +216,7 @@ def nemo_deploy(argv): trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt, + load_model=(args.nemo_checkpoint is None), use_python_runtime=(not args.use_cpp_runtime), ) diff --git 
a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index ce9ef6a1e132..a9c16bf8cff6 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv): return try: - trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository) + trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False) LOGGER.info("Export to TensorRT-LLM function is called.") trt_llm_exporter.export( diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index b3e186433561..97a06a1f6887 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -200,7 +200,7 @@ def run_trt_llm_inference( print("---- LoRA could not be enabled and skipping the test.") return None, None, None, None, None - trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list) + trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False) trt_llm_exporter.export( nemo_checkpoint_path=checkpoint_path, From aed9d071c700080b3eb024e8a5d7f091f20f0183 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Thu, 30 May 2024 10:08:24 -0700 Subject: [PATCH 36/47] Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) * Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. decoder_lengths is allowed to be on CPU even when decoder_output is on GPU. This matches the behavior of GreedyCTCInfer. Even though that behavior is unintentional, there is code depending on that behavior, including our jupyter notebooks. Signed-off-by: Daniel Galvez * Apply isort and black reformatting Signed-off-by: titu1994 --------- Signed-off-by: Daniel Galvez Signed-off-by: titu1994 Co-authored-by: Somshubra Majumdar Co-authored-by: titu1994 Co-authored-by: Nithin Rao --- .../parts/submodules/ctc_greedy_decoding.py | 12 +++- .../asr/decoding/test_ctc_decoding.py | 71 +++++++++++++++++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index a7f57c82279a..74204cf73d8e 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -394,7 +394,17 @@ def forward( if decoder_lengths is None: logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) - decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) + decoder_lengths = torch.tensor( + [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device + ).expand(decoder_output.shape[0]) + + # GreedyCTCInfer::forward(), by accident, works with + # decoder_lengths on either CPU or GPU when decoder_output is + # on GPU. For the sake of backwards compatibility, we also + # allow decoder_lengths to be on the CPU device. In this case, + # we simply copy the decoder_lengths from CPU to GPU. If both + # tensors are already on the same device, this is a no-op. 
+ decoder_lengths = decoder_lengths.to(decoder_output.device) if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index a42d61f051ad..580344fed395 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -200,8 +200,41 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) + @pytest.mark.parametrize( + "logprobs_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) def test_batched_decoding_logprobs( - self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none + self, + tmp_tokenizer, + alignments, + timestamps, + preserve_frame_confidence, + length_is_none, + logprobs_device, + length_device, ): cfg = CTCBPEDecodingConfig( strategy='greedy', @@ -217,7 +250,7 @@ def test_batched_decoding_logprobs( torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_signal = torch.randn(size=(B, T, V)) + input_signal = torch.randn(size=(B, T, V), device=logprobs_device) # Set the blank index to a very high probability to make sure # that we always handle at least a few blanks. 
input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 @@ -225,7 +258,7 @@ def test_batched_decoding_logprobs( if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -249,7 +282,33 @@ def test_batched_decoding_logprobs( @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): + @pytest.mark.parametrize( + "labels_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none, labels_device, length_device): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -258,7 +317,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_labels = torch.randint(V, size=(B, T)) + input_labels = torch.randint(V, size=(B, T), device=labels_device) # Set some indices to blank to make sure that we always handle # at least a few blanks. input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size @@ -266,7 +325,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From f397086f4a580e42633a89db99885eb07b511c3d Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 30 May 2024 10:25:07 -0700 Subject: [PATCH 37/47] Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347)" (#9351) This reverts commit aed9d071c700080b3eb024e8a5d7f091f20f0183. --- .../parts/submodules/ctc_greedy_decoding.py | 12 +--- .../asr/decoding/test_ctc_decoding.py | 71 ++----------------- 2 files changed, 7 insertions(+), 76 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 74204cf73d8e..a7f57c82279a 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -394,17 +394,7 @@ def forward( if decoder_lengths is None: logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) - decoder_lengths = torch.tensor( - [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device - ).expand(decoder_output.shape[0]) - - # GreedyCTCInfer::forward(), by accident, works with - # decoder_lengths on either CPU or GPU when decoder_output is - # on GPU. For the sake of backwards compatibility, we also - # allow decoder_lengths to be on the CPU device. 
In this case, - # we simply copy the decoder_lengths from CPU to GPU. If both - # tensors are already on the same device, this is a no-op. - decoder_lengths = decoder_lengths.to(decoder_output.device) + decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index 580344fed395..a42d61f051ad 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -200,41 +200,8 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - @pytest.mark.parametrize( - "logprobs_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - @pytest.mark.parametrize( - "length_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) def test_batched_decoding_logprobs( - self, - tmp_tokenizer, - alignments, - timestamps, - preserve_frame_confidence, - length_is_none, - logprobs_device, - length_device, + self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none ): cfg = CTCBPEDecodingConfig( strategy='greedy', @@ -250,7 +217,7 @@ def test_batched_decoding_logprobs( torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_signal = torch.randn(size=(B, T, V), device=logprobs_device) + input_signal = torch.randn(size=(B, T, V)) # Set the blank index to a very high probability to make sure # that we always handle at least a few blanks. 
input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 @@ -258,7 +225,7 @@ def test_batched_decoding_logprobs( if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B], device=length_device) + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -282,33 +249,7 @@ def test_batched_decoding_logprobs( @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - @pytest.mark.parametrize( - "labels_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - @pytest.mark.parametrize( - "length_device", - [ - torch.device("cpu"), - pytest.param( - torch.device("cuda"), - marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason='CUDA required for test.', - ), - ), - ], - ) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none, labels_device, length_device): + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batched' @@ -317,7 +258,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_labels = torch.randint(V, size=(B, T), device=labels_device) + input_labels = torch.randint(V, size=(B, T)) # Set some indices to blank to make sure that we always handle # at least a few blanks. 
input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size @@ -325,7 +266,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B], device=length_device) + length = torch.randint(low=1, high=T, size=[B]) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( From bf53aaa9e154f7068c49637e71b23a8d0bac513e Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 30 May 2024 15:26:51 -0400 Subject: [PATCH 38/47] TRT-LLM Export Code Cleanup (#9270) * Init code cleanup for the trt-llm export Signed-off-by: Onur Yilmaz * Removed model config Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * clearn futher Signed-off-by: Onur Yilmaz * Done more cleaning and addressing the reviews Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- nemo/export/tensorrt_llm.py | 71 +-- nemo/export/trt_llm/decoder/__init__.py | 82 --- nemo/export/trt_llm/decoder/decoder.py | 264 --------- nemo/export/trt_llm/decoder/falcon.py | 152 ----- nemo/export/trt_llm/decoder/gemma.py | 150 ----- nemo/export/trt_llm/decoder/gpt.py | 147 ----- nemo/export/trt_llm/decoder/gptj.py | 117 ---- nemo/export/trt_llm/decoder/llama.py | 169 ------ nemo/export/trt_llm/model_config.py | 555 ------------------ nemo/export/trt_llm/model_config_trt.py | 82 --- nemo/export/trt_llm/nemo/convert.py | 149 ----- nemo/export/trt_llm/nemo/nemo.py | 16 +- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 2 +- nemo/export/trt_llm/nemo_utils.py | 239 +++----- nemo/export/trt_llm/quantization_utils.py | 128 ---- nemo/export/trt_llm/tensor_utils.py | 59 -- nemo/export/trt_llm/tensorrt_llm_build.py | 320 ---------- nemo/export/trt_llm/tensorrt_llm_model.py | 406 ------------- nemo/export/trt_llm/tensorrt_llm_run.py | 109 ---- nemo/export/trt_llm/tensorrt_llm_utils.py | 85 --- nemo/export/trt_llm/utils.py | 78 --- 21 files changed, 86 insertions(+), 3294 deletions(-) delete mode 100644 nemo/export/trt_llm/decoder/__init__.py delete mode 100644 nemo/export/trt_llm/decoder/decoder.py delete mode 100644 nemo/export/trt_llm/decoder/falcon.py delete mode 100644 nemo/export/trt_llm/decoder/gemma.py delete mode 100644 nemo/export/trt_llm/decoder/gpt.py delete mode 100644 nemo/export/trt_llm/decoder/gptj.py delete mode 100644 nemo/export/trt_llm/decoder/llama.py delete mode 100644 nemo/export/trt_llm/model_config.py delete mode 100644 nemo/export/trt_llm/model_config_trt.py delete mode 100644 nemo/export/trt_llm/quantization_utils.py delete mode 100644 nemo/export/trt_llm/tensor_utils.py delete mode 100644 nemo/export/trt_llm/tensorrt_llm_model.py delete mode 100644 nemo/export/trt_llm/tensorrt_llm_utils.py delete mode 100644 nemo/export/trt_llm/utils.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index b030165a3d45..401ac2e930a6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -28,14 +28,11 @@ from nemo.deploy import ITritonDeployable from nemo.export.tarutils import TarPath, unpack_tarball -from nemo.export.trt_llm.model_config_trt import model_config_to_tensorrt_llm -from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer -from nemo.export.trt_llm.nemo_utils import get_tokenzier, 
nemo_llm_model_to_model_config, nemo_to_trtllm_config +from nemo.export.trt_llm.nemo_utils import get_tokenzier, is_nemo_file, nemo_to_trtllm_config from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine -from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_refit -from nemo.export.trt_llm.utils import is_nemo_file +from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load use_deploy = True try: @@ -278,70 +275,6 @@ def export( if load_model: self._load() - def build( - self, - nemo_model, - nemo_model_config, - tokenizer=None, - max_input_token: int = 256, - max_output_token: int = 256, - max_batch_size: int = 8, - use_refit: bool = False, - model_type: str = "gptnext", - ): - from megatron.core import parallel_state - - self.use_refit = use_refit - self.stream = torch.cuda.Stream() - self.model_type = model_type - self.tokenizer = build_tokenizer(tokenizer) - - # Each model shard has its own directory - if parallel_state.get_data_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"dp{parallel_state.get_data_parallel_rank()}") - if parallel_state.get_tensor_model_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"tp{parallel_state.get_tensor_model_parallel_rank()}") - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - self.model_dir = os.path.join(self.model_dir, f"pp{parallel_state.get_pipeline_model_parallel_rank()}") - - # Build or refit TRT-LLM engine from a nemo model. - model_configs = nemo_llm_model_to_model_config( - nemo_model=nemo_model, - decoder_type=model_type, - nemo_model_config=nemo_model_config, - ) - - model_config_to_tensorrt_llm( - model_configs, - self.model_dir, - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - max_beam_width=1, - max_prompt_embedding_table_size=0, - use_refit=self.use_refit, - ) - # Use load_refit to handle multiprocessed environment - self.model = load_refit( - tokenizer=self.tokenizer, engine_dir=self.model_dir, model_configs=model_configs, stream=self.stream - ) - - def refit( - self, - nemo_model, - nemo_model_config, - ): - assert self.use_refit, "TRT-LLM model must be built() with refit=True" - - # Build or refit TRT-LLM engine from a nemo model. - model_configs = nemo_llm_model_to_model_config( - nemo_model=nemo_model, decoder_type=self.model_type, nemo_model_config=nemo_model_config - ) - - self.model = load_refit( - tokenizer=self.tokenizer, engine_dir=self.model_dir, model_configs=model_configs, stream=self.stream - ) - def forward( self, input_texts: List[str], diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py deleted file mode 100644 index b5e22b5e513e..000000000000 --- a/nemo/export/trt_llm/decoder/__init__.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Type - -import tensorrt as trt - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.falcon import FALCONDecoderLayerBuilder, FALCONDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gemma import GemmaDecoderLayerBuilder, GemmaDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gpt import GPTDecoderLayerBuilder, GPTDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.gptj import GPTJDecoderLayerBuilder, GPTJDecoderLayerConfigBuilder -from nemo.export.trt_llm.decoder.llama import LLAMADecoderLayerBuilder, LLAMADecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - DECODER_FALCON, - DECODER_GEMMA, - DECODER_GPT2, - DECODER_GPTJ, - DECODER_GPTNEXT, - DECODER_LLAMA, - QUANTIZATION_NONE, -) - -DECODER_CONFIG_REGISTRY: Dict[str, Type[DecoderLayerConfigBuilder]] = { - DECODER_GPT2: GPTDecoderLayerConfigBuilder, - DECODER_GPTJ: GPTJDecoderLayerConfigBuilder, - DECODER_LLAMA: LLAMADecoderLayerConfigBuilder, - DECODER_FALCON: FALCONDecoderLayerConfigBuilder, - DECODER_GEMMA: GemmaDecoderLayerConfigBuilder, -} - -DECODER_MODEL_TYPE = { - DECODER_GPT2: 'GPTForCausalLM', - DECODER_GPTNEXT: 'GPTForCausalLM', - DECODER_LLAMA: 'LLaMAForCausalLM', - DECODER_GEMMA: 'GemmaForCausalLM', - DECODER_FALCON: 'FalconForCausalLM', -} - - -def build_decoder_layer_config(layer, decoder: str, dtype=trt.float16, rank=0, tensor_parallel=1): - """Builds the decoder layer config with the input torch module.""" - assert decoder in DECODER_CONFIG_REGISTRY, f"{decoder} not supported" - return DECODER_CONFIG_REGISTRY[decoder](decoder, dtype, rank, tensor_parallel).build_layer(layer) - - -DECODER_REGISTRY: Dict[str, Type[DecoderLayerBuilder]] = { - DECODER_GPT2: GPTDecoderLayerBuilder, - DECODER_GPTJ: GPTJDecoderLayerBuilder, - DECODER_LLAMA: LLAMADecoderLayerBuilder, - DECODER_GPTNEXT: GPTDecoderLayerBuilder, - DECODER_FALCON: FALCONDecoderLayerBuilder, - DECODER_GEMMA: GemmaDecoderLayerBuilder, -} - - -def build_decoder_layer( - layer, - layer_id: int, - num_layers: int, - dtype=trt.float16, - quantization=QUANTIZATION_NONE, - rank=0, - tensor_parallel=1, - tp_group=None, -): - """Builds the tensorrt llm decoder layer module with the layer config as the input.""" - assert layer.decoder_type in DECODER_REGISTRY, f"{layer.decoder_type} not supported" - builder = DECODER_REGISTRY[layer.decoder_type] - decoder_builder = builder(layer, layer_id, num_layers, dtype, quantization, rank, tensor_parallel, tp_group) - return decoder_builder.decoder diff --git a/nemo/export/trt_llm/decoder/decoder.py b/nemo/export/trt_llm/decoder/decoder.py deleted file mode 100644 index 2d1993fd74c0..000000000000 --- a/nemo/export/trt_llm/decoder/decoder.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from abc import ABC, abstractmethod -from typing import Optional - -import tensorrt as trt -from transformers.activations import ACT2FN - -from nemo.export.trt_llm.model_config import ( - QUANTIZATION_NONE, - AttentionConfig, - DecoderLayerConfig, - LayernormConfig, - MLPConfig, -) -from nemo.export.trt_llm.quantization_utils import quantize_linear -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group - - -def _get_hidden_act(act_func): - """Returns the name of the hidden activation functon based on ACT2FN.""" - if isinstance(act_func, str): - return act_func - - for name, func in ACT2FN.items(): - if isinstance(func, tuple): - if isinstance(act_func, func[0]): - return name - elif isinstance(act_func, func): - return name - assert False, f"Cannot find name for {act_func}" - - -class DecoderLayerConfigBuilder(ABC): - """A config builder that translate the LLM decoder layer to the DecoderLayerConfig.""" - - @abstractmethod - def hidden_act_fn(self, layer): - """Returns the hidden act fn in the MLP layer, e.g. SiLUActivation or NewGELUActivation.""" - pass - - @abstractmethod - def infer_num_attention_heads(self, layer): - """Returns the num of attention heads of the layer.""" - pass - - @abstractmethod - def infer_max_position_embeddings(self, layer): - """Returns the max positional embeddings of the layer.""" - pass - - @abstractmethod - def build_input_layernorm(self, layer) -> LayernormConfig: - """Returns the built input layernorm layer.""" - pass - - @abstractmethod - def build_mlp_layernorm( - self, layer - ) -> LayernormConfig: # Force all other models to implement. But seems this builder is not used. - """Returns the built mlp layernorm layer.""" - pass - - @abstractmethod - def build_attention(self, layer) -> AttentionConfig: - """Returns the built attention layer.""" - pass - - @abstractmethod - def build_mlp(self, layer) -> MLPConfig: - """Returns the built mlp layer.""" - pass - - @abstractmethod - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - """Returns the built post layernorm.""" - pass - - def __init__( - self, - decoder_type: str, - dtype: trt.DataType = trt.float16, - rank: int = 0, - tensor_parallel: int = 1, - ): - """Initializes the DecoderLayerConfigBuilder.""" - self.decoder_type = decoder_type - self.dtype = dtype - self.rank = rank - self.tensor_parallel = tensor_parallel - - def build_layer(self, layer) -> DecoderLayerConfig: - """Builds the decoder layer and returns the DecoderLayer.""" - decoder = DecoderLayerConfig() - - decoder.decoder_type = self.decoder_type - decoder.num_attention_heads = self.infer_num_attention_heads(layer) - decoder.num_kv_heads = self.infer_num_kv_heads(layer) - decoder.max_position_embeddings = self.infer_max_position_embeddings(layer) - - decoder.input_layernorm = self.build_input_layernorm(layer) - decoder.mlp_layernorm = self.build_mlp_layernorm(layer) - decoder.attention = self.build_attention(layer) - decoder.post_layernorm = self.build_post_layernorm(layer) - decoder.mlp = self.build_mlp(layer) - decoder.mlp.hidden_act = _get_hidden_act(self.hidden_act_fn(layer)).split("_")[0] - - return decoder - - def infer_num_kv_heads(self, layer): - """Returns the num of key value heads of the layer.""" - return self.infer_num_attention_heads(layer) - - -class DecoderLayerBuilder(ABC): - """An abstracted transformer decoder layer with tensorrt_llm implementation taking DecoderLayerConfig as the input. 
- - Individual decoder layers are supposed to extend this class and implement the customized - abstracted method. - """ - - @abstractmethod - def build_decoder(self, layer): - """Returns the built decoder layer.""" - pass - - def __init__( - self, - layer: DecoderLayerConfig, - layer_id: int, - num_layers: int, - dtype: trt.DataType = trt.float16, - quantization: str = QUANTIZATION_NONE, - rank: int = 0, - tensor_parallel: int = 1, - tp_group=None, - ): - """Initializes the DecoderLayer.""" - super().__init__() - assert isinstance(dtype, trt.DataType) - self.layer_id = layer_id - self.num_layers = num_layers - self.dtype = dtype - self.quantization = quantization - self.rank = rank - self.tensor_parallel = tensor_parallel - - if tp_group is None: - self.tp_group = get_tensor_parallel_group(tensor_parallel) - else: - self.tp_group = tp_group - - self.hidden_size = layer.hidden_size - self.num_attention_heads = layer.num_attention_heads - self.num_kv_heads = layer.num_kv_heads if layer.num_kv_heads > 0 else layer.num_attention_heads - - assert ( - self.num_attention_heads % self.num_kv_heads - ) == 0, "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." - assert (self.num_kv_heads % self.tensor_parallel) == 0 or (self.tensor_parallel % self.num_kv_heads) == 0, ( - "MQA/GQA requires either the number of K/V heads to be divisible by the number of GPUs" - " OR the number of GPUs to be divisible by the number of K/V heads." - ) - - self.max_position_embeddings = layer.max_position_embeddings - self.hidden_act = layer.mlp.hidden_act - - self.decoder = self.build_decoder(layer) - self.assign_weights(layer) - - is_moe = ( - hasattr(self.decoder, "config") - and self.decoder.config.moe_num_experts is not None - and self.decoder.config.moe_num_experts > 1 - ) - if not is_moe: - self.quantize(layer) - - def assign_weights(self, layer: DecoderLayerConfig): - """Assign the weights to the attention tensorrt_llm layer.""" - is_moe = ( - hasattr(self.decoder, "config") - and self.decoder.config.moe_num_experts is not None - and self.decoder.config.moe_num_experts > 1 - ) - - self.decoder.input_layernorm.weight.value = layer.input_layernorm.weight - if layer.input_layernorm.bias is not None: - self.decoder.input_layernorm.bias.value = layer.input_layernorm.bias - - if layer.mlp_layernorm is not None: # Falcon has mlp layer norm - if is_moe: - assert layer.post_layernorm is None - self.decoder.post_layernorm.weight.value = layer.mlp_layernorm.weight - if layer.mlp_layernorm.bias is not None: - self.decoder.post_layernorm.bias.value = layer.mlp_layernorm.bias - else: - self.decoder.mlp_layernorm.weight.value = layer.mlp_layernorm.weight - if layer.mlp_layernorm.bias is not None: - self.decoder.mlp_layernorm.bias.value = layer.mlp_layernorm.bias - - self.decoder.attention.qkv.weight.value = layer.attention.qkv.weight - if layer.attention.qkv.bias is not None: - self.decoder.attention.qkv.bias.value = layer.attention.qkv.bias - - self.decoder.attention.dense.weight.value = layer.attention.dense.weight - if self.decoder.attention.dense.bias is not None: - self.decoder.attention.dense.bias.value = layer.attention.dense.bias - - if layer.post_layernorm is not None: - self.decoder.post_layernorm.weight.value = layer.post_layernorm.weight - if layer.post_layernorm.bias is not None: - self.decoder.post_layernorm.bias.value = layer.post_layernorm.bias - - if is_moe: - self.decoder.mlp.router.weight.value = layer.mlp.router.weight - self.decoder.mlp.experts_weight_1.value = 
layer.mlp.fc1.weight - self.decoder.mlp.experts_weight_2.value = layer.mlp.fc2.weight - - if layer.mlp.fc1.bias is not None: - self.decoder.mlp.experts_bias_1.value = layer.mlp.fc1.bias - - if layer.mlp.fc2.bias is not None: - self.decoder.mlp.experts_bias_2.value = layer.mlp.fc2.bias - - else: - self.decoder.mlp.fc.weight.value = layer.mlp.fc.weight - self.decoder.mlp.proj.weight.value = layer.mlp.proj.weight - bias = layer.mlp.fc.bias is not None - if bias: - self.decoder.mlp.fc.bias.value = layer.mlp.fc.bias - self.decoder.mlp.proj.bias.value = layer.mlp.proj.bias - - if layer.mlp.gate: - self.decoder.mlp.gate.weight.value = layer.mlp.gate.weight - if bias: - self.decoder.mlp.gate.bias.value = layer.mlp.gate.bias - - def quantize(self, layer: DecoderLayerConfig): - """Quantizes the decoder layer based on the layer config.""" - self.decoder.attention.qkv = quantize_linear( - self.decoder.attention.qkv, self.quantization, layer.attention.qkv - ) - self.decoder.attention.dense = quantize_linear( - self.decoder.attention.dense, self.quantization, layer.attention.dense - ) - self.decoder.mlp.fc = quantize_linear(self.decoder.mlp.fc, self.quantization, layer.mlp.fc) - self.decoder.mlp.proj = quantize_linear(self.decoder.mlp.proj, self.quantization, layer.mlp.proj) - - if hasattr(self.decoder.mlp, "gate"): - self.decoder.mlp.gate = quantize_linear(self.decoder.mlp.gate, self.quantization, layer.mlp.gate) diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py deleted file mode 100644 index e05979fa75a0..000000000000 --- a/nemo/export/trt_llm/decoder/falcon.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.models.falcon.model import FalconDecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class FALCONDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The FALCON implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_mlp_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.mlp_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class FALCONDecoderLayerBuilder(DecoderLayerBuilder): - """The FALCON implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - # Falcon 7B: parallel_attention=True, new_decoder_architecture=False - # Falcon 40B/180B: parallel_attention=True, new_decoder_architecture=True - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - 
quantization=QuantConfig(), - max_lora_rank=layer.max_lora_rank, - use_parallel_embedding=False, - ) - - # No other way to pass in model variant config, determine model variant by num_layers (7B: 32 layers) - config.set_if_not_exist('new_decoder_architecture', False if self.num_layers == 32 else True) - config.set_if_not_exist('parallel_attention', True) - config.set_if_not_exist('layernorm_epsilon', 1e-5) - config.set_if_not_exist('bias', False) - config.set_if_not_exist('moe_num_experts', 0) - - return FalconDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py deleted file mode 100644 index 37f843dcf0ca..000000000000 --- a/nemo/export/trt_llm/decoder/gemma.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.models.gemma.model import GemmaDecoderLayer, QuantConfig -from tensorrt_llm.models.modeling_utils import PretrainedConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GemmaDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The LLAMA implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( 
- layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class GemmaDecoderLayerBuilder(DecoderLayerBuilder): - """The Gemma implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - head_size=layer.kv_channels, - hidden_act=self.hidden_act.split("-")[-1] if layer.moe_num_experts else non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - quantization=QuantConfig(), - max_lora_rank=layer.max_lora_rank, - ) - - config.set_if_not_exist('mlp_bias', False) - config.set_if_not_exist('attn_bias', False) - config.set_if_not_exist('rotary_base', layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('enable_pos_shift', False) - config.set_if_not_exist('dense_context_fmha', False) - config.set_if_not_exist('moe_num_experts', 0) - - return GemmaDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py deleted file mode 100644 index a405aabbbd48..000000000000 --- a/nemo/export/trt_llm/decoder/gpt.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Optional - -from tensorrt_llm.layers import AttentionMaskType, PositionEmbeddingType -from tensorrt_llm.models.gpt.model import GPTDecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GPTDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The GPT2 implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act - - @override - def infer_num_attention_heads(self, layer): - return layer.attn.num_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.attn.bias.shape[2] - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.ln_1, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.c_attn], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.attn.c_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.c_fc, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.c_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return LayernormConfig.from_nn_module(layer.ln_2, dtype=self.dtype) - - -class GPTDecoderLayerBuilder(DecoderLayerBuilder): - """The GPT implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_pct = layer.rotary_pct - - position_embedding_type = "rope_gpt_neox" if layer.position_embedding_type == "rope" else "learned_absolute" - - assert not (position_embedding_type == "rope_gpt_neox" and rotary_pct == 0.0) - - bias_qkv = layer.attention.qkv.bias is not None - - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=self.hidden_act, - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type=position_embedding_type, - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - max_lora_rank=layer.max_lora_rank, - quantization=QuantConfig(), - ) - - config.set_if_not_exist('hidden_act', self.hidden_act) - config.set_if_not_exist('apply_query_key_layer_scaling', False) - config.set_if_not_exist('bias', bias_qkv) - config.set_if_not_exist('rotary_base', 
layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('rotary_pct', rotary_pct) - config.set_if_not_exist('moe_num_experts', 0) - - return GPTDecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/decoder/gptj.py b/nemo/export/trt_llm/decoder/gptj.py deleted file mode 100644 index 327a31fdd35c..000000000000 --- a/nemo/export/trt_llm/decoder/gptj.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from tensorrt_llm.models.gptj.model import GPTJDecoderLayer -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class GPTJDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The GPTJ implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - """Returns the hidden act fn in the MLP layer, e.g. SiLUActivation or NewGELUActivation.""" - return layer.mlp.act - - @override - def infer_num_attention_heads(self, layer): - return layer.attn.num_attention_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.attn.bias.shape[2] - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.ln_1, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.q_proj, layer.attn.k_proj, layer.attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.attn.out_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.rotary_dim = layer.attn.rotary_dim - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.fc_in, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.fc_out, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - # GPTJ do not have post layer_norm - return None - - -class GPTJDecoderLayerBuilder(DecoderLayerBuilder): - """The GPTJ implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - assert self.tensor_parallel == 1 and self.rank == 0, "Only single GPU is supported for GPTJ" - - return GPTJDecoderLayer( - hidden_size=self.hidden_size, - 
num_attention_heads=self.num_attention_heads, - max_position_embeddings=self.max_position_embeddings, - rotary_dim=layer.attention.rotary_dim, - dtype=self.dtype, - hidden_act=self.hidden_act, - tp_group=self.tp_group, - tp_size=self.tensor_parallel, - max_lora_rank=layer.max_lora_rank, - ) diff --git a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py deleted file mode 100644 index b37d62e214de..000000000000 --- a/nemo/export/trt_llm/decoder/llama.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.models.llama.model import LLaMADecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig -from typing_extensions import override - -from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder -from nemo.export.trt_llm.model_config import ( - LINEAR_COLUMN, - LINEAR_ROW, - AttentionConfig, - LayernormConfig, - LinearConfig, - MLPConfig, -) - - -class LLAMADecoderLayerConfigBuilder(DecoderLayerConfigBuilder): - """The LLAMA implementation of the DecoderLayerConfigBuilder.""" - - @override - def hidden_act_fn(self, layer): - return layer.mlp.act_fn - - @override - def infer_num_attention_heads(self, layer): - return layer.self_attn.num_heads - - @override - def infer_num_kv_heads(self, layer): - return layer.self_attn.num_key_value_heads - - @override - def infer_max_position_embeddings(self, layer): - return layer.self_attn.max_position_embeddings - - @override - def build_input_layernorm(self, layer) -> LayernormConfig: - return LayernormConfig.from_nn_module(layer.input_layernorm, dtype=self.dtype) - - @override - def build_attention(self, layer) -> AttentionConfig: - config = AttentionConfig() - config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj], - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_mlp(self, layer) -> MLPConfig: - config = MLPConfig() - config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, - LINEAR_ROW, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, - LINEAR_COLUMN, - rank=self.rank, - tensor_parallel=self.tensor_parallel, - dtype=self.dtype, - ) - - return config - - @override - def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: - return 
LayernormConfig.from_nn_module(layer.post_attention_layernorm, dtype=self.dtype) - - -class LLAMADecoderLayerBuilder(DecoderLayerBuilder): - """The LLAMA implementation of the DecoderLayer.""" - - @override - def build_decoder(self, layer): - rotary_scaling = None - if layer.rotary_scaling is not None: - rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - - config = PretrainedConfig( - architecture=None, - dtype=self.dtype, - logits_dtype=self.dtype, - vocab_size=layer.vocab_size, - max_position_embeddings=self.max_position_embeddings, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_kv_heads, - hidden_act=self.hidden_act.split("-")[-1] if layer.moe_num_experts else non_gated_version(self.hidden_act), - intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, - norm_epsilon=layer.norm_epsilon, - position_embedding_type="rope_gpt_neox", - world_size=self.tensor_parallel, - tp_size=self.tensor_parallel, - pp_size=1, - max_lora_rank=layer.max_lora_rank, - quantization=QuantConfig(), - ) - - config.set_if_not_exist('mlp_bias', False) - config.set_if_not_exist('attn_bias', False) - config.set_if_not_exist('rotary_base', layer.rotary_base) - config.set_if_not_exist('rotary_scaling', rotary_scaling) - config.set_if_not_exist('enable_pos_shift', False) - config.set_if_not_exist('dense_context_fmha', False) - config.set_if_not_exist('moe_num_experts', 0) - - if layer.moe_num_experts: - if layer.moe_num_experts is not None: - if layer.moe_top_k is None: - layer.moe_top_k = 1 - - layer.moe_tp_mode = MoeConfig.ParallelismMode.TENSOR_PARALLEL if layer.moe_tp_mode is None else None - layer.moe_renorm_mode = ( - MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE if layer.moe_renorm_mode is None else None - ) - moe_config = MoeConfig( - layer.moe_num_experts, layer.moe_top_k, layer.moe_tp_mode, layer.moe_renorm_mode - ) - moe_config.validate() - config.moe_num_experts = layer.moe_num_experts - config.moe_top_k = layer.moe_top_k - config.moe_tp_mode = layer.moe_tp_mode - config.moe_normalization_mode = layer.moe_renorm_mode - - return LLaMADecoderLayer( - config=config, - layer_idx=self.layer_id, - ) diff --git a/nemo/export/trt_llm/model_config.py b/nemo/export/trt_llm/model_config.py deleted file mode 100644 index 0f120dc56153..000000000000 --- a/nemo/export/trt_llm/model_config.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dataclasses -from dataclasses import dataclass, field -from typing import Dict, List, get_args, get_origin - -import numpy as np -import tensorrt as trt -import torch.nn as nn -from tensorrt_llm._utils import pad_vocab_size -from tensorrt_llm.functional import is_gated_activation -from transformers import LlamaConfig, PretrainedConfig -from transformers.models.llama.modeling_llama import LlamaRMSNorm - -from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, split, torch_to_numpy_with_dtype - - -DECODER_GPT2 = "gpt2" -DECODER_GPTJ = "gptj" -DECODER_LLAMA = "llama" -DECODER_GPTNEXT = "gptnext" -DECODER_FALCON = "falcon" -DECODER_GEMMA = "gemma" - -QUANTIZATION_NONE = "" -QUANTIZATION_FP8 = "fp8" -QUANTIZATION_INT8_SQ = "int8_sq" - -LINEAR_COLUMN = "column" -LINEAR_ROW = "row" - -LAYERNORM_DEFAULT = "" -LAYERNORM_RMS = "rms" - -LAYER_DEFAULT = "" -LAYER_QKV = "qkv" - - -@dataclass -class EmbeddingConfig: - """The embedding layer config.""" - - weight: np.array = None - # Whether the embedding weights are local - is_local: bool = False - - @staticmethod - def from_nn_module(module: nn.Module, dtype=trt.float16): - """Converts an nn.Module to an EmbeddingConfig.""" - return EmbeddingConfig(weight=torch_to_numpy_with_dtype(module.weight, dtype)) - - @property - def local_vocab_size(self): - """Infers the vocab_size from the embedding layer weights shape.""" - return self.weight.shape[0] - - @property - def hidden_size(self): - """Infers the hidden_size from the embedding layer weights shape.""" - return self.weight.shape[1] - - -@dataclass -class LayernormConfig: - """The layernorm layer config.""" - - weight: np.array = None - bias: np.array = None - layernorm_type: str = LAYERNORM_DEFAULT - - @staticmethod - def from_nn_module(module: nn.Module, dtype=trt.float16): - """Converts an nn.Module to an LayernormConfig.""" - layernorm_type = LAYERNORM_RMS if type(module) is LlamaRMSNorm else LAYERNORM_DEFAULT - - config = LayernormConfig(weight=torch_to_numpy_with_dtype(module.weight, dtype), layernorm_type=layernorm_type) - if layernorm_type == LAYERNORM_DEFAULT: - config.bias = torch_to_numpy_with_dtype(module.bias, dtype) - - return config - - -@dataclass -class LinearConfig: - """The linear layer config.""" - - linear_type: str = "" - weight: np.array = None - bias: np.array = None - activation_scaling_factor: np.array = None - weights_scaling_factor: np.array = None - prequant_scaling_factor: np.array = None - layer_type: str = LAYER_DEFAULT - - @staticmethod - def from_nn_module(module: nn.Module, linear_type: str, rank=0, tensor_parallel=1, dtype=trt.float16): - """Converts an nn.Module to an LinearConfig.""" - weight = torch_to_numpy_with_dtype(module.weight, dtype) - if "Conv1D" in type(module).__name__: - weight = weight.transpose() - else: - assert type(module) is nn.Linear - - config = LinearConfig() - config.linear_type = linear_type - config.weight = np.ascontiguousarray( - split(weight, tensor_parallel, rank, dim=0 if linear_type == LINEAR_COLUMN else 1) - ) - - if hasattr(module, "bias") and module.bias is not None: - if linear_type == LINEAR_COLUMN: - config.bias = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(module.bias, dtype), - tensor_parallel, - rank, - ) - ) - else: - config.bias = torch_to_numpy_with_dtype(module.bias, dtype) - - return config - - @staticmethod - def from_qkv_nn_modules(qkv_modules: List[nn.Module], rank=0, tensor_parallel=1, dtype=trt.float16): - """Converts the qkv modules to an LinearConfig.""" - config = LinearConfig() 
- config.linear_type = LINEAR_COLUMN - config.layer_type = LAYER_QKV - if len(qkv_modules) == 1: - # QKV layers combined as a single module, e.g. GPT2 - qkv_module = qkv_modules[0] - assert "Conv1D" in type(qkv_module).__name__ - - qkv_shape = qkv_module.weight.shape - # Decode the concat QKV weights and split them to different GPU rank. - config.weight = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(qkv_module.weight, dtype=dtype).reshape( - qkv_shape[0], 3, qkv_shape[-1] // 3 - ), - tensor_parallel, - rank, - dim=-1, - ) - .reshape(qkv_shape[0], -1) - .transpose() - ) - config.bias = np.ascontiguousarray( - split( - torch_to_numpy_with_dtype(qkv_module.bias, dtype=dtype).reshape(3, qkv_shape[-1] // 3), - tensor_parallel, - rank, - dim=-1, - ).reshape(-1) - ) - - elif len(qkv_modules) == 3: - # Separate QKV layers - for m in qkv_modules: - assert type(m) is nn.Linear - assert not (hasattr(m, "bias") and m.bias is not None) - - q_weight = split(torch_to_numpy_with_dtype(qkv_modules[0].weight), tensor_parallel, rank) - k_weight = split(torch_to_numpy_with_dtype(qkv_modules[1].weight), tensor_parallel, rank) - v_weight = split(torch_to_numpy_with_dtype(qkv_modules[2].weight), tensor_parallel, rank) - split_v = np.concatenate((q_weight, k_weight, v_weight)) - config.weight = np.ascontiguousarray(split_v) - - else: - assert False, f"QKV modules format {qkv_modules} not supported" - - return config - - -@dataclass -class MoEMLPConfig: - """The MLP layer config.""" - - fc1: LinearConfig = None - fc2: LinearConfig = None - router: LinearConfig = None - hidden_act: str = "" - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `MLPConfig`.""" - mlp = MoEMLPConfig(hidden_act=llm_config.activation_function) - mlp.fc1 = LinearConfig(linear_type=LINEAR_COLUMN) - - mlp.fc1.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc1.weight.{rank}" - ) - - mlp.fc1.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc1.bias.{rank}" - ) - - mlp.fc2 = LinearConfig(linear_type=LINEAR_ROW) - mlp.fc2.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc2.weight.{rank}" - ) - mlp.fc2.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.experts.experts.linear_fc2.bias.{rank}" - ) - - mlp.router = LinearConfig(linear_type=LINEAR_ROW) - mlp.router.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.router.weight.{rank}") - return mlp - - -@dataclass -class AttentionConfig: - """The attention layer config.""" - - qkv: LinearConfig = None - dense: LinearConfig = None - - rotary_dim: int = -np.inf - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - layer_id: int, - rank: int = 0, - ): - """Converts the nemo weights and config to `AttentionConfig`.""" - attention = AttentionConfig() - attention.qkv = LinearConfig(linear_type=LINEAR_COLUMN, layer_type=LAYER_QKV) - attention.qkv.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.attention.query_key_value.weight.{rank}" - ) - attention.qkv.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.attention.query_key_value.bias.{rank}", - ) - - attention.dense = LinearConfig(linear_type=LINEAR_ROW) - attention.dense.weight = get_tensor_from_dict(weights_dict, 
f"layers.{layer_id}.attention.dense.weight.{rank}") - attention.dense.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.attention.dense.bias", - ) - return attention - - -@dataclass -class MLPConfig: - """The MLP layer config.""" - - fc: LinearConfig = None - gate: LinearConfig = None - proj: LinearConfig = None - hidden_act: str = "" - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `MLPConfig`.""" - mlp = MLPConfig(hidden_act=llm_config.activation_function) - mlp.fc = LinearConfig(linear_type=LINEAR_COLUMN) - mlp.fc.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.weight.{rank}") - - # print("********** mlp.fc.weight : ", mlp.fc.weight ) - - mlp.fc.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}", - ) - - gated = is_gated_activation(mlp.hidden_act) - is_fast_glu = mlp.hidden_act in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] - if gated: - mlp.gate = LinearConfig(linear_type=LINEAR_COLUMN) - layer_name = ( - f"layers.{layer_id}.mlp.dense_h_to_4h_2.weight.{rank}" - if isinstance(llm_config, LlamaConfig) and not is_mcore and not is_fast_glu - else f"layers.{layer_id}.mlp.dense_h_to_4h.gate.weight.{rank}" - ) - mlp.gate.weight = get_tensor_from_dict( - weights_dict, - layer_name, - ) - mlp.gate.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", - ) - - mlp.proj = LinearConfig(linear_type=LINEAR_ROW) - mlp.proj.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_4h_to_h.weight.{rank}") - mlp.proj.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_4h_to_h.bias") - return mlp - - -@dataclass -class DecoderLayerConfig: - """The decoder layer config.""" - - decoder_type: str = "" - input_layernorm: LayernormConfig = None - mlp_layernorm: LayernormConfig = None # Falcon 40B/180B has mlp_layernorm - attention: AttentionConfig = None - post_layernorm: LayernormConfig = None - mlp: MLPConfig = None - - num_attention_heads: int = 0 - - num_kv_heads: int = 0 - kv_channels: int = None - max_position_embeddings: int = 0 - rotary_pct: float = 0 - rotary_base: int = 10000 - rotary_scaling: float = None - position_embedding_type: str = None - - moe_num_experts: int = None - moe_top_k: int = None - moe_tp_mode: int = None - moe_renorm_mode: int = None - - vocab_size: int = 0 - norm_epsilon: float = 0.0 - max_lora_rank: int = 64 - - @property - def is_moe(self): - return self.moe_num_experts is not None and self.moe_num_experts > 1 - - @property - def hidden_size(self): - """Returns the hidden size of the transformer model.""" - if self.is_moe: - return self.mlp.fc2.weight.shape[1] - else: - return self.mlp.fc.weight.shape[1] - - @property - def ffn_hidden_size_local(self): - """Returns the ffn hidden size of the transformer model.""" - if self.is_moe: - return self.mlp.fc2.weight.shape[-1] - else: - return self.mlp.fc.weight.shape[0] - - @staticmethod - def from_nemo( - weights_dict: Dict[str, np.ndarray], - llm_config: PretrainedConfig, - decoder_type: str, - layer_id: int, - rank: int = 0, - is_mcore: bool = False, - ): - """Converts the nemo weights and config to `DecoderLayerConfig`.""" - layer_config = DecoderLayerConfig( - decoder_type=decoder_type, - num_attention_heads=llm_config.n_head, - max_position_embeddings=llm_config.n_positions, - 
rotary_pct=llm_config.rotary_pct if hasattr(llm_config, "rotary_pct") else 1.0, - rotary_base=(llm_config.rotary_base if hasattr(llm_config, "rotary_base") else 10000), - rotary_scaling=(llm_config.rotary_scaling if hasattr(llm_config, "rotary_scaling") else None), - position_embedding_type=( - llm_config.position_embedding_type if hasattr(llm_config, "position_embedding_type") else None - ), - num_kv_heads=(llm_config.num_kv_heads if hasattr(llm_config, "num_kv_heads") else 0), - kv_channels=(llm_config.kv_channels if hasattr(llm_config, "kv_channels") else None), - moe_num_experts=(llm_config.moe_num_experts if hasattr(llm_config, "moe_num_experts") else None), - moe_top_k=(llm_config.moe_top_k if hasattr(llm_config, "moe_top_k") else None), - moe_tp_mode=(llm_config.moe_tp_mode if hasattr(llm_config, "moe_tp_mode") else None), - moe_renorm_mode=(llm_config.moe_renorm_mode if hasattr(llm_config, "moe_renorm_mode") else None), - vocab_size=llm_config.vocab_size, - norm_epsilon=llm_config.norm_epsilon, - ) - layer_config.input_layernorm = LayernormConfig() - layer_config.input_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - layer_config.input_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.input_layernorm.weight", - ) - layer_config.input_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.input_layernorm.bias", - ) - - layer_config.mlp_layernorm = LayernormConfig() - layer_config.mlp_layernorm.layernorm_type = LAYERNORM_DEFAULT # Falcon uses default layernorm - layer_config.mlp_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.pre_mlp_layernorm.weight", - ) - layer_config.mlp_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.pre_mlp_layernorm.bias", - ) - - layer_config.post_layernorm = LayernormConfig() - layer_config.post_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - - layer_config.post_layernorm.weight = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.post_attention_layernorm.weight", - ) - layer_config.post_layernorm.bias = get_tensor_from_dict( - weights_dict, - f"layers.{layer_id}.post_attention_layernorm.bias", - ) - - if layer_config.post_layernorm.weight is None: # Falcon doesn't have post layernorm - layer_config.post_layernorm = None - - if layer_config.mlp_layernorm.weight is None: - layer_config.mlp_layernorm = None - - layer_config.attention = AttentionConfig.from_nemo( - weights_dict, - layer_id, - rank, - ) - - moe = False - if llm_config.moe_num_experts is not None: - if llm_config.moe_num_experts > 1: - moe = True - - if moe: - layer_config.mlp = MoEMLPConfig.from_nemo(weights_dict, llm_config, layer_id, rank, is_mcore) - else: - layer_config.mlp = MLPConfig.from_nemo(weights_dict, llm_config, layer_id, rank, is_mcore) - - return layer_config - - -def _from_dict(class_type, data): - """Helper function to load the data as a class_type. 
class_type must be a dataclass.""" - if data is None: - return None - - if dataclasses.is_dataclass(class_type): - fieldtypes = {f.name: f.type for f in dataclasses.fields(class_type)} - return class_type(**{f: _from_dict(fieldtypes[f], data[f]) for f in data}) - elif get_origin(class_type) == list and dataclasses.is_dataclass(get_args(class_type)[0]): - list_value = [] - for child in data: - child_class_type = get_args(class_type)[0] - list_value.append(_from_dict(child_class_type, child)) - return list_value - else: - return data - - -@dataclass -class ModelConfig: - """The full LLM model config that includes the full information needed for tensorrt_llm engine building. - - This class includes all the fields that tensorrt_llm supports, but not all of the fields are required. - """ - - # Global metadata - quantization: str = QUANTIZATION_NONE - dtype: str = "float16" - - # Model structure and weights - vocab_embedding: EmbeddingConfig = None - positional_embedding: EmbeddingConfig = None - layers: List[DecoderLayerConfig] = field(default_factory=list) - final_layernorm: LayernormConfig = None - lm_head: LinearConfig = None - - # Ptuning metadata - use_prompt_tuning: bool = False - use_parallel_embedding: bool = False - max_lora_rank: int = 64 - - # Parallel metadata - mapping = None - - def to_dict(self) -> dict: - """Converts the instance to a python dict.""" - return dataclasses.asdict(self) - - @staticmethod - def from_dict(d: dict): - """Load a dict to a `ModelConfig` instance.""" - return _from_dict(ModelConfig, d) - - @property - def vocab_size(self): - """Returns the vocab_size of the model.""" - return ( - self.vocab_embedding.local_vocab_size * self.mapping.tp_size - if self.vocab_embedding.is_local - else self.vocab_embedding.local_vocab_size - ) - - @property - def vocab_size_padded(self): - """Returns the padded vocab_size of the model rounds to the tensor_parallel.""" - return pad_vocab_size(self.vocab_size, self.mapping.tp_size) - - @property - def hidden_size(self): - """Returns the hidden_size of the model.""" - return self.vocab_embedding.hidden_size - - @property - def max_position_embeddings(self): - """Returns the max_position_embedding of the model.""" - return self.layers[0].max_position_embeddings - - @property - def num_attention_heads(self): - """Returns the num_attention_heads of the model.""" - return self.layers[0].num_attention_heads - - @property - def num_kv_heads(self): - """Returns the num_key_value_heads of the model.""" - return self.layers[0].num_kv_heads if self.layers[0].num_kv_heads > 0 else self.num_attention_heads - - @property - def head_size(self): - """Returns the head_size of the model.""" - return self.layers[0].kv_channels - - @property - def hidden_act(self): - """Returns the hidden_act of the model.""" - return self.layers[0].mlp.hidden_act diff --git a/nemo/export/trt_llm/model_config_trt.py b/nemo/export/trt_llm/model_config_trt.py deleted file mode 100644 index 635f6ae4d807..000000000000 --- a/nemo/export/trt_llm/model_config_trt.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -from pathlib import Path -from typing import List, Union - -from nemo.export.trt_llm.model_config import ModelConfig -from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder - - -def model_config_to_tensorrt_llm( - model_configs: List[ModelConfig], - engine_dir: Union[str, Path], - world_size: int = 1, - max_input_len: int = 200, - max_output_len: int = 200, - max_batch_size: int = 1, - max_beam_width: int = 1, - max_prompt_embedding_table_size: int = 0, - use_inflight_batching: bool = False, - paged_kv_cache: bool = False, - enable_context_fmha: bool = True, - enable_multi_block_mode: bool = False, - use_refit: bool = False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, -): - """The API to convert a torch or huggingface model represented as ModelConfig to tensorrt_llm. - - Args: - model_configs: The list of ModelConfig converted, 1 for each GPU. - engine_dir: The target output directory to save the built tensorrt_llm engines. - gpus: the number of inference gpus for multi gpu inferencing. - max_input_len: The max input sequence length. - max_output_len: The max output sequence length. - max_batch_size: The max batch size. - max_beam_width: The max beam search width. - max_prompt_embedding_table_size: max size of the prompt embedding table. - use_inflight_batching (bool): if True, enables inflight batching for TensorRT-LLM Triton backend. - paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. - enable_context_fmha (bool): if True, use fused Context MultiHeadedAttention. - enable_multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. 
- """ - engine_dir = Path(engine_dir) - if os.path.exists(engine_dir): - shutil.rmtree(engine_dir) - - for rank in range(world_size): - model_configs[rank].use_prompt_tuning = max_prompt_embedding_table_size > 0 - model_configs[rank].max_lora_rank = max_lora_rank - builder = LMHeadModelBuilder(model_configs[rank]) - builder.build( - output_dir=engine_dir, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - parallel_build=False, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_refit=use_refit, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 7598b3f6825f..aa2a29888703 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -23,21 +23,6 @@ weights_dict = {} -def cpu_map_location(storage, loc): - return storage.cpu() - - -def gpu_map_location(storage, loc): - if loc.startswith("cuda"): - training_gpu_idx = int(loc.split(":")[1]) - inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() - return storage.cuda(inference_gpu_idx) - elif loc.startswith("cpu"): - return storage.cpu() - else: - raise ValueError(f"Not handled {loc}") - - def save_val(val, dir, key, tp_num=None): suffix = "" if tp_num is None else f".{tp_num}.bin" # Transpose linear layer weights to the correct shape. @@ -411,137 +396,3 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t global weights_dict return weights_dict - - -# Similar to split_save_weight but done on GPU for performance -@torch.no_grad() -def save_weight_torch(tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config): - def save_tranpose(val, key, shared=False): - if shared or tp_rank is None: - suffix = "bin" - else: - suffix = f"{tp_rank}.bin" - - # Transpose linear layer weights to the correct shape. 
- assert torch.is_tensor(val) - if len(val.shape) >= 2: - val = val.reshape(val.shape[0], -1) - val = torch.transpose(val, 0, 1) - val = val.contiguous().to("cpu", non_blocking=True) - - if type(saved_dir) is dict: - saved_dir[f"model.{key}.{suffix}"] = val - else: - global weights_dict - weights_dict[f"model.{key}.{suffix}"] = val - - use_attention_nemo_shape = config.get("use_attention_nemo_shape", False) - split_gated_activation = config.get("split_gated_activation", False) - num_attention_heads = config.get("num_attention_heads", 0) - tp_size = config.get("tp_size", 1) - num_kv_heads = config.get("num_kv_heads", num_attention_heads) - - if not isinstance(vals, list): - vals = [vals] - - if config.get("transpose_weights", False) and vals[0].ndim == 2: - vals = [val.T for val in vals] - if "layernorm.weight" in key and config.get("apply_layernorm_1p", False): - vals = [val + 1.0 for val in vals] - - gpu_vals = [val.to(storage_type) for val in vals] - gpu_val = gpu_vals[0] - - if ( - "input_layernorm.weight" in key - or "input_layernorm.bias" in key - or "pre_mlp_layernorm.weight" in key - or "pre_mlp_layernorm.bias" in key - or "attention.dense.bias" in key - or "attention.linear_proj.bias" in key - or "post_attention_layernorm.weight" in key - or "post_attention_layernorm.bias" in key - or "post_self_attn_layernorm.weight" in key - or "mlp.dense_4h_to_h.bias" in key - or "mlp.linear_fc2.bias" in key - or "final_layernorm.weight" in key - or "final_layernorm.bias" in key - ): - if "post_self_attn_layernorm.weight" in key: - key = key.replace("post_self_attn_layernorm.weight", "post_attention_layernorm.weight") - elif "mlp.linear_fc2.bias" in key: - key = key.replace("mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias") - elif "attention.linear_proj.bias" in key: - key = key.replace("attention.linear_proj.bias", "attention.dense.bias") - - save_tranpose(gpu_val, key, shared=True) - elif ( - "attention.dense.weight" in key - or "mlp.dense_4h_to_h.weight" in key - or "attention.linear_proj.weight" in key - or "mlp.linear_fc2.weight" in key - ): - if "attention.linear_proj.weight" in key: - key = key.replace("attention.linear_proj.weight", "attention.dense.weight") - elif "mlp.linear_fc2.weight" in key: - key = key.replace("mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight") - save_tranpose(gpu_val, key) - elif ( - "mlp.dense_h_to_4h.weight" in key - or "mlp.dense_h_to_4h.bias" in key - or "mlp.linear_fc1.weight" in key - or "mlp.linear_fc1.bias" in key - ): - if split_gated_activation: - val, gate = torch.chunk(gpu_val, 2, axis=-1) - else: - val, gate = None, None - - if "mlp.linear_fc1" in key: - key = key.replace("mlp.linear_fc1", "mlp.dense_h_to_4h") - - save_tranpose(val, key) - - if split_gated_activation: - prefix, dot, suffix = key.rpartition(".") - key = prefix + ".gate" + dot + suffix - save_tranpose(gate, key) - - elif "mlp.dense_h_to_4h_2.weight" in key or "mlp.dense_h_to_4h_2.bias" in key: - save_tranpose(gpu_val, key) - - elif "attention.query_key_value.bias" in key or "attention.linear_qkv.bias" in key: - raise NotImplementedError("Attention QKV bias not implemented") - - elif "attention.query_key_value.weight" in key or "attention.linear_qkv.weight" in key: - assert use_attention_nemo_shape, "Only support NEMO shape for QKV weights" - hidden_dim = vals[0].shape[0] - size_per_head = hidden_dim // num_attention_heads - q_num = num_attention_heads // num_kv_heads - - len_vals = len(vals) - gpu_val = gpu_val.reshape(hidden_dim, num_kv_heads * len_vals // tp_size, q_num + 2, 
size_per_head) - - # Split the QKV to separate variables. - # [qqqqkkvv] - > [qqqq,kk,vv] - qkv = torch.split(gpu_val, [q_num, 1, 1], dim=2) - split_vals = torch.concatenate( - [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1 - ) - - if "attention.linear_qkv.weight" in key: - key = key.replace("attention.linear_qkv.weight", "attention.query_key_value.weight") - save_tranpose(split_vals, key) - - elif ( - "attention.query.weight" in key - or "attention.query.bias" in key - or "attention.key_value.weight" in key - or "attention.key_value.bias" in key - ): - pass - else: - print(f"[WARNING] {key} not handled by converter") - - global weights_dict - return weights_dict diff --git a/nemo/export/trt_llm/nemo/nemo.py b/nemo/export/trt_llm/nemo/nemo.py index c3564f1c4e8e..6276de5dddd9 100644 --- a/nemo/export/trt_llm/nemo/nemo.py +++ b/nemo/export/trt_llm/nemo/nemo.py @@ -23,11 +23,25 @@ from transformers import FalconConfig, GPT2Config, LlamaConfig from nemo.export.tarutils import TarPath -from nemo.export.trt_llm.nemo.convert import cpu_map_location, gpu_map_location LOGGER = logging.getLogger("NeMo") +def cpu_map_location(storage, loc): + return storage.cpu() + + +def gpu_map_location(storage, loc): + if loc.startswith("cuda"): + training_gpu_idx = int(loc.split(":")[1]) + inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() + return storage.cuda(inference_gpu_idx) + elif loc.startswith("cpu"): + return storage.cpu() + else: + raise ValueError(f"Not handled {loc}") + + def nemo_to_llm_config(nemo_model_config, vocab_size, eos_id, bos_id, decoder_type): convertion_dict = { "activation_function": "activation", diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index 8112bb8755e3..d83129b43fab 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -35,7 +35,7 @@ from transformers import AutoTokenizer, GPT2Tokenizer, LlamaConfig from nemo.export.tarutils import TarPath, ZarrPathStore -from nemo.export.trt_llm.nemo.convert import save_weight_torch, split_and_save_weight +from nemo.export.trt_llm.nemo.convert import split_and_save_weight from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir, extract_layers_with_prefix, nemo_to_llm_config from nemo.export.trt_llm.nemo.sentencepiece_tokenizer import SentencePieceTokenizer diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index d735cab36b00..7e687ce020da 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -14,20 +14,16 @@ import argparse -import copy import csv import datetime import logging import os -import shutil import sys -import tempfile from pathlib import Path from typing import Dict, List, Tuple, Union import numpy as np import tensorrt_llm -from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig @@ -35,24 +31,79 @@ from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer from nemo.export.tarutils import TarPath -from nemo.export.trt_llm.decoder import DECODER_MODEL_TYPE -from nemo.export.trt_llm.model_config import ( - LAYERNORM_DEFAULT, - LAYERNORM_RMS, - LINEAR_COLUMN, - DecoderLayerConfig, - EmbeddingConfig, - LayernormConfig, - LinearConfig, - ModelConfig, -) from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir -from 
nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint, convert_nemo_model -from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, get_tensor_parallel_group, split +from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint + + +DECODER_MODEL_TYPE = { + "gptj": 'GPTForCausalLM', + "gptnext": 'GPTForCausalLM', + "llama": 'LLaMAForCausalLM', + "gemma": 'GemmaForCausalLM', + "falcon": 'FalconForCausalLM', +} LOGGER = logging.getLogger("NeMo") +def prompt_convert(prompt_config, prompt_weights): + if "task_templates" in prompt_config: + prompt_templates = prompt_config["task_templates"] + actual_task_id = 0 + vtokens_embeddings = [] + vtokens_len = [] + for task_name_id, prompt_task in enumerate(prompt_templates): + prompt_task_name = prompt_task["taskname"] + LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") + prompt_task_weights = prompt_weights["prompt_table"].get( + f"prompt_table.{prompt_task_name}.prompt_embeddings.weight" + ) + if prompt_task_weights is None: + continue + vtokens_embeddings.append(prompt_task_weights) + vtokens_len.append(prompt_task_weights.shape[0]) + actual_task_id += 1 + + max_vtoken_len = max(vtokens_len) + embedding_dim = vtokens_embeddings[0].shape[1] + + # pad tasks to longest task embedding table + for i, vtoken_emb_table in enumerate(vtokens_embeddings): + padded_table = torch.zeros((max_vtoken_len, embedding_dim)) + padded_table[: vtoken_emb_table.shape[0], :] = vtoken_emb_table + vtokens_embeddings[i] = padded_table + + vtokens_embeddings = torch.stack(vtokens_embeddings) + else: + vtokens_embeddings = prompt_weights["prompt_embeddings_weights"] + + return vtokens_embeddings + + +def is_nemo_file(path): + flag = False + + if path is not None: + if len(path) > 5: + pc = pathlib.Path(path) + if pc.exists(): + if pc.is_file(): + if path[-5 : len(path)] == ".nemo": + flag = True + + return flag + + +def split(v, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx]) + else: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) + + def _nemo_llm_decode( in_file: str, out_dir: str, @@ -123,83 +174,6 @@ def get_tokenzier(tokenizer_dir_or_path: Path) -> PreTrainedTokenizer: return build_tokenizer(tokenizer_config) -def nemo_llm_to_model_config( - in_file: str, - decoder_type: str, - nemo_export_dir: Union[str, Path], - dtype: str = "bfloat16", - tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, - save_nemo_model_config: bool = False, -) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: - """Converts the NEMO file and construct the `ModelConfig` before tensorrt_llm deployment.""" - dtype_str = dtype - - weights_dict, llm_model_config, tokenizer = _nemo_llm_decode( - in_file=in_file, - out_dir=nemo_export_dir, - tensor_parallelism=tensor_parallel_size, - processes=1, - storage_type=dtype_str, - load_checkpoints_on_gpu=False, - decoder_type=decoder_type, - save_nemo_model_config=save_nemo_model_config, - ) - - world_size = tensor_parallel_size * pipeline_parallel_size - model_config_template = ModelConfig() - model_config_template.dtype = dtype_str - - str_dtype_to_trt(dtype_str) - - model_configs = [] - for i in range(world_size): - - model_configs.append(copy.deepcopy(model_config_template)) - - model_configs[i].vocab_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wte")) - - 
model_configs[i].positional_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wpe")) - - model_configs[i].final_layernorm = LayernormConfig( - weight=get_tensor_from_dict(weights_dict, "final_layernorm.weight"), - bias=get_tensor_from_dict(weights_dict, "final_layernorm.bias"), - ) - model_configs[i].final_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_model_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - model_configs[i].mapping = tensorrt_llm.Mapping( - world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size - ) - - for i in range(llm_model_config.n_layer): - for j in range(world_size): - model_configs[j].layers.append( - DecoderLayerConfig.from_nemo( - weights_dict=weights_dict, - llm_config=llm_model_config, - decoder_type=decoder_type, - layer_id=i, - rank=model_configs[j].mapping.tp_rank, - is_mcore=llm_model_config.is_mcore, - ) - ) - - lm_head_weight = get_tensor_from_dict(weights_dict, "lm_head.weight") - - if model_configs[0].vocab_size_padded != model_configs[0].vocab_size: - pad_width = model_configs[0].vocab_size_padded - model_configs[0].vocab_size - lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) - - for i in range(world_size): - model_configs[i].lm_head = LinearConfig(linear_type=LINEAR_COLUMN) - model_configs[i].lm_head.weight = np.ascontiguousarray( - split(lm_head_weight, model_configs[i].mapping.tp_size, model_configs[i].mapping.tp_rank) - ) - - return model_configs, tokenizer - - def to_word_list_format( word_dict: List[List[str]], tokenizer=None, @@ -258,83 +232,6 @@ def to_word_list_format( return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) -def nemo_llm_model_to_model_config( - nemo_model: str, - decoder_type: str, - nemo_model_config: str, - dtype_str: str = "float32", -) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: - """Converts the NEMO model object and construct the `ModelConfig` before tensorrt_llm deployment.""" - from megatron.core import parallel_state - - assert nemo_model_config is not None, "gpt_model_config must be provided when in is a nemo model" - - weights_dict, llm_model_config = convert_nemo_model(nemo_model, nemo_model_config, dtype_str, decoder_type) - is_mcore = nemo_model_config.get("mcore_gpt", False) - llm_model_config.is_mcore = is_mcore - - model_config = ModelConfig() - model_config.use_prompt_tuning = False - model_config.dtype = dtype_str - model_config.use_parallel_embedding = True - str_dtype_to_trt(dtype_str) - - model_config.vocab_embedding = EmbeddingConfig(weight=get_tensor_from_dict(weights_dict, "wte"), is_local=True) - - model_config.positional_embedding = EmbeddingConfig( - weight=get_tensor_from_dict(weights_dict, "wpe"), is_local=True - ) - - model_config.final_layernorm = LayernormConfig( - weight=get_tensor_from_dict(weights_dict, "final_layernorm.weight"), - bias=get_tensor_from_dict(weights_dict, "final_layernorm.bias"), - ) - model_config.final_layernorm.layernorm_type = ( - LAYERNORM_RMS if isinstance(llm_model_config, LlamaConfig) else LAYERNORM_DEFAULT - ) - - tensor_parallel_size = nemo_model_config.tensor_model_parallel_size - pipeline_parallel_size = 1 - world_size = tensor_parallel_size * pipeline_parallel_size - - # hack since tensorrt_llm doesnt support DP natively so init all ranks with DP=1 - model_config.mapping = tensorrt_llm.Mapping( - world_size=tensor_parallel_size * pipeline_parallel_size, - rank=tensorrt_llm.mpi_rank() % world_size, - 
tp_size=tensor_parallel_size, - pp_size=pipeline_parallel_size, - ) - model_config.mapping.rank = tensorrt_llm.mpi_rank() - model_config.mapping.tp_group = get_tensor_parallel_group(tensor_parallel_size) - - LOGGER.info( - f'''Resharing: Rank {tensorrt_llm.mpi_rank()} mapping: - tp_rank {parallel_state.get_tensor_model_parallel_rank()} -> {model_config.mapping.tp_rank}, - pp_rank {parallel_state.get_pipeline_model_parallel_rank()} -> {model_config.mapping.pp_rank}, - tp_group {model_config.mapping.tp_group}''' - ) - - for i in range(llm_model_config.n_layer): - model_config.layers.append( - DecoderLayerConfig.from_nemo( - weights_dict=weights_dict, - llm_config=llm_model_config, - decoder_type=decoder_type, - layer_id=i, - rank=model_config.mapping.tp_rank, - is_mcore=llm_model_config.is_mcore, - ) - ) - lm_head_weight = get_tensor_from_dict(weights_dict, "lm_head.weight") - - assert model_config.vocab_size_padded == model_config.vocab_size - - model_config.lm_head = LinearConfig(linear_type=LINEAR_COLUMN) - model_config.lm_head.weight = lm_head_weight - - return [model_config] - - def nemo_to_trtllm_config( in_file: str, decoder_type: str, diff --git a/nemo/export/trt_llm/quantization_utils.py b/nemo/export/trt_llm/quantization_utils.py deleted file mode 100644 index 86365f774bb7..000000000000 --- a/nemo/export/trt_llm/quantization_utils.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np -from tensorrt_llm.layers import Linear, RowLinear -from tensorrt_llm.quantization.layers import FP8Linear, FP8RowLinear, Int8SmoothQuantLinear, Int8SmoothQuantRowLinear - -from nemo.export.trt_llm.model_config import ( - QUANTIZATION_FP8, - QUANTIZATION_INT8_SQ, - QUANTIZATION_NONE, - LinearConfig, - ModelConfig, -) - - -def quantize_linear(tensorrt_llm_layer, quantization: str, layer_config: LinearConfig): - """Returns the quantized tensorrt_llm linear layer.""" - if quantization == QUANTIZATION_NONE: - return tensorrt_llm_layer - - if quantization == QUANTIZATION_FP8: - # FP8 is not sensitive to scaling factors. So we just quantize all layers possible. 
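The `nemo_llm_to_model_config` function removed above pads the LM head along the vocabulary dimension and then splits it column-parallel, so each tensor-parallel rank receives an equally sized shard. A minimal standalone sketch of that step, with made-up sizes and no TensorRT-LLM dependency, could look like this:

```python
import numpy as np

hidden_size = 8
vocab_size = 50257   # illustrative GPT-style vocabulary size
tp_size = 2

# Pad the vocabulary dimension up to a multiple of tp_size so the split is even.
vocab_size_padded = ((vocab_size + tp_size - 1) // tp_size) * tp_size
pad_width = vocab_size_padded - vocab_size

lm_head_weight = np.random.rand(vocab_size, hidden_size).astype(np.float32)
if pad_width > 0:
    # Zero-fill the extra rows, matching the constant padding in the removed code.
    lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)

# Column-parallel split: each tensor-parallel rank owns a contiguous block of vocab rows.
shards = [np.ascontiguousarray(s) for s in np.split(lm_head_weight, tp_size, axis=0)]
assert all(s.shape == (vocab_size_padded // tp_size, hidden_size) for s in shards)
```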
- default_scaling_factor = np.array([1], dtype=np.float32) - if layer_config.activation_scaling_factor is None: - layer_config.activation_scaling_factor = default_scaling_factor - if layer_config.weights_scaling_factor is None: - layer_config.weights_scaling_factor = default_scaling_factor - - if layer_config.activation_scaling_factor is None or layer_config.weights_scaling_factor is None: - print(f"No valid scaling factors in {tensorrt_llm_layer._get_name()}, skipping quantization" " on this layer") - return tensorrt_llm_layer - else: - assert np.all(layer_config.activation_scaling_factor > 0) - assert np.all(layer_config.weights_scaling_factor > 0) - - bias = tensorrt_llm_layer.bias is not None - - linear_layer_type = type(tensorrt_llm_layer) - if linear_layer_type == Linear: - if quantization == QUANTIZATION_FP8: - linear = FP8Linear - elif quantization == QUANTIZATION_INT8_SQ: - linear = Int8SmoothQuantLinear - else: - assert False, f"{quantization} is not supported." - quantized_linear_layer = linear( - in_features=tensorrt_llm_layer.in_features, - out_features=tensorrt_llm_layer.out_features * tensorrt_llm_layer.tp_size, - bias=bias, - dtype=tensorrt_llm_layer.dtype, - tp_group=tensorrt_llm_layer.tp_group, - tp_size=tensorrt_llm_layer.tp_size, - gather_output=tensorrt_llm_layer.gather_output, - ) - elif linear_layer_type == RowLinear: - if quantization == QUANTIZATION_FP8: - row_linear = FP8RowLinear - elif quantization == QUANTIZATION_INT8_SQ: - row_linear = Int8SmoothQuantRowLinear - else: - assert False, f"{quantization} is not supported." - quantized_linear_layer = row_linear( - in_features=tensorrt_llm_layer.in_features * tensorrt_llm_layer.tp_size, - out_features=tensorrt_llm_layer.out_features, - bias=bias, - dtype=tensorrt_llm_layer.dtype, - tp_group=tensorrt_llm_layer.tp_group, - tp_size=tensorrt_llm_layer.tp_size, - ) - else: - assert False, f"{linear_layer_type} is not supported." - - quantized_linear_layer.weight = tensorrt_llm_layer.weight - quantized_linear_layer.bias = tensorrt_llm_layer.bias - - quantized_linear_layer.activation_scaling_factor.value = layer_config.activation_scaling_factor - quantized_linear_layer.weights_scaling_factor.value = layer_config.weights_scaling_factor - - if hasattr(quantized_linear_layer, "prequant_scaling_factor"): - quantized_linear_layer.prequant_scaling_factor.value = layer_config.prequant_scaling_factor - - return quantized_linear_layer - - -def naive_quantization(config: ModelConfig, quantization: str): - """Generates a constant scaling factor (1) with target quantization. - - This is for debugging and performance measurement only. - """ - config.quantization = quantization - # Here the scaling factor is not inversed. 
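The `quantize_linear` helper being deleted here is essentially a two-way dispatch: the replacement class depends on the original layer type (`Linear` vs `RowLinear`) and on the quantization mode (FP8 vs INT8 SmoothQuant). The table below is a hypothetical restatement of that dispatch using placeholder class names rather than the real TensorRT-LLM layers:

```python
# Placeholder names standing in for the real TensorRT-LLM quantized layer classes.
QUANTIZED_LINEAR_CLASSES = {
    ("Linear", "fp8"): "FP8Linear",
    ("Linear", "int8_sq"): "Int8SmoothQuantLinear",
    ("RowLinear", "fp8"): "FP8RowLinear",
    ("RowLinear", "int8_sq"): "Int8SmoothQuantRowLinear",
}


def pick_quantized_class(layer_type: str, quantization: str) -> str:
    """Return the name of the quantized replacement for a given linear layer type."""
    try:
        return QUANTIZED_LINEAR_CLASSES[(layer_type, quantization)]
    except KeyError:
        raise ValueError(f"{quantization} is not supported for {layer_type}") from None


assert pick_quantized_class("RowLinear", "fp8") == "FP8RowLinear"
```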
- # In nvidia systems: - # pytorch_quantization uses inv scale - # onnx & trt uses non-inv scale - # cask uses inv scale - default_scaling_factor = np.array([1], dtype=np.float32) - - if quantization == QUANTIZATION_FP8: - for layer in config.layers: - linear_layers = [ - layer.attention.qkv, - layer.attention.dense, - layer.mlp.fc, - layer.mlp.proj, - layer.mlp.gate, - ] - for linear_layer in linear_layers: - if linear_layer: - linear_layer.activation_scaling_factor = default_scaling_factor - linear_layer.weights_scaling_factor = default_scaling_factor - config.lm_head.activation_scaling_factor = default_scaling_factor - config.lm_head.weights_scaling_factor = default_scaling_factor - - else: - assert False, f"{quantization} not supported" diff --git a/nemo/export/trt_llm/tensor_utils.py b/nemo/export/trt_llm/tensor_utils.py deleted file mode 100644 index 2fce81b91647..000000000000 --- a/nemo/export/trt_llm/tensor_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict - -import numpy as np -import tensorrt as trt -import tensorrt_llm -import torch - - -def torch_to_numpy_with_dtype(tensor, dtype=trt.float16): - """Converts a torch tensor to numpy array with the dtype.""" - if dtype == trt.float16: - torch_dtype = torch.float16 - elif dtype == trt.float32: - torch_dtype = torch.float32 - elif dtype == trt.bfloat16: - torch_dtype = torch.bfloat16 - else: - assert False, f"{dtype} not supported" - return tensorrt_llm._utils.torch_to_numpy(tensor.detach().to(torch_dtype)) - - -def split(v, tp_size, idx, dim=0): - """Splits the np tensor v on dim and return the idx's slice.""" - if tp_size == 1: - return v - if len(v.shape) == 1: - return np.ascontiguousarray(np.split(v, tp_size)[idx]) - else: - return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) - - -def get_tensor_parallel_group(tensor_parallel: int): - """Returns the tensor_parallel_group config based on tensor_parallel.""" - from mpi4py import MPI - - mpi_rank = MPI.COMM_WORLD.Get_rank() - offset = mpi_rank - mpi_rank % tensor_parallel - tp_group = list(range(offset, offset + tensor_parallel)) - return None if tensor_parallel == 1 else tp_group - - -def get_tensor_from_dict(weights_dict: Dict[str, np.ndarray], name: str) -> np.array: - """Loads tensor from the weights_dict.""" - return weights_dict.get(f"model.{name}.bin", None) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 2336b8eb38ce..30490cc91254 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -20,10 +20,7 @@ from pathlib import Path from typing import List -import tensorrt as trt import tensorrt_llm -import torch -from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._common import check_max_num_tokens from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import BuildConfig, Builder @@ -41,323 +38,6 @@ LOGGER = 
logging.getLogger("NeMo") -def get_engine_name(model, dtype, tp_size, pp_size, rank): - """Returns the engine file name based on the provided info.""" - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, pp_size, rank) - - -def serialize_engine(engine, path): - """Serializes the engine to path.""" - logger.info(f"Serializing engine to {path}...") - tik = time.time() - with open(path, "wb") as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) - logger.info(f"Engine serialized. Total time: {t}") - - -def refit_runtime_engine(params, cuda_engine): - ''' - @brief: Inplace refit one TensorRT cuda engine using weights from the network, - user should guarantee that the engine is built with REFIT flag, and the network has the same structure with the engine. - @param engine_buffer: A serialized TensorRT engine. - @param network: Network object. - @return: A serialized TRT engine if refit successfully, None otherwise - ''' - logger.info(f'Refit runtime engine') - tik = time.time() - - # Refit engine - assert params is not None - refitter = trt.Refitter(cuda_engine, logger.trt_logger) - for name, param in params: - trt_param = trt.Weights(np_dtype_to_trt(param._value.dtype), param._value.ctypes.data, param._value.size) - - if trt_param is None or not refitter.set_named_weights(name, trt_param): - logger.error(f'Failed to refit weight: {name}') - return None - - if not refitter.refit_cuda_engine(): - logger.error(f'Failed to refit engine.') - return None - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of refitting {cuda_engine.name}: {t}') - - return cuda_engine - - -def build_rank_engine( - tensorrt_llm_gpt, - builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, - args, -): - - str_dtype_to_trt(args.dtype) - ootb = os.getenv("OOTB", False) - - network = builder.create_network() - network.trt_network.name = engine_name - - # We have to use the attention plugin for most of the models. 
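For reference, the removed `get_engine_name` helper produced engine file names of the form shown below; the model name, dtype, and parallelism values are examples only.

```python
def get_engine_name(model, dtype, tp_size, pp_size, rank):
    """Engine file naming used by the removed build path (pp size is omitted when it is 1)."""
    if pp_size == 1:
        return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank)
    return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, pp_size, rank)


print(get_engine_name("NeMo", "float16", 2, 1, 0))   # NeMo_float16_tp2_rank0.engine
print(get_engine_name("NeMo", "bfloat16", 4, 2, 3))  # NeMo_bfloat16_tp4_pp2_rank3.engine
```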
- if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin(dtype=args.use_gpt_attention_plugin) - - if not ootb: - network.plugin_config.use_custom_all_reduce = False - - if args.use_gemm_plugin: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - if args.use_layernorm_plugin: - network.plugin_config.set_layernorm_plugin(dtype=args.use_layernorm_plugin) - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled_with_fp32_acc) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - else: - network.plugin_config.remove_input_padding = False - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache() - else: - network.plugin_config.paged_kv_cache = False - if args.use_ib_gpt_attention_plugin: - network.plugin_config.set_inflight_batching_gpt_attention_plugin(dtype=args.use_ib_gpt_attention_plugin) - if args.enable_multi_block_mode: - network.plugin_config.enable_mmha_multi_block_mode() - - if args.use_lora_plugin: - network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) - - if args.use_lookup_plugin: - # Use the plugin for the embedding parallelism and sharing - network.plugin_config.set_lookup_plugin(dtype=args.dtype) - else: - LOGGER.warning("Build engine in OOTB mode, disable all plugins except nccl.") - - if args.mapping.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype) - - with net_guard(network): - # Prepare - network.set_named_parameters(tensorrt_llm_gpt.named_parameters()) - - # Forward - inputs = tensorrt_llm_gpt.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_new_tokens=args.max_input_len + args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - paged_kv_cache=args.paged_kv_cache, - tokens_per_block=args.tokens_per_block, - prompt_embedding_table_size=args.max_prompt_embedding_table_size, - lora_target_modules=args.lora_target_modules, - ) - tensorrt_llm_gpt(*inputs) - - # Network -> Engine - engine = builder.build_engine(network, builder_config) - if args.mapping.rank == 0 or args.use_refit: - config_path = args.output_dir / "config.json" - builder.save_config(builder_config, config_path) - return engine - - -def _build_impl(tensorrt_llm_model, args): - torch.cuda.set_device(args.mapping.rank % args.gpus_per_node) - tensorrt_llm.logger.set_level(args.log_level) - args.output_dir.mkdir(parents=True, exist_ok=True) - timing_cache_file = args.timing_cache if args.timing_cache else args.output_dir / "model.cache" - timing_cache = timing_cache_file - - if args.use_lora_plugin is not None: - add_lora(tensorrt_llm_model, args.max_lora_rank) - - builder = Builder() - apply_query_key_layer_scaling = False - - builder_config = builder.create_builder_config( - name=MODEL_NAME, - precision=args.dtype, - timing_cache=timing_cache, - tensor_parallel=args.mapping.tp_size, - pipeline_parallel=args.mapping.pp_size, - world_size=args.mapping.tp_size * args.mapping.pp_size, - parallel_build=args.parallel_build, - num_layers=tensorrt_llm_model._num_layers, - num_heads=tensorrt_llm_model._num_heads, - num_kv_heads=tensorrt_llm_model._num_kv_heads, - head_size=tensorrt_llm_model._head_size, - hidden_size=tensorrt_llm_model._hidden_size, - vocab_size=tensorrt_llm_model._vocab_size, - 
hidden_act=tensorrt_llm_model.hidden_act, - max_position_embeddings=tensorrt_llm_model.max_position_embeddings, - add_bos=tensorrt_llm_model._add_bos, - apply_query_key_layer_scaling=apply_query_key_layer_scaling, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_beam_width=args.max_beam_width, - max_num_tokens=None, - max_draft_len=0, - int8="int8" in args.quantization, - opt_level=args.builder_opt, - paged_kv_cache=args.paged_kv_cache, - tokens_per_block=args.tokens_per_block, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - use_parallel_embedding=args.use_parallel_embedding, - embedding_sharding_dim=args.embedding_sharding_dim, - fp8="fp8" in args.quantization, - use_refit=args.use_refit, - gather_context_logits=False, - gather_generation_logits=False, - quant_mode=args.quant_mode, - lora_target_modules=args.lora_target_modules, - max_lora_rank=args.max_lora_rank, - ) - - tp_size = args.mapping.tp_size - pp_size = args.mapping.pp_size - rank = args.mapping.rank - engine_name = get_engine_name(MODEL_NAME, args.dtype, tp_size, pp_size, rank) - engine = build_rank_engine(tensorrt_llm_model, builder, builder_config, engine_name, args) - assert engine is not None, f"Failed to build engine for rank {rank}" - - serialize_engine(engine, args.output_dir / engine_name) - - if args.mapping.rank == 0: - ok = builder.save_timing_cache(builder_config, timing_cache_file) - assert ok, "Failed to save timing cache." - - -def build( - tensorrt_llm_model, - output_dir: Path, - mapping=None, - dtype="float16", - timing_cache="", - log_level="info", - max_batch_size=1, - max_input_len=200, - max_output_len=200, - max_beam_width=1, - max_prompt_embedding_table_size=0, - parallel_build=False, - gpus_per_node=1, - quantization=None, - use_inflight_batching=False, - paged_kv_cache=False, - enable_context_fmha: bool = True, - enable_multi_block_mode=False, - use_refit=False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, -): - """Builds the tensorrt_llm_model to engine.""" - args = argparse.Namespace() - args.mapping = mapping - args.dtype = dtype - args.timing_cache = timing_cache - args.log_level = log_level - args.max_batch_size = max_batch_size - args.max_input_len = max_input_len - args.max_output_len = max_output_len - args.max_beam_width = max_beam_width - args.use_gpt_attention_plugin = dtype - args.use_gemm_plugin = dtype - args.use_layernorm_plugin = False - args.parallel_build = parallel_build - args.enable_context_fmha = enable_context_fmha - args.enable_context_fmha_fp32_acc = False - args.gpus_per_node = gpus_per_node - args.builder_opt = None - args.output_dir = Path(output_dir) - args.remove_input_padding = True - args.use_smooth_quant = False - args.use_weight_only = False - args.weight_only_precision = "int8" - args.per_channel = False - args.per_token = False - args.int8_kv_cache = False - args.random_seed = None - args.paged_kv_cache = paged_kv_cache - args.max_prompt_embedding_table_size = max_prompt_embedding_table_size - args.use_inflight_batching = use_inflight_batching - args.use_ib_gpt_attention_plugin = False - args.use_parallel_embedding = False - args.embedding_sharding_dim = 0 - args.use_lookup_plugin = False - args.tokens_per_block = 64 - args.quantization = quantization - args.enable_multi_block_mode = enable_multi_block_mode - args.use_refit = use_refit - args.use_lora_plugin = use_lora_plugin - args.lora_target_modules = 
lora_target_modules - args.max_lora_rank = max_lora_rank - - logger.set_level(args.log_level) - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." - - if args.use_ib_gpt_attention_plugin: - logger.warning( - "use_ib_gpt_attention_plugin is deprecated. Use combination of" - " --use_gpt_attention_plugin=dtype --use_inflight_batching instead." - ) - - if args.use_inflight_batching: - assert args.use_gpt_attention_plugin, "You have to use GPT attention plugin for in-flight batching mode" - - if not args.paged_kv_cache: - logger.warning("Paged kv cache feature will enabled for in-flight batching mode.") - args.paged_kv_cache = True - - if not args.remove_input_padding: - logger.warning("Remove input padding feature will enabled for in-flight batching mode.") - args.remove_input_padding = True - - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only(args.weight_only_precision == "int4") - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - - if args.random_seed is not None: - torch.manual_seed(args.random_seed) - - if args.mapping.is_first_pp_rank(): - if tensorrt_llm_model._modules['vocab_embedding'].tp_size > 1: - args.use_parallel_embedding = True - args.embedding_sharding_dim = tensorrt_llm_model._modules['vocab_embedding'].sharding_dim - - tik = time.time() - _build_impl(tensorrt_llm_model, args) - - tok = time.time() - t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) - logger.info(f"Total time of building all {args.mapping.world_size} engines: {t}") - - def build_and_save_engine( max_input_len=1024, max_output_len=1024, diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py deleted file mode 100644 index f4b44552af63..000000000000 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
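The argument handling in the removed `build` wrapper couples several flags: SmoothQuant and INT8 weight-only are mutually exclusive, and in-flight batching forces both the paged KV cache and input-padding removal. A plain-Python sketch of that normalization (flag names kept for readability, no TensorRT-LLM imports) is:

```python
def normalize_build_flags(use_smooth_quant=False, use_weight_only=False,
                          use_inflight_batching=False, paged_kv_cache=False,
                          remove_input_padding=True):
    """Mirror the flag coupling enforced by the removed build() wrapper."""
    if use_smooth_quant and use_weight_only:
        raise ValueError("You cannot enable both SmoothQuant and INT8 weight-only together.")
    if use_inflight_batching:
        # In-flight batching requires the paged KV cache and removal of input padding.
        paged_kv_cache = True
        remove_input_padding = True
    return paged_kv_cache, remove_input_padding


assert normalize_build_flags(use_inflight_batching=True, paged_kv_cache=False) == (True, True)
```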
- - -from pathlib import Path -from typing import List - -import numpy as np -import torch -from tensorrt_llm import default_net, str_dtype_to_trt -from tensorrt_llm._utils import trt_dtype_to_str -from tensorrt_llm.functional import expand_mask, gather_last_token_logits, recv, send, shape -from tensorrt_llm.layers import AttentionParams, ColumnLinear, KeyValueCacheParams, LoraParams -from tensorrt_llm.models.generation_mixin import GenerationMixin -from tensorrt_llm.module import Module, ModuleList - -from nemo.export.trt_llm.decoder import build_decoder_layer -from nemo.export.trt_llm.model_config import DECODER_GEMMA, DECODER_LLAMA, ModelConfig -from nemo.export.trt_llm.quantization_utils import quantize_linear -from nemo.export.trt_llm.tensorrt_llm_build import build -from nemo.export.trt_llm.tensorrt_llm_utils import ( - build_embedding_from_config, - build_layernorm_from_config, - print_tensorrt_llm, -) - - -def get_transformer_layers(mapping, num_layers): - layers_per_pipeline_stage = num_layers // mapping.pp_size - layers_range = list( - range(mapping.pp_rank * layers_per_pipeline_stage, (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1) - ) - return layers_range - - -class ModelBuilder(Module): - def __init__(self, model_config: ModelConfig): - super().__init__() - self.quantization = model_config.quantization - self.max_position_embeddings = model_config.max_position_embeddings - self.hidden_act = model_config.hidden_act - - self._dtype = str_dtype_to_trt(model_config.dtype) - self._kv_dtype = self._dtype - self._tensor_parallel = model_config.mapping.tp_size - self._vocab_size = model_config.vocab_size - self._hidden_size = model_config.hidden_size - self._num_layers = len(model_config.layers) - self._num_heads = model_config.num_attention_heads - self._num_kv_heads = model_config.num_kv_heads - self._head_size = ( - model_config.hidden_size // model_config.num_attention_heads - if model_config.head_size is None - else model_config.head_size - ) - self._use_prompt_tuning = model_config.use_prompt_tuning - self._add_bos = model_config.layers[0].decoder_type in (DECODER_GEMMA, DECODER_LLAMA) - self._mapping = model_config.mapping - self.rank = model_config.mapping.rank - self.max_lora_rank = model_config.max_lora_rank - - if self._mapping.is_first_pp_rank(): - self.vocab_embedding = build_embedding_from_config( - model_config.vocab_embedding, - self._dtype, - use_prompt_tuning=self._use_prompt_tuning, - tensor_parallel=model_config.mapping.tp_size, - tensor_parallel_rank=model_config.mapping.tp_rank, - ) - - if model_config.positional_embedding.weight is not None: - self.positional_embedding = build_embedding_from_config( - model_config.positional_embedding, - self._dtype, - tensor_parallel=model_config.mapping.tp_size, - tensor_parallel_rank=model_config.mapping.tp_rank, - ) - - self.layers = [] - for layer_id in get_transformer_layers(self._mapping, self._num_layers): - model_config.layers[layer_id].max_lora_rank = self.max_lora_rank - self.layers.append( - build_decoder_layer( - model_config.layers[layer_id], - layer_id, - self._num_layers, - dtype=self._dtype, - quantization=model_config.quantization, - rank=self.rank, - tensor_parallel=self._tensor_parallel, - tp_group=model_config.mapping.tp_group, - ) - ) - - self.layers = ModuleList(self.layers) - - if self._mapping.is_last_pp_rank(): - self.ln_f = build_layernorm_from_config(model_config.final_layernorm, self._dtype) - - def forward( - self, - input_ids, - position_ids, - use_cache=False, - attention_mask=None, - 
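The `get_transformer_layers` helper in the deleted `tensorrt_llm_model.py` assigns each pipeline-parallel rank a contiguous block of decoder layers. A worked example of that arithmetic (ranks passed directly instead of a Mapping object, layer counts invented) is:

```python
def get_transformer_layers(pp_rank: int, pp_size: int, num_layers: int):
    """Contiguous layer ids owned by pipeline rank `pp_rank`; assumes num_layers % pp_size == 0."""
    layers_per_stage = num_layers // pp_size
    return list(range(pp_rank * layers_per_stage, (pp_rank + 1) * layers_per_stage))


# 8 decoder layers split across 2 pipeline stages:
assert get_transformer_layers(0, 2, 8) == [0, 1, 2, 3]
assert get_transformer_layers(1, 2, 8) == [4, 5, 6, 7]
```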
kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - inflight_batching_args=None, - hidden_states=None, - lora_params=None, - ): - ptuning_args = [] - if self._use_prompt_tuning: - ptuning_args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size] - - if self._mapping.is_first_pp_rank(): - x = self.vocab_embedding(input_ids, *ptuning_args) - if hasattr(self, "positional_embedding") and self.positional_embedding: - assert position_ids - x = x + self.positional_embedding(position_ids) - hidden_states = x - else: - hidden_states = recv(hidden_states, self._mapping.prev_pp_rank()) - - kv_cache_params.fill_none_tensor_list(len(self.layers)) - - if use_cache: - presents = [] - - if attention_mask is not None: - attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - - for layer_idx, (layer, past) in enumerate( - zip( - self.layers, - kv_cache_params.past_key_value, - ) - ): - - decoder_params = { - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "use_cache": use_cache, - "kv_cache_params": KeyValueCacheParams( - past_key_value=[past], - host_past_key_value_lengths=kv_cache_params.host_past_key_value_lengths, - kv_cache_block_pointers=kv_cache_params.kv_cache_block_pointers, - host_max_attention_window_sizes=kv_cache_params.host_max_attention_window_sizes, - cache_indirection=kv_cache_params.cache_indirection, - host_sink_token_length=kv_cache_params.host_sink_token_length, - host_kv_cache_block_pointers=kv_cache_params.host_kv_cache_block_pointers, - ), - "attention_params": attention_params, - } - - if lora_params.lora_ranks is not None: - decoder_params["lora_layer_params"] = lora_params.get_layer_params(layer_idx) - - hidden_states = layer(**decoder_params) - - if use_cache: - presents.append(hidden_states[1]) - hidden_states = hidden_states[0] - - if self._mapping.is_last_pp_rank(): - hidden_states = self.ln_f(hidden_states) - else: - hidden_states = send(hidden_states, self._mapping.next_pp_rank()) - - if use_cache: - return hidden_states, tuple(presents) - return hidden_states - - -class LMHeadModelBuilder(ModelBuilder, GenerationMixin): - def __init__(self, model_config: ModelConfig): - super().__init__(model_config) - - if self._mapping.is_last_pp_rank(): - self.lm_head = ColumnLinear( - self._hidden_size, - model_config.vocab_size_padded, - bias=False, - dtype=self._dtype, - tp_group=self._mapping.tp_group, - tp_size=self._tensor_parallel, - gather_output=True, - share_weight=None, - ) - self.lm_head.weight.value = model_config.lm_head.weight - if model_config.quantization: - self.lm_head = quantize_linear(self.lm_head, model_config.quantization, model_config.lm_head) - - def forward( - self, - input_ids, - position_ids, - use_cache=False, - last_token_ids=None, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - inflight_batching_args=None, - hidden_states=None, - lora_params=None, - ): - - hidden_states = super().forward( - input_ids, - position_ids, - use_cache, - attention_mask, - kv_cache_params, - attention_params, - prompt_embedding_table, - prompt_tasks, - prompt_vocab_size, - inflight_batching_args, - hidden_states, - lora_params, - ) - - if use_cache: - hidden_states, presents = hidden_states - - if self._mapping.is_last_pp_rank(): - assert last_token_ids is not None, "Expecting last token ids to be not None" - hidden_states = gather_last_token_logits( - 
hidden_states, last_token_ids, default_net().plugin_config.remove_input_padding - ) - - # [batch_size, hidden_size] -> [batch_size, vocab_size] - lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output("logits", str_dtype_to_trt("float16")) - else: - hidden_states.mark_output('hidden_states_output', self._dtype) - - if use_cache: - if not default_net().plugin_config.paged_kv_cache: - for i, present in zip(self._mapping.pp_layers(self._num_layers), presents): - present.mark_output(f'present_key_value_{i}', self._kv_dtype) - if self._mapping.is_last_pp_rank(): - return (lm_logits, presents) - return (hidden_states, presents) - else: - if self._mapping.is_last_pp_rank(): - return lm_logits - return hidden_states - - def prepare_inputs( - self, - max_batch_size, - max_input_len, - max_new_tokens, - use_cache=True, - max_beam_width: int = 1, - paged_kv_cache: bool = False, - tokens_per_block: int = 64, - prompt_embedding_table_size: int = 0, - lora_target_modules: List[str] = None, - ): - - # Prepare inputs - head_size = self._head_size - num_heads_kv = self._num_kv_heads - remove_input_padding = default_net().plugin_config.remove_input_padding - use_gpt_attention_plugin = default_net().plugin_config.gpt_attention_plugin - use_gemm_plugin = default_net().plugin_config.gemm_plugin - use_custom_all_reduce = default_net().plugin_config.use_custom_all_reduce - use_lora_plugin = default_net().plugin_config.lora_plugin - - model_inputs = self.prepare_basic_inputs( - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - max_input_len=max_input_len, - max_seq_len=max_new_tokens, - num_kv_heads=num_heads_kv, - head_size=head_size, - num_layers=self._num_layers, - kv_dtype=self._kv_dtype, - remove_input_padding=remove_input_padding, - use_gpt_attention_plugin=use_gpt_attention_plugin, - use_gemm_plugin=use_gemm_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - gather_context_logits=False, - gather_generation_logits=False, - dtype=self._dtype, - num_heads=self._num_heads, - mapping=self._mapping, - max_num_tokens=None, - prompt_embedding_table_size=prompt_embedding_table_size, - position_encoding_2d=False, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_draft_len=0, - use_custom_all_reduce=use_custom_all_reduce, - ) - - inflight_batching_args = None - - return ( - model_inputs["input_ids"], - model_inputs["position_ids"], - use_cache, - model_inputs["last_token_ids"], - model_inputs["attention_mask"], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs['host_past_key_value_lengths'], - host_max_attention_window_sizes=model_inputs['host_max_attention_window_sizes'], - kv_cache_block_pointers=model_inputs['kv_cache_block_pointers'], - host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers'], - cache_indirection=model_inputs['cache_indirection'], - host_sink_token_length=model_inputs['host_sink_token_length'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - ), - model_inputs['prompt_embedding_table'], - model_inputs['tasks'], - model_inputs['prompt_vocab_size'], - inflight_batching_args, - model_inputs["hidden_states_input"], - LoraParams( - model_inputs['lora_ranks'], - model_inputs['lora_weights_pointers'], - 
host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - ), - ) - - def build( - self, - output_dir: Path, - timing_cache: str = "", - log_level: str = "info", - max_batch_size: int = 1, - max_input_len: int = 200, - max_output_len: int = 200, - max_beam_width: int = 1, - parallel_build: bool = False, - max_prompt_embedding_table_size: int = 0, - use_inflight_batching: bool = False, - paged_kv_cache: bool = False, - enable_context_fmha: bool = True, - enable_multi_block_mode: bool = False, - use_refit: bool = False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, - ): - - if self.rank > torch.cuda.device_count(): - print(f"warning: Rank {self.rank} larger than GPUs available ({torch.cuda.device_count()})") - - build( - tensorrt_llm_model=self, - output_dir=output_dir, - mapping=self._mapping, - dtype=trt_dtype_to_str(self._dtype), - timing_cache=timing_cache, - log_level=log_level, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_beam_width=max_beam_width, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - parallel_build=parallel_build, - gpus_per_node=torch.cuda.device_count(), - quantization=self.quantization, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_refit=use_refit, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) - - def print(self): - np.set_printoptions(threshold=36) - print_tensorrt_llm(f"rank.{self.rank}", self) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 1bdfd5237caf..f79d6ddce4bc 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -25,16 +25,11 @@ import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group -from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder - -from nemo.export.trt_llm.tensorrt_llm_build import get_engine_name, MODEL_NAME, refit_runtime_engine # isort:skip from nemo.export.trt_llm.nemo_utils import to_word_list_format # isort:skip @@ -330,110 +325,6 @@ def load( ) -def load_refit( - tokenizer, - engine_dir: str, - lora_ckpt_list: List[str] = None, - num_beams: int = 1, - model_configs: List = None, - stream=None, -) -> TensorrtLLMHostContext: - """Loaded the compiled LLM model and run it. - - It also supports running the TRT LLM model on multi-GPU. 
- """ - - config_path = os.path.join(engine_dir, "config.json") - with open(config_path, "r") as f: - config = json.load(f) - """The impl of `load` API for on a single GPU worker.""" - tensorrt_llm.logger.set_level("error") - - engine_dir = Path(engine_dir) - config_path = engine_dir / "config.json" - - ( - model_config, - world_size, - tensor_parallel_size, - pipeline_parallel_size, - dtype, - max_input_len, - max_batch_size, - ) = _read_config(config_path) - - runtime_rank = torch.cuda.current_device() - assert runtime_rank < torch.cuda.device_count(), f"Rank {runtime_rank} out of bound" - - # Manipulate the tensorrt_llm mapping to make it compatible with the multiprocessed env. - assert tensorrt_llm.mpi_world_size() == torch.distributed.get_world_size(), "MPI world size mismatch" - runtime_mapping = tensorrt_llm.Mapping( - world_size=tensorrt_llm.mpi_world_size(), - rank=runtime_rank, - tp_size=tensorrt_llm.mpi_world_size(), - pp_size=1, - ) - - engine_name = get_engine_name( - MODEL_NAME, dtype, tensor_parallel_size, pipeline_parallel_size, tensorrt_llm.mpi_rank() - ) - - logger.info(f"Loading engine: Rank ({tensorrt_llm.mpi_rank()} -> {engine_dir}/{engine_name}") - - serialize_path = os.path.join(engine_dir, engine_name) - with open(serialize_path, "rb") as f: - engine_buffer = f.read() - - decoder = tensorrt_llm.runtime.GenerationSession( - model_config, engine_buffer, runtime_mapping, debug_mode=False, stream=stream - ) - runtime_mapping.rank = runtime_rank - runtime_mapping.tp_group = get_tensor_parallel_group( - tensor_parallel_size - ) # Override the tp_group to support TP+DP - runtime_mapping.tp_rank = runtime_rank - runtime_mapping.tp_size = tensor_parallel_size - runtime_mapping.pp_group = [runtime_rank] - runtime_mapping.pp_rank = 0 - - sampling_config = SamplingConfig(end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams) - - if decoder.use_lora_plugin: - lora_manager = LoraManager() - if lora_ckpt_list is not None: - lora_manager.load_from_nemo( - model_files=lora_ckpt_list, - model_config=model_config, - runtime_mapping=runtime_mapping, - ) - else: - lora_manager = None - - # create a new builder and refit the current engine - new_builder = LMHeadModelBuilder(model_configs[0]) - engine = decoder.runtime.engine - refit_runtime_engine(new_builder.named_parameters(), engine) - - # Initialize the global context so it can be used during `run` API. - global tensorrt_llm_worker_context - tensorrt_llm_worker_context.decoder = decoder - tensorrt_llm_worker_context.sampling_config = sampling_config - tensorrt_llm_worker_context.lora_manager = lora_manager - tensorrt_llm_worker_context.max_batch_size = max_batch_size - tensorrt_llm_worker_context.max_input_len = max_input_len - - max_batch_size = config["builder_config"]["max_batch_size"] - max_input_len = config["builder_config"]["max_input_len"] - - return TensorrtLLMHostContext( - executor=None, - world_size=world_size, - tokenizer=tokenizer, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - ) - - def forward( input_tensors: List[torch.IntTensor], max_output_len: int, diff --git a/nemo/export/trt_llm/tensorrt_llm_utils.py b/nemo/export/trt_llm/tensorrt_llm_utils.py deleted file mode 100644 index b732daca2525..000000000000 --- a/nemo/export/trt_llm/tensorrt_llm_utils.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -import tensorrt as trt -from tensorrt_llm.layers import Embedding, LayerNorm, PromptTuningEmbedding, RmsNorm -from tensorrt_llm.module import Module - -from nemo.export.trt_llm.model_config import LAYERNORM_DEFAULT, LAYERNORM_RMS, EmbeddingConfig, LayernormConfig -from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group - -LOGGER = logging.getLogger("NeMo") - - -def build_embedding_from_config( - config: EmbeddingConfig, - dtype: trt.DataType, - tensor_parallel: int = 1, - tensor_parallel_rank: int = 0, - use_prompt_tuning: bool = False, -): - """Returns the tensorrt_llm embedding layer from the embedding config.""" - # If the config is empty, return an empty impl. - if config is None: - return None - EmbeddingCls = PromptTuningEmbedding if use_prompt_tuning else Embedding - - trt_embedding = EmbeddingCls( - config.weight.shape[0] * tensor_parallel, - config.weight.shape[1], - dtype=dtype, - tp_size=tensor_parallel, - tp_rank=tensor_parallel_rank, - tp_group=get_tensor_parallel_group(tensor_parallel), - ) - trt_embedding.weight.value = config.weight - return trt_embedding - - -def build_layernorm_from_config(config: LayernormConfig, dtype: trt.DataType): - """Returns the tensorrt_llm layernorm layer from the torch layernorm.""" - # If the config is empty, return an empty impl. - if config is None: - return None - - if config.layernorm_type == LAYERNORM_DEFAULT: - trt_layernorm = LayerNorm(normalized_shape=config.weight.shape[0], dtype=dtype) - trt_layernorm.weight.value = config.weight - trt_layernorm.bias.value = config.bias - elif config.layernorm_type == LAYERNORM_RMS: - trt_layernorm = RmsNorm(normalized_shape=config.weight.shape[0], dtype=dtype) - trt_layernorm.weight.value = config.weight - else: - raise NotImplementedError(f"{config.layernorm_type} not supported") - return trt_layernorm - - -def print_tensorrt_llm(name: str, tensorrt_llm_module: Module): - """Prints the tensorrt llm structure including weights and related data for debugging purpose.""" - for tensor_name in [ - "weight", - "bias", - "activation_scaling_factor", - "weights_scaling_factor", - "prequant_scaling_factor", - ]: - if hasattr(tensorrt_llm_module, tensor_name): - tensor = getattr(tensorrt_llm_module, tensor_name) - if tensor is not None: - LOGGER.info(f"{name}.{tensor_name}:{tensor._value.dtype}:{tensor._value.shape}:\n{tensor._value}") - - for k, v in tensorrt_llm_module.named_children(): - print_tensorrt_llm(f"{name}.{k}({v._get_name()})", v) diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py deleted file mode 100644 index 0f9fb66313b9..000000000000 --- a/nemo/export/trt_llm/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import pathlib -import numpy as np -import torch - -log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" -logging.basicConfig(format=log_format) -LOGGER = logging.getLogger("NeMo") - -# numpy doesn't know bfloat16, define abstract binary type instead -np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) - - -def prompt_convert(prompt_config, prompt_weights): - if "task_templates" in prompt_config: - prompt_templates = prompt_config["task_templates"] - actual_task_id = 0 - vtokens_embeddings = [] - vtokens_len = [] - for task_name_id, prompt_task in enumerate(prompt_templates): - prompt_task_name = prompt_task["taskname"] - LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") - prompt_task_weights = prompt_weights["prompt_table"].get( - f"prompt_table.{prompt_task_name}.prompt_embeddings.weight" - ) - if prompt_task_weights is None: - continue - vtokens_embeddings.append(prompt_task_weights) - vtokens_len.append(prompt_task_weights.shape[0]) - actual_task_id += 1 - - max_vtoken_len = max(vtokens_len) - embedding_dim = vtokens_embeddings[0].shape[1] - - # pad tasks to longest task embedding table - for i, vtoken_emb_table in enumerate(vtokens_embeddings): - padded_table = torch.zeros((max_vtoken_len, embedding_dim)) - padded_table[: vtoken_emb_table.shape[0], :] = vtoken_emb_table - vtokens_embeddings[i] = padded_table - - vtokens_embeddings = torch.stack(vtokens_embeddings) - else: - vtokens_embeddings = prompt_weights["prompt_embeddings_weights"] - - return vtokens_embeddings - - -def cpu_map_location(storage, loc): - return storage.cpu() - - -def is_nemo_file(path): - flag = False - - if path is not None: - if len(path) > 5: - pc = pathlib.Path(path) - if pc.exists(): - if pc.is_file(): - if path[-5 : len(path)] == ".nemo": - flag = True - - return flag From 4a263e7f257d7e04f8e5d71756abeb4d9f4cfc60 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 14:05:18 -0700 Subject: [PATCH 39/47] add large model stable training fix and contrastive loss update for variable seq (#9259) (#9348) * add stable training fix and contrastive loss update for variable seq length input * Apply isort and black reformatting * replace remove_bias with use_bias --------- Signed-off-by: Nithin Rao Koluguri Signed-off-by: nithinraok Co-authored-by: Nithin Rao Co-authored-by: nithinraok --- .../fastconformer_ctc_bpe_streaming.yaml | 1 + .../fastconformer_ctc_char_streaming.yaml | 1 + ...astconformer_transducer_bpe_streaming.yaml | 1 + ...stconformer_transducer_char_streaming.yaml | 1 + ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 1 + ..._hybrid_transducer_ctc_char_streaming.yaml | 1 + ...stconformer_hybrid_transducer_ctc_bpe.yaml | 1 + ...tconformer_hybrid_transducer_ctc_char.yaml | 1 + .../fast-conformer-long_ctc_bpe.yaml | 1 + .../fast-conformer-long_transducer_bpe.yaml | 1 + .../ssl/fastconformer/fast-conformer.yaml | 1 + .../asr/losses/ssl_losses/contrastive.py | 23 +++--- .../asr/modules/conformer_encoder.py | 4 ++ .../asr/parts/submodules/conformer_modules.py | 71 
++++++++++++++----- .../parts/submodules/multi_head_attention.py | 41 +++++++---- 15 files changed, 110 insertions(+), 40 deletions(-) diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml index a59a2628cd2f..acb499f18ffb 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -80,6 +80,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml index 8f8f7e40e39a..8dd978bb00e4 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -78,6 +78,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml index 69b21b496ddd..9f199c2dd488 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -85,6 +85,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml index 8fd096525e74..c7f83216aa0b 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -84,6 +84,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 
b0965b580d5b..6f356ce91caa 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -90,6 +90,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml index 9c144d22edec..870bb0190c03 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -88,6 +88,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml index 69e4546b77a7..3fc91cc1e436 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml @@ -87,6 +87,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml index ea98d13e62da..e99ba69df57a 100644 --- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml @@ -85,6 +85,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index 2fab24fa6373..3e3d2bf6788e 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -88,6 +88,7 @@ model: feat_out: 
-1 # you may set it if you need different output size other than the default d_model n_layers: 18 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index 4d5f4dbcbd06..5f6c37288ae9 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -90,6 +90,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml index 47ad5aa458ca..6e7b5e107629 100644 --- a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml +++ b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml @@ -79,6 +79,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/nemo/collections/asr/losses/ssl_losses/contrastive.py b/nemo/collections/asr/losses/ssl_losses/contrastive.py index bab691913c0a..16a70925ac9b 100644 --- a/nemo/collections/asr/losses/ssl_losses/contrastive.py +++ b/nemo/collections/asr/losses/ssl_losses/contrastive.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from math import ceil + import torch import torch.nn.functional as F from torch import nn @@ -25,8 +27,7 @@ class ContrastiveLoss(Loss): @property def input_types(self): - """Input types definitions for Contrastive. 
- """ + """Input types definitions for Contrastive.""" return { "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()), "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), @@ -147,13 +148,17 @@ def sample_negatives(self, y, num): @typecheck() def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None): - spec_in = spectrograms.transpose(-2, -1) + targets = spectrograms.transpose(-2, -1) masks = spec_masks.transpose(-2, -1) - targets = spec_in # BxTxC + diff = int(ceil(targets.shape[1] / decoder_outputs.shape[1]) * decoder_outputs.shape[1]) - targets.shape[1] + + if diff > 0: + targets = F.pad(targets, (0, 0, 0, diff)) + masks = F.pad(masks, (0, 0, 0, diff)) - targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1) - masks = masks.reshape(targets.shape[0], targets.shape[1], -1) + targets = targets.reshape(targets.shape[0], decoder_outputs.shape[1], -1) + masks = masks.reshape(targets.shape[0], decoder_outputs.shape[1], -1) if self.quantized_targets: if self.store_ids: @@ -198,7 +203,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non if self.sample_from_non_masked: # sample from all steps in utterance negatives, _ = self.sample_negatives( - targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T' + targets.transpose(0, 1), + targets_masked_only.size(0), # TxBxC # T' ) else: # only sample from masked steps in utterance @@ -239,7 +245,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non elif self.sample_from_non_masked: # sample from all steps in batch negatives, _ = self.sample_negatives( - targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC + targets.reshape(targets.shape[0] * targets.shape[1], -1), + targets_masked_only.size(0), # BTxC ) # T' else: # only sample from masked steps diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index d0e014e42a37..d8f0e58833f7 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -118,6 +118,8 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to None. conv_dual_mode (bool): specifies if convolution should be dual mode when dual_offline mode is being used. When enables, the left half of the convolution kernel would get masked in streaming cases. Defaults to False + use_bias (bool): Use bias in all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. dropout (float): the dropout rate used in all layers except the attention layers Defaults to 0.1. 
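The contrastive-loss change above pads the spectrogram targets along time so they can be reshaped into exactly as many steps as the decoder produced, which is what makes variable-length inputs work. A worked example of the padding arithmetic, with invented shapes, is sketched below:

```python
from math import ceil

import torch
import torch.nn.functional as F

batch, time_in, feat = 2, 101, 80
decoder_steps = 25  # stands in for decoder_outputs.shape[1] in the loss above

targets = torch.randn(batch, time_in, feat)

# Pad time up to the next multiple of the decoder length: ceil(101 / 25) * 25 = 125.
diff = int(ceil(targets.shape[1] / decoder_steps) * decoder_steps) - targets.shape[1]
if diff > 0:
    targets = F.pad(targets, (0, 0, 0, diff))  # right-pad the time dimension

# Each decoder step now owns an equal slice of (125 // 25) * 80 = 400 target features.
targets = targets.reshape(batch, decoder_steps, -1)
assert targets.shape == (2, 25, 400)
```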
dropout_pre_encoder (float): the dropout rate used before the encoder @@ -282,6 +284,7 @@ def __init__( conv_kernel_size=31, conv_norm_type='batch_norm', conv_context_size=None, + use_bias=True, dropout=0.1, dropout_pre_encoder=0.1, dropout_emb=0.1, @@ -426,6 +429,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, att_context_size=self.att_context_size, + use_bias=use_bias, ) self.layers.append(layer) diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index aed6cc16245c..efd23ef44628 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -56,6 +56,8 @@ class ConformerLayer(torch.nn.Module, AdapterModuleMixin, AccessMixin): conv_kernel_size (int): kernel size for depthwise convolution in convolution module dropout (float): dropout probabilities for linear layers dropout_att (float): dropout probabilities for attention distributions + use_bias (bool): Apply bias to all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. """ def __init__( @@ -75,6 +77,7 @@ def __init__( pos_bias_u=None, pos_bias_v=None, att_context_size=[-1, -1], + use_bias=True, ): super(ConformerLayer, self).__init__() @@ -84,7 +87,7 @@ def __init__( # first feed forward module self.norm_feed_forward1 = LayerNorm(d_model) - self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) # convolution module self.norm_conv = LayerNorm(d_model) @@ -93,6 +96,7 @@ def __init__( kernel_size=conv_kernel_size, norm_type=conv_norm_type, conv_context_size=conv_context_size, + use_bias=use_bias, ) # multi-headed self-attention module @@ -107,6 +111,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) elif self_attention_model == 'rel_pos_local_attn': self.self_attn = RelPositionMultiHeadAttentionLongformer( @@ -120,10 +125,15 @@ def __init__( global_tokens=global_tokens, global_tokens_spacing=global_tokens_spacing, global_attn_separate=global_attn_separate, + use_bias=use_bias, ) elif self_attention_model == 'abs_pos': self.self_attn = MultiHeadAttention( - n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att, max_cache_len=MHA_max_cache_len + n_head=n_heads, + n_feat=d_model, + dropout_rate=dropout_att, + max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) else: raise ValueError( @@ -133,7 +143,7 @@ def __init__( # second feed forward module self.norm_feed_forward2 = LayerNorm(d_model) - self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) self.dropout = nn.Dropout(dropout) self.norm_out = LayerNorm(d_model) @@ -280,16 +290,25 @@ class ConformerConvolution(nn.Module): pointwise_activation (str): name of the activation function to be used for the pointwise conv. Note that Conformer uses a special key `glu_` which is treated as the original default from the paper. + use_bias (bool): Use bias in all Linear and Conv1d layers improve activation flow and stabilize training of huge models. 
+ Defaults to True """ def __init__( - self, d_model, kernel_size, norm_type='batch_norm', conv_context_size=None, pointwise_activation='glu_' + self, + d_model, + kernel_size, + norm_type='batch_norm', + conv_context_size=None, + pointwise_activation='glu_', + use_bias=True, ): super(ConformerConvolution, self).__init__() assert (kernel_size - 1) % 2 == 0 self.d_model = d_model self.kernel_size = kernel_size self.norm_type = norm_type + self.use_bias = use_bias if conv_context_size is None: conv_context_size = (kernel_size - 1) // 2 @@ -305,7 +324,12 @@ def __init__( dw_conv_input_dim = d_model self.pointwise_conv1 = nn.Conv1d( - in_channels=d_model, out_channels=d_model * 2, kernel_size=1, stride=1, padding=0, bias=True + in_channels=d_model, + out_channels=d_model * 2, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) self.depthwise_conv = CausalConv1D( @@ -315,7 +339,7 @@ def __init__( stride=1, padding=conv_context_size, groups=dw_conv_input_dim, - bias=True, + bias=self.use_bias, ) if norm_type == 'batch_norm': @@ -334,7 +358,12 @@ def __init__( self.activation = Swish() self.pointwise_conv2 = nn.Conv1d( - in_channels=dw_conv_input_dim, out_channels=d_model, kernel_size=1, stride=1, padding=0, bias=True + in_channels=dw_conv_input_dim, + out_channels=d_model, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) def forward(self, x, pad_mask=None, cache=None): @@ -370,31 +399,34 @@ def forward(self, x, pad_mask=None, cache=None): return x, cache def reset_parameters_conv(self): - pw1_max = pw2_max = self.d_model ** -0.5 - dw_max = self.kernel_size ** -0.5 + pw1_max = pw2_max = self.d_model**-0.5 + dw_max = self.kernel_size**-0.5 with torch.no_grad(): nn.init.uniform_(self.pointwise_conv1.weight, -pw1_max, pw1_max) - nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) nn.init.uniform_(self.pointwise_conv2.weight, -pw2_max, pw2_max) - nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) nn.init.uniform_(self.depthwise_conv.weight, -dw_max, dw_max) - nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) + if self.use_bias: + nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) + nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) + nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) class ConformerFeedForward(nn.Module): """ feed-forward module of Conformer model. + use_bias (bool): Apply bias to all Linear and Conv1d layers improve activation flow and stabilize training of huge models. 
""" - def __init__(self, d_model, d_ff, dropout, activation=Swish()): + def __init__(self, d_model, d_ff, dropout, activation=Swish(), use_bias=True): super(ConformerFeedForward, self).__init__() self.d_model = d_model self.d_ff = d_ff - self.linear1 = nn.Linear(d_model, d_ff) + self.use_bias = use_bias + self.linear1 = nn.Linear(d_model, d_ff, bias=self.use_bias) self.activation = activation self.dropout = nn.Dropout(p=dropout) - self.linear2 = nn.Linear(d_ff, d_model) + self.linear2 = nn.Linear(d_ff, d_model, bias=self.use_bias) def forward(self, x): x = self.linear1(x) @@ -404,10 +436,11 @@ def forward(self, x): return x def reset_parameters_ff(self): - ffn1_max = self.d_model ** -0.5 - ffn2_max = self.d_ff ** -0.5 + ffn1_max = self.d_model**-0.5 + ffn2_max = self.d_ff**-0.5 with torch.no_grad(): nn.init.uniform_(self.linear1.weight, -ffn1_max, ffn1_max) - nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) nn.init.uniform_(self.linear2.weight, -ffn2_max, ffn2_max) - nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) + if self.use_bias: + nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) + nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index 6a866a617f35..19d713405953 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -55,21 +55,23 @@ class MultiHeadAttention(nn.Module): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to remove bias in linear and conv layers """ - def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0, use_bias=True): """Construct an MultiHeadedAttention object.""" super(MultiHeadAttention, self).__init__() self.cache_drop_size = None + self.use_bias = use_bias assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head self.s_d_k = math.sqrt(self.d_k) self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_v = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_out = nn.Linear(n_feat, n_feat, bias=use_bias) self.dropout = nn.Dropout(p=dropout_rate) self._max_cache_len = max_cache_len @@ -161,11 +163,18 @@ class RelPositionMultiHeadAttention(MultiHeadAttention): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ - def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0, use_bias=True): """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=max_cache_len) + super().__init__( + n_head=n_head, + n_feat=n_feat, + dropout_rate=dropout_rate, + max_cache_len=max_cache_len, + use_bias=use_bias, + ) # linear transformation for positional encoding self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) # these two 
learnable biases are used in matrix c and matrix d @@ -253,7 +262,7 @@ def forward(self, query, key, value, mask, pos_emb, cache=None): class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): """Multi-Head Attention layer of Transformer-XL with sliding window local+global attention from Longformer. Partially adapted from allenai (https://github.com/allenai/longformer/blob/master/longformer/sliding_chunks.py) - and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) + and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) Paper: https://arxiv.org/abs/1901.02860 (Transformer-XL), https://arxiv.org/abs/2004.05150 (Longformer) Args: @@ -267,6 +276,7 @@ class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): global_tokens (int): number of tokens to be used for global attention global_tokens_spacing (int): how far apart the global tokens are global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ def __init__( @@ -281,6 +291,7 @@ def __init__( global_tokens=0, global_tokens_spacing=1, global_attn_separate=False, + use_bias=True, ): """Construct an RelPositionMultiHeadAttentionLongformer object.""" super().__init__( @@ -290,6 +301,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=max_cache_len, + use_bias=use_bias, ) self.att_context_size = att_context_size self.global_tokens = global_tokens @@ -297,9 +309,9 @@ def __init__( self.global_attn_separate = global_attn_separate if self.global_attn_separate: - self.global_q = nn.Linear(n_feat, n_feat) - self.global_k = nn.Linear(n_feat, n_feat) - self.global_v = nn.Linear(n_feat, n_feat) + self.global_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_v = nn.Linear(n_feat, n_feat, bias=use_bias) def forward(self, query, key, value, pad_mask, pos_emb, cache=None): """Compute Scaled Dot Product Local Attention with rel. positional encoding. 
using overlapping chunks @@ -650,7 +662,8 @@ def _compute_out_global_to_all( global_attn_scores = global_attn_scores.transpose(1, 2) global_attn_scores = global_attn_scores.masked_fill( - is_index_masked.transpose(2, 3), torch.finfo(global_attn_scores.dtype).min, + is_index_masked.transpose(2, 3), + torch.finfo(global_attn_scores.dtype).min, ) global_attn_scores = global_attn_scores.view(batch_size * self.h, max_num_global_attn_indices, seq_len) @@ -747,7 +760,9 @@ def _get_invalid_locations_mask(self, w: int, device: str): return mask.bool().to(device), ending_mask def mask_invalid_locations( - self, input_tensor: torch.Tensor, w: int, + self, + input_tensor: torch.Tensor, + w: int, ): """ Mask locations invalid for the sliding window attention From 006bd7f0614f963aea09cee4ffcff25afa8dd0db Mon Sep 17 00:00:00 2001 From: jgerh <163925524+jgerh@users.noreply.github.com> Date: Fri, 31 May 2024 14:35:15 -0700 Subject: [PATCH 40/47] Nemo readme revisions (#9129) * REvisions to NeMo ReadMe * NeMo Readme.rst revisions * Update README.rst Co-authored-by: Eric Harper Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> * ReadMe updates * ReadMe Updates * Updates to NeMo Readme with new license information * NeMo Framework ReadMe Revisions Updates Signed-off-by: Jennifer Gerhold * NeMo Framework ReadMe Revisions 2 Signed-off-by: Jennifer Gerhold --------- Signed-off-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Eric Harper Signed-off-by: Jennifer Gerhold Co-authored-by: Eric Harper --- README.rst | 287 ++++++++++++++++++++++++++--------------------------- 1 file changed, 143 insertions(+), 144 deletions(-) diff --git a/README.rst b/README.rst index 121c82b8590f..4a68acc286cd 100644 --- a/README.rst +++ b/README.rst @@ -108,57 +108,51 @@ Latest News Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and PyTorch developers -working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), -and text-to-speech synthesis (TTS). -The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia -to more easily implement and design new generative AI models by being able to leverage existing code and pretrained models. +NVIDIA NeMo Framework is a scalable and cloud-native generative AI framework built for researchers and PyTorch developers working on Large Language Models (LLMs), Multimodal Models (MMs), Automatic Speech Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV) domains. It is designed to help you efficiently create, customize, and deploy new generative AI models by leveraging existing code and pre-trained model checkpoints. For technical documentation, please see the `NeMo Framework User Guide `_. -All NeMo models are trained with `Lightning `_ and -training is automatically scalable to 1000s of GPUs. +LLMs and MMs Training, Alignment, and Customization +################################################### -When applicable, NeMo models take advantage of the latest possible distributed training techniques, -including parallelism strategies such as +All NeMo models are trained with `Lightning `_. +Training is automatically scalable to 1000s of GPUs. 
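As a rough sketch of what this scaling looks like in practice (illustrative values only; the exact arguments come from each example script's YAML config and your cluster setup), a Lightning ``Trainer`` is typically configured along these lines:

.. code-block:: python

    import pytorch_lightning as pl

    # Illustrative: 4 nodes x 8 GPUs with bf16 mixed precision.
    # NeMo example scripts build an equivalent Trainer from their YAML configs.
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=8,
        num_nodes=4,
        strategy="ddp",
        precision="bf16-mixed",
    )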
-* data parallelism -* tensor parallelism -* pipeline model parallelism -* fully sharded data parallelism (FSDP) -* sequence parallelism -* context parallelism -* mixture-of-experts (MoE) +When applicable, NeMo models leverage cutting-edge distributed training techniques, incorporating `parallelism strategies `_ to enable efficient training of very large models. These techniques include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed Precision Training with BFloat16 and FP8, as well as others. -and mixed precision training recipes with bfloat16 and FP8 training. +NeMo Transformer-based LLMs and MMs utilize `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs, while leveraging `NVIDIA Megatron Core `_ for scaling Transformer model training. -NeMo's Transformer based LLM and Multimodal models leverage `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs -and leverages `NVIDIA Megatron Core `_ for scaling transformer model training. +NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM, Direct Preference Optimization (DPO), and Reinforcement Learning from Human Feedback (RLHF). See `NVIDIA NeMo Aligner `_ for more information. -NeMo LLMs can be aligned with state of the art methods such as SteerLM, DPO and Reinforcement Learning from Human Feedback (RLHF), -see `NVIDIA NeMo Aligner `_ for more details. +In addition to supervised fine-tuning (SFT), NeMo also supports the latest parameter efficient fine-tuning (PEFT) techniques such as LoRA, P-Tuning, Adapters, and IA3. Refer to the `NeMo Framework User Guide `_ for the full list of supported models and techniques. -NeMo LLM and Multimodal models can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_. +LLMs and MMs Deployment and Optimization +######################################## -NeMo ASR and TTS models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. +NeMo LLMs and MMs can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_, in short, NIMs. -For scaling NeMo LLM and Multimodal training on Slurm clusters or public clouds, please see the `NVIDIA Framework Launcher `_. -The NeMo Framework launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and Multimodal models and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. -To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_ -The NeMo Framework Launcher does not currently support ASR and TTS training but will soon. +NeMo ASR and TTS models can be optimized for inference and deployed for production use cases with `NVIDIA Riva `_. -Getting started with NeMo is simple. -State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and +NeMo Framework Launcher +####################### + +`NeMo Framework Launcher `_ is a cloud-native tool that streamlines the NeMo Framework experience. It is used for launching end-to-end NeMo Framework training jobs on CSPs and Slurm clusters. + +The NeMo Framework Launcher includes extensive recipes, scripts, utilities, and documentation for training NeMo LLMs. It also includes the NeMo Framework `Autoconfigurator `_, which is designed to find the optimal model parallel configuration for training on a specific cluster. 
+ +To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_. The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. + +Get Started with NeMo Framework +------------------------------- + +Getting started with NeMo Framework is easy. State-of-the-art pretrained NeMo models are freely available on `Hugging Face Hub `_ and `NVIDIA NGC `_. These models can be used to generate text or images, transcribe audio, and synthesize speech in just a few lines of code. We have extensive `tutorials `_ that -can be run on `Google Colab `_ or with our `NGC NeMo Framework Container. `_ -and we have `playbooks `_ for users that want to train NeMo models with the NeMo Framework Launcher. +can be run on `Google Colab `_ or with our `NGC NeMo Framework Container `_. We also have `playbooks `_ for users who want to train NeMo models with the NeMo Framework Launcher. -For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. +For advanced users who want to train NeMo models from scratch or fine-tune existing NeMo models, we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. Key Features ------------ @@ -172,9 +166,9 @@ Key Features Requirements ------------ -1) Python 3.10 or above -2) Pytorch 1.13.1 or above -3) NVIDIA GPU, if you intend to do model training +* Python 3.10 or above +* Pytorch 1.13.1 or above +* NVIDIA GPU (if you intend to do model training) Developer Documentation ----------------------- @@ -197,54 +191,48 @@ Developer Documentation | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. `_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ - -Getting help with NeMo +Install NeMo Framework ---------------------- -FAQ can be found on NeMo's `Discussions board `_. You are welcome to ask questions or start discussions there. - - -Installation ------------- The NeMo Framework can be installed in a variety of ways, depending on your needs. Depending on the domain, you may find one of the following installation methods more suitable. -* Conda / Pip - Refer to the `Conda <#conda>`_ and `Pip <#pip>`_ sections for installation instructions. +* Conda / Pip - Refer to `Conda <#conda>`_ and `Pip <#pip>`_ for installation instructions. - * This is recommended for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. - * When using a Nvidia PyTorch container as the base, this is the recommended installation method for all domains. + * This is the recommended method for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. + * When using a Nvidia PyTorch container as the base, this is the recommended method for all domains. -* Docker Containers - Refer to the `Docker containers <#docker-containers>`_ section for installation instructions. +* Docker Containers - Refer to `Docker containers <#docker-containers>`_ for installation instructions. - * This is recommended for Large Language Models (LLM), Multimodal and Vision domains. 
- * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` - * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` + * NeMo Framework container - `nvcr.io/nvidia/nemo:24.05` -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for installation instructions. - * It's highly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +* LLMs and MMs Dependencies - Refer to `LLMs and MMs Dependencies <#install-llms-and-mms-dependencies>`_ for installation instructions. + +**Important: We strongly recommended that you start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3`** Conda -~~~~~ +^^^^^^ -We recommend installing NeMo in a fresh Conda environment. +Install NeMo in a fresh Conda environment: .. code-block:: bash conda create --name nemo python==3.10.12 conda activate nemo -Install PyTorch using their `configurator `_. +Install PyTorch using their `configurator `_: .. code-block:: bash conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system. +The command to install PyTorch may depend on your system. Use the configurator linked above to find the right command for your system. Then, install NeMo via Pip or from Source. We do not provide NeMo on the conda-forge or any other Conda channel. Pip -~~~ -Use this installation mode if you want the latest released version. +^^^ + +To install the nemo_toolkit, use the following installation method: .. code-block:: bash @@ -252,12 +240,12 @@ Use this installation mode if you want the latest released version. pip install Cython pip install nemo_toolkit['all'] -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. +Depending on the shell used, you may need to use the ``"nemo_toolkit[all]"`` specifier instead in the above command. -Pip (Domain Specific) -~~~~~~~~~~~~~~~~~~~~~ +Pip from a Specific Domain +^^^^^^^^^^^^^^^^^^^^^^^^^^ -To install only a specific domain of NeMo, use the following commands. Note: It is required to install the above pre-requisites before installing a specific domain of NeMo. +To install a specific domain of NeMo, you must first install the nemo_toolkit using the instructions listed above. Then, you run the following domain-specific commands: .. code-block:: bash @@ -267,9 +255,10 @@ To install only a specific domain of NeMo, use the following commands. Note: It pip install nemo_toolkit['vision'] pip install nemo_toolkit['multimodal'] -Pip from source -~~~~~~~~~~~~~~~ -Use this installation mode if you want the version from a particular GitHub branch (e.g main). +Pip from a Source Branch +^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want to work with a specific version of NeMo from a particular GitHub branch (e.g main), use the following installation method: .. code-block:: bash @@ -278,9 +267,10 @@ Use this installation mode if you want the version from a particular GitHub bran python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all] -From source -~~~~~~~~~~~ -Use this installation mode if you are contributing to NeMo. +Build from Source +^^^^^^^^^^^^^^^^^ + +If you want to clone the NeMo GitHub repository and contribute to NeMo open-source development work, use the following installation method: .. 
code-block:: bash @@ -289,18 +279,16 @@ Use this installation mode if you are contributing to NeMo. cd NeMo ./reinstall.sh -If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh`` -with ``pip install -e .`` when your PWD is the root of the NeMo repository. +If you only want the toolkit without the additional Conda-based dependencies, you can replace ``reinstall.sh`` with ``pip install -e .`` when your PWD is the root of the NeMo repository. -Mac computers with Apple silicon -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To install NeMo on Mac with Apple M-Series GPU: +Mac Computers with Apple Silicon +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- create a new Conda environment +To install NeMo on Mac computers with the Apple M-Series GPU, you need to create a new Conda environment, install PyTorch 2.0 or higher, and then install the nemo_toolkit. -- install PyTorch 2.0 or higher +**Important: This method is only applicable to the ASR domain.** -- run the following code: +Run the following code: .. code-block:: shell @@ -322,24 +310,22 @@ To install NeMo on Mac with Apple M-Series GPU: # Note that only the ASR toolkit is guaranteed to work on MacBook - so for MacBook use pip install 'nemo_toolkit[asr]' Windows Computers -~~~~~~~~~~~~~~~~~ - -One of the options is using Windows Subsystem for Linux (WSL). +^^^^^^^^^^^^^^^^^ -To install WSL: - -- In PowerShell, run the following code: +To install the Windows Subsystem for Linux (WSL), run the following code in PowerShell: .. code-block:: shell wsl --install # [note] If you run wsl --install and see the WSL help text, it means WSL is already installed. -Learn more about installing WSL at `Microsoft's official documentation `_. +To learn more about installing WSL, refer to `Microsoft's official documentation `_. + +After installing your Linux distribution with WSL, two options are available: -After Installing your Linux distribution with WSL: - - **Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. - - **Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. +**Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. + +**Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. Next, follow the instructions for Linux systems, as provided above. For example: @@ -351,8 +337,11 @@ Next, follow the instructions for Linux systems, as provided above. For example: ./reinstall.sh RNNT -~~~~ -Note that RNNT requires numba to be installed from conda. +^^^^ + +For optimal performance of a Recurrent Neural Network Transducer (RNNT), install the Numba package from Conda. + +Run the following code: .. code-block:: bash @@ -360,14 +349,12 @@ Note that RNNT requires numba to be installed from conda. pip uninstall numba conda install -c conda-forge numba -LLM and Multimodal Dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Install LLMs and MMs Dependencies +--------------------------------- -The LLM and Multimodal domains require three additional dependencies: -NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. +If you work with the LLM and MM domains, three additional dependencies are required: NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. When working with the `main` branch, these dependencies may require a recent commit. 
-When working with the `main` branch these dependencies may require a recent commit. -The most recent working versions of these dependencies are: +The most recent working versions of these dependencies are here: .. code-block:: bash @@ -376,11 +363,14 @@ The most recent working versions of these dependencies are: export mcore_commit=fbb375d4b5e88ce52f5f7125053068caff47f93f export nv_pytorch_tag=24.02-py3 -When using a released version of NeMo, -please refer to the `Software Component Versions `_ -for the correct versions. +When using a released version of NeMo, please refer to the `Software Component Versions `_ for the correct versions. + +PyTorch Container +^^^^^^^^^^^^^^^^^ + +We recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3. -If starting with a base NVIDIA PyTorch container first launch the container: +If starting with a base NVIDIA PyTorch container, you must first launch the container: .. code-block:: bash @@ -393,15 +383,14 @@ If starting with a base NVIDIA PyTorch container first launch the container: --ulimit stack=67108864 \ nvcr.io/nvidia/pytorch:$nv_pytorch_tag -Then install the dependencies: +Next, you need to install the dependencies. Apex -~~~~ -NeMo LLM Multimodal Domains require that NVIDIA Apex to be installed. -Apex comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need to be updated to a newer version. +^^^^ -To install Apex, run +NVIDIA Apex is required for LLM and MM domains. Although Apex is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. + +To install Apex, run the following code: .. code-block:: bash @@ -410,35 +399,32 @@ To install Apex, run git checkout $apex_commit pip install . -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm" +When attempting to install Apex separately from the NVIDIA PyTorch container, you might encounter an error if the CUDA version on your system is different from the one used to compile PyTorch. To bypass this error, you can comment out the relevant line in the setup file located in the Apex repository on GitHub here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32. -While installing Apex outside of the NVIDIA PyTorch container, -it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. -This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 +cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using. -cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using: +To install cuda-nvprof, run the following code: .. code-block:: bash conda install -c nvidia cuda-nvprof=11.8 -packaging is also needed: +Finally, install the packaging: .. code-block:: bash pip install packaging -With the latest versions of Apex, the `pyproject.toml` file in Apex may need to be deleted in order to install locally. - +To install the most recent versions of Apex locally, it might be necessary to remove the `pyproject.toml` file from the Apex directory. Transformer Engine -~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^ + +NVIDIA Transformer Engine is required for LLM and MM domains. 
Although the Transformer Engine is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -The NeMo LLM Multimodal Domains require that NVIDIA Transformer Engine to be installed. -Transformer Engine comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need Transformer Engine to be updated to a newer version. +The Transformer Engine facilitates training with FP8 precision on NVIDIA Hopper GPUs and introduces many enhancements for the training of Transformer-based models. Refer to `Transformer Enginer `_ for information. -Transformer Engine enables FP8 training on NVIDIA Hopper GPUs and many performance optimizations for transformer-based model training. -Documentation for installing Transformer Engine can be found `here `_. +To install Transformer Engine, run the following code: .. code-block:: bash @@ -451,14 +437,15 @@ Documentation for installing Transformer Engine can be found `here `_. +-------------------- + +NeMo Text Processing, specifically Inverse Text Normalization, is now a separate repository. It is located here: `https://github.com/NVIDIA/NeMo-text-processing `_. + +Docker Containers +----------------- + +NeMo containers are launched concurrently with NeMo version updates. For example, the release of NeMo ``r1.23.0`` comes with the container ``nemo:24.01.speech``. The latest containers are: + +* NeMo LLM and MM container - `nvcr.io/nvidia/nemo:24.03.framework` +* NeMo Speech container - `nvcr.io/nvidia/nemo:24.01.speech` -Docker containers -~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.23.0`` comes with container ``nemo:24.01.speech``, you may find more details about released containers in `releases page `_. +You can find additional information about released containers on the `NeMo releases page `_. -To use a pre-built container, please run +To use a pre-built container, run the following code: .. code-block:: bash docker pull nvcr.io/nvidia/nemo:24.01.speech -To build a nemo container with Dockerfile from a branch, please run +To build a nemo container with Dockerfile from a branch, run the following code: .. code-block:: bash - DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . - + DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.10-py3 and then installing from GitHub. @@ -499,25 +491,32 @@ If you choose to work with the main branch, we recommend using NVIDIA's PyTorch -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.10-py3 -Examples --------- -Many examples can be found under the `"Examples" `_ folder. +Future Work +----------- +The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. -Contributing ------------- +Discussions Board +----------------- + +FAQ can be found on the NeMo `Discussions board `_. You are welcome to ask questions or start discussions on the board. + +Contribute to NeMo +------------------ We welcome community contributions! Please refer to `CONTRIBUTING.md `_ for the process. Publications ------------- +------------------ We provide an ever-growing list of `publications `_ that utilize the NeMo Framework. -If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. 
-Please refer to the instructions in the `README of that branch `_. +To contribute an article to the collection, please submit a pull request to the ``gh-pages-src`` branch of this repository. For detailed information, please consult the README located at the `gh-pages-src branch `_. + +Licenses +-------- + +* `NeMo GitHub Apache 2.0 license `__ -License -------- -NeMo is released under an `Apache 2.0 license `_. +* NeMo is licensed under the `NVIDIA AI PRODUCT AGREEMENT `__. By pulling and using the container, you accept the terms and conditions of this license. \ No newline at end of file From 64c2812a2537f29e7e6a62780207f2749ec17ed1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 21:37:37 -0700 Subject: [PATCH 41/47] Guard cuda memory allocator update (#9312) (#9313) * Guard cuda memory allocator update * Apply isort and black reformatting --------- Signed-off-by: smajumdar Signed-off-by: titu1994 Co-authored-by: Somshubra Majumdar --- nemo/collections/common/data/lhotse/dataloader.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 32bbc1f3e8f4..01bf51b0e2c6 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -531,7 +531,13 @@ def maybe_set_cuda_expandable_segments(enabled: bool): warnings.warn( "You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration." ) - torch.cuda.memory._set_allocator_settings("expandable_segments:True") + + try: + torch.cuda.memory._set_allocator_settings("expandable_segments:True") + except RuntimeError: + logging.info( + "Failed to set expandable_segments:True for PyTorch CUDA allocator. You may get training speed improvements if you enable this" + ) def _select_channel(cut, channel_selector: int | str) -> list: From 28ccec727cb76ba14d7a55061c290906a7dc6664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Sat, 1 Jun 2024 00:40:41 -0400 Subject: [PATCH 42/47] Prompt formatter API and canary transcribe tensor input support (#9206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Apply CanaryPromptFormatter in dataset/inference Signed-off-by: Piotr Żelasko * Working inference with CanaryPromptFormatter Signed-off-by: Piotr Żelasko * Minimum working example of Canary.transcribe() with tensors Signed-off-by: Piotr Żelasko * training fix Signed-off-by: Piotr Żelasko * Update to the new 'chat' based prompt formatting API Signed-off-by: Piotr Żelasko * Prompt formatters for popular models and partial unit test coverage Signed-off-by: Piotr Żelasko * Updated documentation Signed-off-by: Piotr Żelasko * Improved test coverage + proper preamble support Signed-off-by: Piotr Żelasko * Fix usage of PromptFormatter for MT-AED class + fix tokenization/formatting issues Signed-off-by: Piotr Żelasko * Move some canary hacks to canary prompt formatter, improve validation, add tests for aggtok Signed-off-by: Piotr Żelasko * aed_model.transcribe(**slots) support, rename all slots to lowercase and drop pipes everywhere except template definition. 
Signed-off-by: Piotr Żelasko * truly generic version Signed-off-by: Piotr Żelasko * making transcribe_speech.py work prompt slots + syntactic sugar Signed-off-by: Piotr Żelasko * update streaming_utils.py Signed-off-by: Piotr Żelasko * fix Signed-off-by: Piotr Żelasko * code review: partial Signed-off-by: Piotr Żelasko * Accept multi-turn, single-turn, and legacy prompt format in transcribe() and transcribe_speech.py Signed-off-by: Piotr Żelasko * Address code reviews Signed-off-by: Piotr Żelasko * Add support for SPE special tokens bos/eos in prompt templates and ensure Llama2 format gives identical results with the reference implementation Signed-off-by: Piotr Żelasko * Fix tests and add llama2 prompt formatter tests Signed-off-by: Piotr Żelasko * Fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- examples/asr/transcribe_speech.py | 13 +- nemo/collections/asr/data/audio_to_text.py | 45 +-- .../asr/data/audio_to_text_lhotse_prompted.py | 158 +++----- .../asr/models/aed_multitask_models.py | 172 +++++++-- .../asr/parts/mixins/transcription.py | 8 +- .../asr/parts/utils/streaming_utils.py | 66 +++- nemo/collections/common/prompts/__init__.py | 0 nemo/collections/common/prompts/canary.py | 71 ++++ nemo/collections/common/prompts/example.py | 36 ++ nemo/collections/common/prompts/formatter.py | 347 ++++++++++++++++++ nemo/collections/common/prompts/gemma.py | 29 ++ nemo/collections/common/prompts/llama.py | 72 ++++ nemo/collections/common/prompts/mistral.py | 33 ++ nemo/collections/common/prompts/phi2.py | 62 ++++ .../common/tokenizers/aggregate_tokenizer.py | 9 +- .../common/tokenizers/canary_tokenizer.py | 49 ++- .../tokenizers/sentencepiece_tokenizer.py | 3 +- tests/collections/__init__.py | 0 .../asr/test_asr_multitask_model_bpe.py | 25 +- .../collections/asr/test_custom_tokenizer.py | 12 +- .../common/prompt_formatters/conftest.py | 51 +++ .../test_canary_prompt_formatter.py | 50 +++ .../test_gemma_prompt_formatter.py | 40 ++ .../test_llama2_prompt_formatter.py | 63 ++++ .../test_mistral_prompt_formatter.py | 32 ++ .../test_prompt_formatter_api.py | 147 ++++++++ 26 files changed, 1382 insertions(+), 211 deletions(-) create mode 100644 nemo/collections/common/prompts/__init__.py create mode 100644 nemo/collections/common/prompts/canary.py create mode 100644 nemo/collections/common/prompts/example.py create mode 100644 nemo/collections/common/prompts/formatter.py create mode 100644 nemo/collections/common/prompts/gemma.py create mode 100644 nemo/collections/common/prompts/llama.py create mode 100644 nemo/collections/common/prompts/mistral.py create mode 100644 nemo/collections/common/prompts/phi2.py create mode 100644 tests/collections/__init__.py create mode 100644 tests/collections/common/prompt_formatters/conftest.py create mode 100644 tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py create mode 100644 tests/collections/common/prompt_formatters/test_prompt_formatter_api.py diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 1763c2035805..b63e9db5fef1 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -16,7 +16,7 @@ import glob import json import os -from dataclasses import 
dataclass, is_dataclass +from dataclasses import dataclass, field, is_dataclass from tempfile import NamedTemporaryFile from typing import List, Optional, Union @@ -25,6 +25,7 @@ from omegaconf import OmegaConf, open_dict from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel +from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig @@ -169,6 +170,14 @@ class TranscriptionConfig: # Decoding strategy for AED models multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig() + # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs: + # Implicit single-turn assuming default role='user' (works with Canary-1B) + # +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes + # Explicit single-turn prompt: + # +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es +prompt.slots.task=s2t_translation +prompt.slots.pnc=yes + # Explicit multi-turn prompt: + # +prompt.turns='[{role:user,slots:{source_lang:en,target_lang:es,task:asr,pnc:yes}}]' + prompt: dict = field(default_factory=dict) # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models decoder_type: Optional[str] = None @@ -411,6 +420,8 @@ def autocast(dtype=None): override_cfg.augmentor = augmentor override_cfg.text_field = cfg.gt_text_attr_name override_cfg.lang_field = cfg.gt_lang_attr_name + if hasattr(override_cfg, "prompt"): + override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt)) transcriptions = asr_model.transcribe( audio=filepaths, override_config=override_cfg, diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 00c15109b64f..e0bb63ad18cd 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -75,7 +75,9 @@ def _speech_collate_fn(batch, pad_id): has_audio = audio_lengths[0] is not None if has_audio: max_audio_len = max(audio_lengths).item() - max_tokens_len = max(tokens_lengths).item() + has_tokens = tokens_lengths[0] is not None + if has_tokens: + max_tokens_len = max(tokens_lengths).item() audio_signal, tokens = [], [] for b in batch: @@ -89,19 +91,24 @@ def _speech_collate_fn(batch, pad_id): pad = (0, max_audio_len - sig_len) sig = torch.nn.functional.pad(sig, pad) audio_signal.append(sig) - tokens_i_len = tokens_i_len.item() - if tokens_i_len < max_tokens_len: - pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) - tokens.append(tokens_i) + if has_tokens: + tokens_i_len = tokens_i_len.item() + if tokens_i_len < max_tokens_len: + pad = (0, max_tokens_len - tokens_i_len) + tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) + tokens.append(tokens_i) if has_audio: audio_signal = torch.stack(audio_signal) audio_lengths = torch.stack(audio_lengths) else: audio_signal, audio_lengths = None, None - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) + if has_tokens: + tokens = torch.stack(tokens) + tokens_lengths = torch.stack(tokens_lengths) + else: + tokens = None + tokens_lengths = None if sample_ids is None: return audio_signal, 
audio_lengths, tokens, tokens_lengths else: @@ -256,8 +263,7 @@ def cache_datastore_manifests( if num_datastore_manifests > 0: # Local utility function def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): - """Cache manifests and audio data from object store. - """ + """Cache manifests and audio data from object store.""" # Determine the number of workers to use if num_workers is None: num_workers = os.cpu_count() - 1 @@ -421,8 +427,7 @@ class _AudioTextDataset(Dataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -546,8 +551,7 @@ class AudioToCharDataset(_AudioTextDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -640,8 +644,7 @@ class AudioToBPEDataset(_AudioTextDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), @@ -910,8 +913,7 @@ def __next__(self): return TarredAudioFilter(self.manifest_processor.collection) def _loop_offsets(self, iterator): - """This function is used to iterate through utterances with different offsets for each file. - """ + """This function is used to iterate through utterances with different offsets for each file.""" class TarredAudioLoopOffsets: def __init__(self, collection): @@ -944,8 +946,7 @@ def _collate_fn(self, batch): return _speech_collate_fn(batch, self.pad_id) def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. - """ + """Builds the training sample by combining the data from the WebDataset with the manifest info.""" audio_bytes, audio_filename, offset_id = tup # Grab manifest entry from self.manifest_preprocessor.collection @@ -1316,7 +1317,9 @@ class BucketingDataset(IterableDataset): """ def __init__( - self, dataset: IterableDataset, bucketing_batch_size: int, + self, + dataset: IterableDataset, + bucketing_batch_size: int, ): self.wrapped_dataset = dataset self.bucketing_batch_size = bucketing_batch_size diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index 000b1a8f0839..e9e97d3d32d7 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -13,7 +13,6 @@ # limitations under the License. 
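# --- Illustrative aside (not part of the patch): the collate change above allows a batch to
# --- carry audio without token targets. A minimal, self-contained sketch of the same
# --- pad-and-stack behavior, assuming 1-D float audio signals:
import torch
import torch.nn.functional as F

def pad_and_stack(signals, pad_value=0.0):
    # Right-pad every signal to the longest one in the batch, then stack into (B, T).
    max_len = max(s.size(0) for s in signals)
    return torch.stack([F.pad(s, (0, max_len - s.size(0)), value=pad_value) for s in signals])

audio = pad_and_stack([torch.randn(16000), torch.randn(12000)])  # -> shape (2, 16000)
tokens, tokens_lengths = None, None  # token-less batches (e.g. audio-only inference) are now allowed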
from typing import Callable, Sequence -import omegaconf import torch.utils.data from lhotse import CutSet from lhotse.cut import MixedCut, MonoCut @@ -21,7 +20,9 @@ from lhotse.dataset.collation import collate_vectors from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper +from nemo.collections.common.prompts.canary import CanaryPromptFormatter from nemo.collections.common.tokenizers import CanaryTokenizer, TokenizerSpec +from nemo.collections.common.tokenizers.canary_tokenizer import CANARY_SPECIAL_TOKENIZER class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): @@ -57,21 +58,21 @@ def __init__( def __getitem__(self, cuts: CutSet) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: audio, audio_lens, cuts = self.load_audio(cuts) - tokens, prompt_tokens = self.prompt_format_fn(cuts, self.tokenizer, inference=self.inference) + prompts_with_answers, prompts = self.prompt_format_fn(cuts, self.tokenizer, inference=self.inference) - tokens = [torch.as_tensor(t) for t in tokens] - token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) - tokens = collate_vectors(tokens, padding_value=self.padding_value) + prompts_with_answers = [torch.as_tensor(t) for t in prompts_with_answers] + prompts_with_answers_lens = torch.tensor([t.size(0) for t in prompts_with_answers], dtype=torch.long) + prompts_with_answers = collate_vectors(prompts_with_answers, padding_value=self.padding_value) if self.inference: - prompt_tokens = [torch.as_tensor(t) for t in prompt_tokens] - prompt_token_lens = torch.tensor([t.size(0) for t in prompt_tokens], dtype=torch.long) - prompt_tokens = collate_vectors(prompt_tokens, padding_value=self.padding_value) + prompts = [torch.as_tensor(t) for t in prompts] + prompts_lens = torch.tensor([t.size(0) for t in prompts], dtype=torch.long) + prompts = collate_vectors(prompts, padding_value=self.padding_value) else: - prompt_tokens = None - prompt_token_lens = None + prompts = None + prompts_lens = None - return audio, audio_lens, tokens, token_lens, prompt_tokens, prompt_token_lens + return audio, audio_lens, prompts_with_answers, prompts_with_answers_lens, prompts, prompts_lens # Mapping from a string name to a known prompt formatter function. @@ -105,7 +106,9 @@ def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper, bool] @registered_prompt_format_fn -def canary(cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False) -> Sequence[Sequence[int]]: +def canary( + cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: """ Prepend and append control tokens to the token sequence as per Canary format. @@ -132,116 +135,53 @@ def canary(cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False) - assert isinstance( tokenizer._tokenizer, CanaryTokenizer ), "To use 'canary' prompt format, you must use the CanaryTokenizer." - tokenizer = tokenizer._tokenizer + formatter = CanaryPromptFormatter(tokenizer._tokenizer) - tokens, prompts = [], [] + prompts_with_answers, prompts = [], [] for cut in cuts: if isinstance(cut, MixedCut): cut = cut._first_non_padding_cut - assert isinstance(cut, MonoCut), "Expected MonoCut." 
+ if not isinstance(cut, MonoCut): + raise TypeError( + f"Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: {cut=})" + ) # first, validate the utterance - missing_keys = [k for k in ("source_lang", "target_lang", "taskname", "pnc") if k not in cut.custom] + expected_slots = set(formatter.get_slots("user")) + missing_keys = expected_slots - set(cut.custom) + if "task" in missing_keys and "taskname" in cut.custom: + # Compatibility with "old" Canary manifest format. + # For compatbility with inference options, this slot is now called "task". + cut.custom["task"] = cut.custom["taskname"] + missing_keys.remove("task") if missing_keys: raise RuntimeError( f"We found cut with ID {cut.id} that is missing the following keys: {missing_keys}" f"Please ensure that every utterance in the input manifests contains these keys." ) - # Actual tokenization. If a cut has multiple supervisions, we'll stitch their tokenized texts together. - texts = [sup.text for sup in cut.supervisions] - langs = [sup.language for sup in cut.supervisions] - taskname = cut.custom['taskname'] - pnc = cut.custom['pnc'] - source_lang = cut.custom['source_lang'] - target_lang = cut.custom['target_lang'] - - tokens.append(canary_prompt(tokenizer, texts, langs, source_lang, target_lang, taskname, pnc)) - if inference: - prompts.append(canary_prompt(tokenizer, None, None, source_lang, target_lang, taskname, pnc)) - return tokens, prompts - - -def canary_prompt( - tokenizer: CanaryTokenizer, - text: str | list[str] | None, - language: str | list[str] | None, - source_language: str, - target_language: str, - taskname: str, - pnc: str, -) -> list[int]: - if isinstance(text, str): - text = [text] - if isinstance(language, str): - language = [language] - - if text is not None: - try: - tokens = sum((tokenizer.text_to_ids(text_, lang_) for text_, lang_ in zip(text, language)), start=[]) - except omegaconf.errors.KeyValidationError as e: - raise ProbablyIncorrectLanguageKeyError( - "We couldn't select the right tokenizer, which could be due to issues with reading " - "the language from the manifest. " - "If you're training, try setting lang_field='' to a different value (probably 'target_lang' or 'lang'). " - "If you're using model.transcribe() directly, please use override_config kwarg to set this. " - "If you're using transcribe_speech.py, use option gt_lang_attr_name='...' " - ) from e - else: - tokens = None # create prompt for inference - - # bos - prompted_tokens = [tokenizer.bos_id] - - if tokens is not None and len(tokens) == 0: - # no speech token - prompted_tokens.append(tokenizer.nospeech_id) - else: - # first, validate the utterance - if source_language is None or target_language is None or taskname is None or pnc is None: - raise RuntimeError( - f"Missing keys provided to prompt: " - f"source_langauge={source_language},\n" - f"target_language={target_language},\n" - f"taskname={taskname},\n" - f"pnc={pnc}\n" - f"Please ensure that every utterance in the input manifests contains these keys." 
- ) - - # src_lang_id/no_speech - src_lang_id = tokenizer.spl_token_to_id(source_language) - prompted_tokens.append(src_lang_id) - - # task - task = taskname - if task == 'asr' or task == "transcribe": - prompted_tokens.append(tokenizer.spl_token_to_id("transcribe")) - elif task == 's2t_translation' or task == 'ast' or task == "translate": - prompted_tokens.append(tokenizer.spl_token_to_id("translate")) - else: - raise ValueError(f"Unknown task: {task}") - - # tgt_lang_id - tgt_lang_id = tokenizer.spl_token_to_id(target_language) - prompted_tokens.append(tgt_lang_id) - - # PnC - pnc = f"{pnc}".lower().strip() # to account for bool or str - if pnc in {'yes', 'true'}: - prompted_tokens.append(tokenizer.spl_token_to_id("pnc")) - elif pnc in {'no', 'false'}: - prompted_tokens.append(tokenizer.spl_token_to_id("nopnc")) - else: - raise ValueError(f"Unknown value for key 'pnc': {pnc}") - - # text (only in training) - if tokens is not None: - prompted_tokens.extend(tokens) + encoded = formatter.encode_dialog( + turns=[ + dict( + role="user", + slots={ + **{slot: cut.custom[slot] for slot in expected_slots}, + formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER, + }, + ), + dict( + role="assistant", + slots={ + "text": ' '.join(s.text for s in cut.supervisions), + formatter.PROMPT_LANGUAGE_SLOT: cut.custom["target_lang"], + }, + ), + ] + ) + prompts_with_answers.append(encoded["input_ids"]) + prompts.append(encoded["context_ids"]) - # eos (only in training) - if tokens is not None: - prompted_tokens.append(tokenizer.eos_id) - return prompted_tokens + return prompts_with_answers, prompts class ProbablyIncorrectLanguageKeyError(RuntimeError): diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index b11d744a7e6a..880f8bb3a004 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import warnings from dataclasses import dataclass, field from math import ceil from typing import Any, Dict, List, Optional, Union @@ -45,6 +46,7 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path +from nemo.collections.common.prompts.formatter import PromptFormatter from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, @@ -100,10 +102,7 @@ class MultiTaskTranscriptionConfig(TranscribeConfig): Configuration for Multi Task Transcription """ - task: Optional[str] = None - pnc: Optional[bool] = None - source_lang: Optional[str] = None - target_lang: Optional[str] = None + prompt: list[dict[str, dict[str, str]]] | None = None text_field: str = "answer" lang_field: str = "target_lang" @@ -112,10 +111,7 @@ class MultiTaskTranscriptionConfig(TranscribeConfig): ) def __post_init__(self): - required_fields = ['task', 'pnc', 'source_lang', 'target_lang', 'text_field', 'lang_field'] - for field in required_fields: - if not hasattr(self, field): - raise ValueError(f"`{field}` must be present in the transcription config: {self}") + self.prompt = parse_multitask_prompt(self.prompt) class EncDecMultiTaskModel(ASRModel, ExportableEncDecModel, ASRBPEMixin, ASRTranscriptionMixin): @@ -134,6 +130,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) + prompt_cls = PromptFormatter.resolve(self.prompt_format) + self.prompt = prompt_cls( + tokenizer=self.tokenizer, + defaults=OmegaConf.to_container(cfg.get("prompt_defaults")), + ) + # Setup audio preprocessor self.preprocessor = EncDecMultiTaskModel.from_config_dict(self.cfg.preprocessor) # Setup audio encoder @@ -391,15 +393,12 @@ def transcribe( audio: Union[str, List[str], np.ndarray, DataLoader], batch_size: int = 4, return_hypotheses: bool = False, - task: Optional[str] = None, - pnc: Optional[bool] = None, - source_lang: Optional[str] = None, - target_lang: Optional[str] = None, num_workers: int = 0, channel_selector: Optional[ChannelSelectorType] = None, augmentor: DictConfig = None, verbose: bool = True, override_config: Optional[MultiTaskTranscriptionConfig] = None, + **prompt, ) -> Union[List[str], List[Hypothesis]]: """ Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. @@ -412,15 +411,12 @@ def transcribe( Bigger will result in better throughput performance but would use more memory. return_hypotheses: (bool) Either return hypotheses or text With hypotheses can do some postprocessing like getting timestamp or rescoring - task: (str) task name. Defaults to `asr`. - pnc: (bool) whether to apply punctuation and capitalization or not. Defaults to True. - source_lang: (str) source language. Defaults to `en`. - target_lang: (str) target language. Defaults to `en`. num_workers: (int) number of workers for DataLoader channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. verbose: (bool) whether to display tqdm progress bar override_config: (Optional[MultiTaskTranscriptionConfig]) A config to override the default config. 
+ **prompt: Optional input to construct the prompts for the model. Accepted formats are: 1) legacy Canary-1B API source_lang=, target_lang=, etc. 2) explicit single-turn role=, slots={: , ...} 3) explicit multi-turn: turns=[{"role": , "slots": {: , ...}}] Returns: A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files @@ -433,10 +429,7 @@ def transcribe( channel_selector=channel_selector, augmentor=augmentor, verbose=verbose, - task=task, - pnc=pnc, - source_lang=source_lang, - target_lang=target_lang, + prompt=prompt, ) else: if not isinstance(override_config, MultiTaskTranscriptionConfig): @@ -738,9 +731,6 @@ def _transcribe_on_begin(self, audio, trcfg: MultiTaskTranscriptionConfig): if hasattr(trcfg, '_internal') and hasattr(trcfg._internal, 'manifest_path'): trcfg._internal.manifest_filepath = manifest_path - elif isinstance(audio, (np.ndarray, torch.Tensor)): - raise NotImplementedError("Transcribing from numpy or torch tensors is not supported yet.") - def _transcribe_input_manifest_processing( self, audio_files: List[str], temp_dir: str, trcfg: MultiTaskTranscriptionConfig ) -> Dict[str, Any]: @@ -792,7 +782,47 @@ def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=batch[0], input_signal_length=batch[1] ) - decoder_input_ids = batch[-2].to(trcfg._internal.device) + if len(batch) == 6: + # Prompt provided by the dataloader. + decoder_input_ids = batch[4] + else: + # The dataloader provided only audio + audio_lens, so we + # are constructing the prompt dynamically using TranscribeConfig. + + # Now ask the prompt formatter about which slots are required. + # It will return a default prompt structure with default slot values (if available, None otherwise). + # We iterate over that structure and update slot values based on ``trcfg.prompt``. + default_turns = self.prompt.get_default_dialog_slots() + if not trcfg.prompt: + # No turns were provided, use defaults. + turns = default_turns + else: + # Turns were provided, iterate over them and fill missing slot values using defaults.. + turns = trcfg.prompt.copy() # shallow copy #1: don't override the config + for turn in turns: + role = turn["role"] + # Check if we have defaults for this role. + # There shouldn't be more than a single turn for a given role, but if there are, + # we'll emit a warning. + if default_turns_for_role := [t for t in default_turns if t["role"] == role]: + if len(default_turns_for_role) > 1: + warnings.warn( + f"More than one default turn detected for {role=}. " + f"We'll be using default slot values for the first turn of {role=} only." + ) + default_slots = default_turns_for_role[0]["slots"] + turn["slots"] = turn["slots"].copy() # shallow copy #1: don't override the config + # fill missing slots using defaults + for slot, val in default_slots.items(): + if turn["slots"].get(slot) is None: + turn["slots"][slot] = val + + decoder_input_ids = ( + self.prompt.encode_dialog(turns=turns)["context_ids"] + .unsqueeze(0) + .repeat(batch[0].shape[0], 1) + .to(trcfg._internal.device) + ) output = dict( log_probs=log_probs, encoded_lengths=encoded_len, @@ -906,6 +936,8 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult Returns: A list of dictionaries with the audio file paths fixed. 
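        For illustration (a sketch with a hypothetical path): with an empty ``trcfg.prompt`` and the
        default ``text_field='answer'``, a bare string item such as ``"/data/utt1.wav"`` is expanded
        roughly to::

            {'audio_filepath': '/data/utt1.wav', 'duration': 100000, 'answer': 'nothing',
             'source_lang': 'en', 'target_lang': 'en', 'taskname': 'asr', 'pnc': 'yes'}

        The slot values come from the user turn configured in ``trcfg.prompt`` when one is present.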
""" + # This method is a legacy helper for Canary that checks whether prompt slot values were provided + # in the input manifest and if not, it injects the defaults. out_json_items = [] for item in json_items: if isinstance(item, str): @@ -913,28 +945,21 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult entry = { 'audio_filepath': item, 'duration': 100000, - 'source_lang': 'en' if trcfg.source_lang is None else trcfg.source_lang, - 'taskname': 'asr' if trcfg.task is None else trcfg.task, - 'target_lang': 'en' if trcfg.target_lang is None else trcfg.target_lang, - 'pnc': 'yes' if trcfg.pnc is None else 'yes' if trcfg.pnc else 'no', trcfg.text_field: 'nothing', } elif isinstance(item, dict): entry = item entry['audio_filepath'] = get_full_path(entry['audio_filepath'], manifest_file=manifest_path) - - if 'source_lang' not in entry: - entry['source_lang'] = 'en' if trcfg.source_lang is None else trcfg.source_lang - if 'taskname' not in entry: - entry['taskname'] = 'asr' if trcfg.task is None else trcfg.task - if 'target_lang' not in entry: - entry['target_lang'] = 'en' if trcfg.target_lang is None else trcfg.target_lang - if 'pnc' not in entry: - entry['pnc'] = 'yes' if trcfg.pnc is None else 'yes' if trcfg.pnc else 'no' if trcfg.text_field not in entry: entry[trcfg.text_field] = 'nothing' else: raise ValueError(f"Expected str or dict, got {type(item)}") + default_turn = [t for t in trcfg.prompt if t["role"] == "user"] + default_turn = default_turn[0]["slots"] if default_turn else {} + for k, dv in (("source_lang", "en"), ("target_lang", "en"), ("taskname", "asr"), ("pnc", "yes")): + if k not in entry: + # last-chance fallback injecting legacy Canary defaults if none were provided. + entry[k] = default_turn.get(k, dv) out_json_items.append(entry) return out_json_items @@ -977,3 +1002,76 @@ def predict_step(self, batch, batch_idx=0, dataloader_idx=0, has_processed_signa text = [self.decoding.strip_special_tokens(t) for t in text] return text + + +def parse_multitask_prompt(prompt: dict | None) -> list[dict]: + if prompt is None or not prompt: + return [] + + # Case 1. + # Multi-turn prompting format. This format conforms to PromptFormatter API and needs no further modification. + # This format allows to condition the model on chat history, system+user prompts, etc. + # Example: + # model.transcribe( + # audio, + # turns=[ + # dict( + # role="user", + # slots=dict( + # source_lang='en', target_lang='de', task='asr', pnc=True, context='translate this text' + # ), + # ), + # dict( + # role="assistant", + # slots=dict(message="Calculating the translation of given text. Do you want to proceed?"), + # ), + # dict( + # role="user", + # slots=dict( + # source_lang='en', target_lang='de', task='asr', pnc=True, context='Yes, please proceed.' + # ), + # ), + # ], + # ) + if 'turns' in prompt: + assert ( + len(prompt) == 1 + and isinstance(prompt["turns"], list) + and all(isinstance(t, dict) and "role" in t and "slots" in t for t in prompt["turns"]) + ), ( + f"When providing a multi-turn prompt through 'turns', no other keys are allowed " + f"and the value under prompt['turns'] must be a list of dicts with roles and slot values " + f"(we received {prompt=})" + ) + return prompt["turns"] + + values_are_dicts = any(isinstance(v, dict) for k, v in prompt.items() if k != "slots") + assert not values_are_dicts, ( + f"We don't support dict values for prompt keys other than 'slots'. " f"We received {prompt=}" + ) + + # Case 2. 
+ # Single-turn prompting format with explicitly provided role and slot names and values. + # We create a 1-item multi-turn prompt from this input. + # Example: + # model.transcribe( + # audio, + # role="user", + # slots=dict(source_lang='en', target_lang='de', task='asr', pnc=True, context='translate this text'), + # ) + if "role" in prompt and "slots" in prompt: + assert isinstance(prompt["slots"], dict), ( + f"When providing a single-turn prompt through 'role', 'slots' must also be provided " + f"(we received {prompt=})." + ) + return [prompt] + + # Case 3. + # Legacy prompting format for Canary-1B preserved for backward compatibility. + # Extra fields are converted to a single-turn prompt with role "user" (unless overridden with 'role'). + # Example: + # model.transcribe( + # audio, pnc=True, source_lang='en', target_lang='de', task='asr', context='translate this text' + # ) + role = prompt.pop("role", "user") + return [dict(role=role, slots=prompt)] diff --git a/nemo/collections/asr/parts/mixins/transcription.py b/nemo/collections/asr/parts/mixins/transcription.py index df8d6bac50a9..261e97a225dd 100644 --- a/nemo/collections/asr/parts/mixins/transcription.py +++ b/nemo/collections/asr/parts/mixins/transcription.py @@ -148,11 +148,9 @@ def get_item(self, index): # Calculate seq length seq_len = torch.tensor(samples.shape[0], dtype=torch.long) - # Dummy text tokens - text_tokens = torch.tensor([0], dtype=torch.long) - text_tokens_len = torch.tensor(1, dtype=torch.long) - - return (samples, seq_len, text_tokens, text_tokens_len) + # Typically NeMo ASR models expect the mini-batch to be a 4-tuple of (audio, audio_len, text, text_len). + # For inference, we set text and text_len to None to not disrupt the shape of the tuple. + return samples, seq_len, None, None class TranscriptionMixin(ABC): diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 71c945b66255..51a46184e66f 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -21,7 +21,6 @@ from omegaconf import OmegaConf from torch.utils.data import DataLoader -from nemo.collections.asr.data.audio_to_text_lhotse_prompted import canary_prompt from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.preprocessing.features import normalize_batch @@ -444,7 +443,10 @@ def _convert_buffer_to_features(self): device = self.asr_model.device audio_signal = samples.unsqueeze_(0).to(device) audio_signal_len = torch.Tensor([samples.shape[1]]).to(device) - features, features_len = self.raw_preprocessor(input_signal=audio_signal, length=audio_signal_len,) + features, features_len = self.raw_preprocessor( + input_signal=audio_signal, + length=audio_signal_len, + ) features = features.squeeze() self._update_feature_buffer(features[:, -self.feature_chunk_len :]) @@ -479,7 +481,10 @@ def __init__(self, samples, frame_len, preprocessor, device, pad_to_frame_len=Tr self._feature_frame_len = frame_len / timestep_duration audio_signal = torch.from_numpy(self._samples).unsqueeze_(0).to(device) audio_signal_len = torch.Tensor([self._samples.shape[0]]).to(device) - self._features, self._features_len = preprocessor(input_signal=audio_signal, length=audio_signal_len,) + self._features, self._features_len = preprocessor( + input_signal=audio_signal, + length=audio_signal_len, + ) self._features = 
self._features.squeeze() def __iter__(self): @@ -701,7 +706,12 @@ class for streaming frame-based ASR use reset() method to reset FrameASR's """ def __init__( - self, asr_model, frame_len=1.6, total_buffer=4.0, batch_size=4, pad_to_buffer_len=True, + self, + asr_model, + frame_len=1.6, + total_buffer=4.0, + batch_size=4, + pad_to_buffer_len=True, ): ''' Args: @@ -1183,7 +1193,9 @@ def _get_batch_preds(self): del best_hyp, pred def transcribe( - self, tokens_per_chunk: int, delay: int, + self, + tokens_per_chunk: int, + delay: int, ): """ Performs "middle token" alignment prediction using the buffered audio chunk. @@ -1210,7 +1222,12 @@ def transcribe( ids, toks = self._alignment_decoder(alignment, self.asr_model.tokenizer, self.blank_id) if len(ids) > 0 and a_idx < signal_end_idx: - self.unmerged[idx] = inplace_buffer_merge(self.unmerged[idx], ids, delay, model=self.asr_model,) + self.unmerged[idx] = inplace_buffer_merge( + self.unmerged[idx], + ids, + delay, + model=self.asr_model, + ) output = [] for idx in range(self.batch_size): @@ -1276,7 +1293,9 @@ def __init__( self.alignment_basepath = alignment_basepath def transcribe( - self, tokens_per_chunk: int, delay: int, + self, + tokens_per_chunk: int, + delay: int, ): if self.lcs_delay < 0: raise ValueError( @@ -1302,7 +1321,10 @@ def transcribe( if len(ids) > 0: self.unmerged[idx] = inplace_buffer_merge( - self.unmerged[idx], ids, delay, model=self.asr_model, + self.unmerged[idx], + ids, + delay, + model=self.asr_model, ) else: @@ -1588,15 +1610,17 @@ def get_input_tokens(self, sample: dict): f"We found sample that is missing the following keys: {missing_keys}" f"Please ensure that every utterance in the input manifests contains these keys. Sample: {sample}" ) - tokens = canary_prompt( - tokenizer=self.asr_model.tokenizer, - text=None, - language=None, - source_language=sample['source_lang'], - target_language=sample['target_lang'], - taskname=sample['taskname'], - pnc=sample['pnc'], - ) + tokens = self.asr_model.prompt.encode_dialog( + turns=[ + { + "role": "user", + "slots": { + **sample, + self.asr_model.prompt.PROMPT_LANGUAGE_SLOT: "spl_tokens", + }, + } + ] + )["context_ids"] else: raise ValueError(f"Unknown prompt format: {self.asr_model.prompt_format}") return torch.tensor(tokens, dtype=torch.long, device=self.asr_model.device).unsqueeze(0) # [1, T] @@ -1712,12 +1736,16 @@ def _get_batch_preds(self, keep_logits=False): encoded, encoded_len = results log_probs = self.asr_model.ctc_decoder(encoder_output=encoded) transcribed_texts, _ = self.asr_model.ctc_decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + decoder_outputs=log_probs, + decoder_lengths=encoded_len, + return_hypotheses=False, ) else: log_probs, encoded_len, predictions = results transcribed_texts, _ = self.asr_model.decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + decoder_outputs=log_probs, + decoder_lengths=encoded_len, + return_hypotheses=False, ) self.all_preds.extend(transcribed_texts) diff --git a/nemo/collections/common/prompts/__init__.py b/nemo/collections/common/prompts/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py new file mode 100644 index 000000000000..aadc976ba474 --- /dev/null +++ b/nemo/collections/common/prompts/canary.py @@ -0,0 +1,71 @@ +from nemo.collections.common.prompts.formatter 
import Modality, PromptFormatter +from nemo.collections.common.tokenizers.canary_tokenizer import ( + CANARY_BOS, + CANARY_EOS, + CANARY_NOPNC, + CANARY_PNC, + CANARY_SPECIAL_TOKENIZER, +) + + +class CanaryPromptFormatter(PromptFormatter): + NAME = "canary" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"{CANARY_BOS}|source_lang||task||target_lang||pnc|", + "slots": { + "source_lang": Modality.Text, + "task": Modality.Text, + "target_lang": Modality.Text, + "pnc": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|text|{CANARY_EOS}", + "slots": { + "text": Modality.Text, + }, + }, + } + + def encode_turn(self, prompt_template: str, expected_slots: dict, slot_values: dict) -> list[int]: + # This method handles a level of indirection for Canary. + # It maps values provided in trcfg to the actual special tokens + # expected to be present in canary prompt. + # It used to be done in prompt_format_fn inside Dataset class corresponding to Canary, + # but we are not using it here anymore. + # This maps things such as '|task|: "asr"' to '|TASK|: "<|transcribe|>"'. + slot_values = map_manifest_values_to_special_tokens(slot_values) + return super().encode_turn( + prompt_template=prompt_template, expected_slots=expected_slots, slot_values=slot_values + ) + + +def map_manifest_values_to_special_tokens(slot_values: dict[str, str]) -> dict[str, str]: + slot_values = slot_values.copy() + + any_special_token_present = False + + for k in ("source_lang", "target_lang"): + if k in slot_values and not ((v := slot_values[k]).startswith("<|") and v.endswith("|>")): + slot_values[k] = "<|" + slot_values[k] + "|>" + any_special_token_present = True + + k = "pnc" + if k in slot_values and slot_values[k] not in (CANARY_PNC, CANARY_NOPNC): + slot_values[k] = CANARY_PNC if slot_values[k] in ("yes", "1", "True", "true") else CANARY_NOPNC + any_special_token_present = True + + # Note: we re-map 'taskname' to 'task' for compatibility with earlier versions of Canary training. + for k in ("task", "taskname"): + if k in slot_values and slot_values[k] not in ("<|transcribe|>", "<|translate|>"): + slot_values["task"] = "<|transcribe|>" if slot_values[k] == "asr" else "<|translate|>" + any_special_token_present = True + + # Auto-inject which tokenizer to look up in CanaryTokenizer if not provided, + # and slots for this turn correspond to user prompt. + if any_special_token_present and PromptFormatter.PROMPT_LANGUAGE_SLOT not in slot_values: + slot_values[PromptFormatter.PROMPT_LANGUAGE_SLOT] = CANARY_SPECIAL_TOKENIZER + + return slot_values diff --git a/nemo/collections/common/prompts/example.py b/nemo/collections/common/prompts/example.py new file mode 100644 index 000000000000..3589efb938f4 --- /dev/null +++ b/nemo/collections/common/prompts/example.py @@ -0,0 +1,36 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/phi-2#phi-2-usage +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + + +class ExamplePromptFormatter(PromptFormatter): + """ + The simplest possible prompt formatter implementation. + + It defines a dialog of the form: + + User: Hi. + Assistant: Hi, how can I help you? + User: What's the time? + Assistant: It's 9 o'clock. 
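    A rough usage sketch (``tokenizer`` is assumed to be any ``TokenizerSpec`` instance)::

        >>> formatter = ExamplePromptFormatter(tokenizer)
        ... encoded = formatter.encode_dialog(
        ...     turns=[
        ...         {"role": "user", "slots": {"message": "What's the time?"}},
        ...         {"role": "assistant", "slots": {"message": "It's 9 o'clock."}},
        ...     ]
        ... )

    Because the last turn uses OUTPUT_ROLE, ``encoded`` contains ``input_ids``, ``context_ids``,
    ``answer_ids`` and ``mask``, as documented in :class:`PromptFormatter`.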
+ + """ + + NAME = "example_prompt_format" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"User: |message|\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"Assistant: |message|\n", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/formatter.py b/nemo/collections/common/prompts/formatter.py new file mode 100644 index 000000000000..524b2e62c5a3 --- /dev/null +++ b/nemo/collections/common/prompts/formatter.py @@ -0,0 +1,347 @@ +from abc import ABC +from enum import Enum +from functools import lru_cache +from typing import Any, Type + +import torch + +from nemo.collections.common.tokenizers import AggregateTokenizer, TokenizerSpec + +PREAMBLE_ROLE = "preamble" + +# Slots used to define when special tokens bos/eos should be inserted. +# These are special in the sense of how sentencepiece defines special tokens: +# They have to be specially inserted into the token sequence, and if they appear in the tokenized string, +# SPE wouldn't use the special token ids but rather tokenize them as if they were normal strings. +# We mimic SPE's behavior if these special slots are present in the template definition. +# To achieve that, insert |bos| / |eos| at the beginning/end of template. +# E.g., inserting only bos in llama2 user role: "template": "|bos|[INST] |message| [\INST]" +BOS_SLOT = "|bos|" +EOS_SLOT = "|eos|" + + +class Modality(Enum): + """ + Modalities supported as PromptFormatter slot values. + """ + + Text = "text" + + def matches(self, value: Any) -> bool: + """ + Checks if the provided value is compatible with an instance of Modality. + """ + match self: + case Modality.Text: + return isinstance(value, str) + case _: + return False + + +class PromptFormatter(ABC): + """ + :class:`~nemo.collections.common.prompts.formatter.PromptFormatter` is intended to simplify + working with various prompt format templates and encoding them into token ID tensors. + + It assumes a dialog-like structure, which is a list of turns, with each turn assigned to a role. + Sub-classes of PromptFormatter define turn templates for each role under TEMPLATE class attribute. + Each template may define some constant parts (e.g. begin-of-turn or end-of-turn tokens, whitespaces, etc.) + and variable parts which we call "slots", that will be provided by the user during training or inference. + + A role is typically "user" and "assistant", and some popular models also use a "system" role. + Other roles may be defined as well. We expect the role corresponding to the model's responses + will be registered under class attribute called OUTPUT_ROLE. + We reserve a special "preamble" role with no slots that will be inserted at the beginning of + the formatted prompt, if "preamble" is present in TEMPLATE. + + A turn is a dict with keys "role" and "slots", where "slots" are a dict that maps slot names + to values that should be filled in the template. + For example, a user role template may be ``"Question: |message|"`` and corresponding ``slots`` would then be + ``{"message": "What time is it?"}``. + + There is a special slot called ``|prompt_language|`` that's used to select the sub-tokenizer in + :class:`~nemo.collections.common.tokenizers.aggregate_tokenizer.AggregateTokenizer`. + It's only used when the tokenizer is aggregate; otherwise it's discarded. + + PromptFormatter supports constructing prompts for training (complete context and answers) + and for inference (context-only). 
+ Training/inference is determined automatically; if the last role in a dialog is the OUTPUT_ROLE, + that's an 'asked-and-answered' scenario, so we assume it's inteded for training. + We'll create a dict with tokenized results available under the following keys: + + * ``context_ids`` (all turns minus last one), + * ``answer_ids`` (last turn) + * ``input_ids`` (previous two values concatenated) + * ``mask`` (boolean mask tensor of the same lenth as ``input_ids`` that's set to True on OUTPUT_ROLE turns) + + Typically, the user will use the ``encode_dialog`` method providing a list of turns to it. + Example showing how to construct model inputs/outputs for training:: + + >>> formatter = PromptFormatter(tokenizer) + ... encoded_for_training = formatter.encode_dialog( + ... turns=[ + ... {"role": "user", "slots": {"message": "What time is it?"}}, + ... {"role": "assistant", "slots": {"message": "Ten o'clock."}}, + ... {"role": "user", "slots": {"message": "PM or AM?"}}, + ... {"role": "assistant", "slots": {"message": "AM, naturally! It's bright outside"}}, + ... ] + ... ) + + Another example that shows how to use the same method to generate prompts for inference:: + + + >>> formatter = PromptFormatter(tokenizer) + ... encoded_for_training = formatter.encode_dialog( + ... turns=[ + ... {"role": "user", "slots": {"message": "What time is it?"}}, + ... {"role": "assistant", "slots": {"message": "Ten o'clock."}}, + ... {"role": "user", "slots": {"message": "PM or AM?"}}, + ... ] + ... ) + + """ + + # Used to support AggregateTokenizer; this key selects the right sub-tokenizer for each turn. + PROMPT_LANGUAGE_SLOT = "prompt_language" + + # Subclasses will be registered under this name, to be used via PromptFormatter.resolve(name). + NAME = None + + # Template is a dict that maps: + # * from a role name string (system/user/assistant/etc) + # * to a dict with keys + # * "template" that has a string value (the prompt template) + # * "slots" that has a value of dict[str, Modality] + # * keys of slots are the names of formattable slots in the prompt template + # * values of slots are :class:`Modality` objects that can be used to check + # whether a specific value conforms to a given modality requirements + # (e.g., Modality.Text may expect string objects). + # Template is intended to be defined by the child classes. + TEMPLATE = None + + # Turns under this role indicate responses by the model; if the last turn in + # PromptFormatter.encode_dialog() ends with this role, it indicates a training example. + OUTPUT_ROLE = None + + # Internal reserved field. + _REGISTERED_FORMATTERS = {} + + def __init__(self, tokenizer: TokenizerSpec, defaults: list[dict] | None = None) -> None: + self.tokenizer = tokenizer + self._defaults = defaults if defaults is not None else [] + self._validate_defaults() + + def __init_subclass__(cls, **kwargs) -> None: + ERR = "PromptFormatter subclass definition error:" + if cls.__name__ not in cls._REGISTERED_FORMATTERS: + for attr in ("NAME", "TEMPLATE", "OUTPUT_ROLE"): + assert ( + getattr(cls, attr, None) is not None + ), f"{ERR} PromptFormatter subclass {cls} did not define a class attribute {attr}" + assert cls.NAME not in cls._REGISTERED_FORMATTERS, ( + f"Cannot register {cls.__name__} under {cls.NAME}: another prompt formatter of type " + f"{cls._REGISTERED_FORMATTERS[cls.NAME]} has already been registered under this name." 
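# For illustration, a subclass picked up by this registration hook could look like
# the sketch below (the class name and NAME value are hypothetical):
#
#     class MyPromptFormatter(PromptFormatter):
#         NAME = "my_format"
#         OUTPUT_ROLE = "assistant"
#         TEMPLATE = {
#             "user": {"template": "Q: |message|\n", "slots": {"message": Modality.Text}},
#             "assistant": {"template": "A: |message|\n", "slots": {"message": Modality.Text}},
#         }
#
# after which ``PromptFormatter.resolve("my_format")`` returns ``MyPromptFormatter``.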
+ ) + cls._REGISTERED_FORMATTERS[cls.NAME] = cls + if "preamble" in cls.TEMPLATE: + assert ( + len(cls.TEMPLATE["preamble"].get("slots", [])) == 0 + ), f"{ERR} Slots are not allowed for preamble template, but we found: '{cls.TEMPLATE['preamble']}'" + for role in cls.get_roles(): + template = cls.get_template(role) + for slot in cls.get_slots(role): + assert ( + _mangled(slot) in template + ), f"{ERR} Slot '{slot}' not found in template '{template}' for role '{role}'" + super().__init_subclass__(**kwargs) + + @classmethod + def resolve(cls, name: str) -> Type["PromptFormatter"]: + if name not in cls._REGISTERED_FORMATTERS: + raise RuntimeError( + f"Unknown prompt formatter: '{name}' (known formats: {', '.join(cls._REGISTERED_FORMATTERS.keys())})" + ) + return cls._REGISTERED_FORMATTERS[name] + + @classmethod + @lru_cache(1) + def get_roles(cls) -> list[str]: + return list(cls.TEMPLATE.keys()) + + @classmethod + def get_slots(cls, role: str) -> dict[str, Modality]: + # returns a copy to avoid accidential mutation of a global object by the user + return cls.TEMPLATE[role].get("slots", {}).copy() + + @classmethod + def get_template(cls, role: str) -> str: + return cls.TEMPLATE[role]["template"] + + def get_default_dialog_slots(self) -> list[dict]: + """ + Returns a list of dialog turns that can be used as a skeleton to fill with actual slot values. + If ``PromptFormatter`` was initialized with ``defaults`` argument, this method will return the + defaults. Otherwise, every slot is pre-filled with ``None``. + """ + + def _get_default_for_role(role: str) -> dict: + for turn in self._defaults: + if turn["role"] == role: + return turn + return {} + + return [ + { + "role": role, + "slots": { + slot: _get_default_for_role(role).get("slots", {}).get(slot) for slot in self.get_slots(role) + }, + } + for role in self.get_roles() + if role != self.OUTPUT_ROLE + ] + + def encode_turn( + self, prompt_template: str, expected_slots: dict[str, Modality], slot_values: dict[str, Any] + ) -> list[int]: + prompt = prompt_template + for slot in expected_slots: + # For the final substitution of 'slot' in the template we have to mangle it to '|slot|' anyway, + # but 'slot' form enables to use valid python identifiers as **kwargs + # for passing slots around in user functions. + value = slot_values.get(slot) + assert value is not None, f"Missing required {slot=} in {slot_values=} for {prompt_template=}" + prompt = prompt.replace(_mangled(slot), value) + return self._apply_tokenizer(prompt, lang=slot_values.get(self.PROMPT_LANGUAGE_SLOT)) + + def encode_dialog(self, turns: list[dict]) -> dict[str, torch.Tensor]: + assert len(turns) > 0, "Empty dialog is not supported." + roles = self.get_roles() + + turn_tokens = [] + turn_token_counts = [] + turn_mask_values = [] + + if "preamble" in self.TEMPLATE: + preamble_turns = [idx for idx, t in enumerate(turns) if t["role"] == "preamble"] + if not preamble_turns: + turns = [{"role": "preamble", **self.TEMPLATE["preamble"]}] + turns + else: + assert ( + len(preamble_turns) == 1 and preamble_turns[0] == 0 + ), f"Preamble can only be presented at turn 0, but we found preamble turns at indexes {preamble_turns}." + + for turn in turns: + assert "role" in turn, f"A turn must have have a 'role' key. 
We received {turn=}" + role = turn["role"] + assert role in roles, f"Found turn with {role=}, but availables roles are {roles}" + expected_slots = self.get_slots(role) + slot_values = turn.get("slots", {}) + if expected_slots: + assert ( + slot_values + ), f"A turn for role {role} must have have a non-empty value under 'slots' key. We received {turn=}" + self._validate_slot_values(expected_slots, slot_values) + template = self.get_template(role) + tokens = self.encode_turn(template, expected_slots, slot_values) + turn_tokens.extend(tokens) + turn_token_counts.append(len(tokens)) + turn_mask_values.append(role == self.OUTPUT_ROLE) + + ans = {"input_ids": torch.tensor(turn_tokens, dtype=torch.long)} + if turn_mask_values[-1]: + # The last turn comes from OUTPUT_ROLE, i.e. it's a response from the system. + # This indicates it's a training example for which we provide context/answer/mask. + ans["context_ids"] = ans["input_ids"][: -turn_token_counts[-1]] + ans["answer_ids"] = ans["input_ids"][-turn_token_counts[-1] :] + ans["mask"] = torch.tensor( + [ + turn_mask_values[turn_idx] + for turn_idx, turn_len in enumerate(turn_token_counts) + for _ in range(turn_len) + ], + dtype=torch.bool, + ) + else: + ans["context_ids"] = ans["input_ids"] # context == input for inference + return ans + + def _apply_tokenizer(self, text: str, lang: str | None = None) -> list[int]: + # Check if the tokenizer is aggregate and perform extra checks. + is_agg = isinstance(self.tokenizer, AggregateTokenizer) + if is_agg: + assert lang is not None, ( + f"Missing key '{self.PROMPT_LANGUAGE_SLOT}' in slot_values -- cannot resolve " + f"the correct sub-tokenizer in the aggregate tokenizer." + ) + + # Strip bos/eos if present and remember to apply them later. + has_bos = text.startswith(BOS_SLOT) + has_eos = text.endswith(EOS_SLOT) + if has_bos: + text = text[len(BOS_SLOT) :] + if has_eos: + text = text[: -len(EOS_SLOT)] + + # Tokenize, selecting the right API depending on aggregate/normal tokenizer. + if is_agg: + tokens = self.tokenizer.text_to_ids(text, lang) + else: + tokens = self.tokenizer.text_to_ids(text) + + # Lazily look up bos/eos and apply them. Lazy has the advantage that if a tokenizer + # doesn't define bos/eos and the prompt format does not request them, everything just works. + if has_eos: + eos_id = self.tokenizer.get_eos(lang) if is_agg else self.tokenizer.eos + tokens.append(eos_id) + if has_bos: + bos_id = self.tokenizer.get_bos(lang) if is_agg else self.tokenizer.bos + tokens = [bos_id] + tokens + + return tokens + + def _validate_slot_values(self, expected: dict[str, Modality], received: dict[str, Any]) -> None: + missing = set(expected) - set(received) + assert not missing, f"The following slot values were not provided: {missing}" + for slot in expected: + expected_modality = expected[slot] + value = received[slot] + assert expected_modality.matches( + value + ), f"{slot=} received {value=} which does not match modality {expected_modality}" + + def _validate_defaults(self): + if not self._defaults: + return + + err = "Error in default prompt definition:" + assert isinstance(self._defaults, list) + for turn in self._defaults: + assert isinstance(turn, dict) + assert "role" in turn, f"{err} Missing required 'role' key. We received {turn=}" + role = turn["role"] + assert role in self.get_roles(), ( + f"{err} Invalid {role=} in {turn=} - " f"supported roles are: {self.get_roles()}." 
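# For illustration, a ``defaults`` value that passes these checks for the Canary
# template could be (the slot values here are example choices, not requirements):
#
#     defaults = [
#         {"role": "user",
#          "slots": {"source_lang": "en", "target_lang": "en", "task": "asr", "pnc": "yes"}},
#     ]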
+ ) + if expected_slots := self.get_slots(role): + assert "slots" in turn, ( + f"{err} Missing required 'slots' key in {turn=} - " + f"we expected the following slots to be provided: {expected_slots}." + ) + for slot in turn["slots"]: + assert slot in expected_slots, ( + f"{err} Invalid {slot=} in {turn=}. " + f"The following slots are supported for {role=}: {expected_slots}" + ) + + +def _mangled(slot: str) -> str: + if not (slot[0] == "|" and slot[-1] == "|"): + return f"|{slot}|" + return slot + + +def _unmangled(slot: str) -> str: + if slot[0] == "|" and slot[-1] == "|": + return slot[1:-1] + return slot diff --git a/nemo/collections/common/prompts/gemma.py b/nemo/collections/common/prompts/gemma.py new file mode 100644 index 000000000000..e3b81c848a3e --- /dev/null +++ b/nemo/collections/common/prompts/gemma.py @@ -0,0 +1,29 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/gemma#gemma-7b-prompt-format +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + +GEMMA_BOS = "" +GEMMA_END_OF_TURN = "" +GEMMA_NL = "\n\n" + + +class GemmaPromptFormatter(PromptFormatter): + NAME = "gemma" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"{GEMMA_BOS}user\n|message|{GEMMA_END_OF_TURN}\n{GEMMA_BOS}model\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + # Note: that trailing NL is bothering me. + "template": f"|message|{GEMMA_END_OF_TURN}\n", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/llama.py b/nemo/collections/common/prompts/llama.py new file mode 100644 index 000000000000..fdaccfaa846e --- /dev/null +++ b/nemo/collections/common/prompts/llama.py @@ -0,0 +1,72 @@ +from nemo.collections.common.prompts.formatter import BOS_SLOT, EOS_SLOT, Modality, PromptFormatter + + +class Llama2PromptFormatter(PromptFormatter): + """ + This template has been validated to provide identical tokenized results to the official code + in https://github.com/meta-llama/llama/blob/main/llama/generation.py + """ + + NAME = "llama2" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "system_and_user": { + "template": f"{BOS_SLOT}[INST] <>\n|system|\n<>\n\n|message| [/INST]", + "slots": { + "system": Modality.Text, + "message": Modality.Text, + }, + }, + "user": { + "template": "|bos|[INST] |message| [/INST]", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message| {EOS_SLOT}", + "slots": { + "message": Modality.Text, + }, + }, + } + + +LLAMA3_BOS = "<|begin_of_text|>" +LLAMA3_HEADER_BEGIN = "<|start_header_id|>" +LLAMA3_HEADER_END = "<|end_header_id|>" +LLAMA3_END_OF_TURN = "<|eot_id|>" +LLAMA3_NL = "\n\n" + + +class Llama3PromptFormatter(PromptFormatter): + """ + Implemented following the code at: + https://github.com/meta-llama/llama3/blob/main/llama/test_tokenizer.py#L56 + """ + + NAME = "llama3" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "preamble": { + "template": LLAMA3_BOS, + }, + "system": { + "template": f"{LLAMA3_HEADER_BEGIN}system{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + "user": { + "template": f"{LLAMA3_HEADER_BEGIN}user{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"{LLAMA3_HEADER_BEGIN}assistant{LLAMA3_HEADER_END}{LLAMA3_NL}|message|{LLAMA3_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git 
a/nemo/collections/common/prompts/mistral.py b/nemo/collections/common/prompts/mistral.py new file mode 100644 index 000000000000..e882ac5973b1 --- /dev/null +++ b/nemo/collections/common/prompts/mistral.py @@ -0,0 +1,33 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/mistral-7b#chat-template-for-mistral-7b-instruct +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + +MISTRAL_BOS = "" +MISTRAL_PROMPT_BEGIN = "[INST]" +MISTRAL_PROMPT_END = "[/INST]" +MISTRAL_END_OF_TURN = "" +MISTRAL_NL = "\n\n" + + +class MistralPromptFormatter(PromptFormatter): + NAME = "mistral" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "preamble": { + "template": MISTRAL_BOS, + }, + "user": { + "template": f"{MISTRAL_PROMPT_BEGIN} |message| {MISTRAL_PROMPT_END} ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|{MISTRAL_END_OF_TURN}", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/prompts/phi2.py b/nemo/collections/common/prompts/phi2.py new file mode 100644 index 000000000000..67dad8d5dd82 --- /dev/null +++ b/nemo/collections/common/prompts/phi2.py @@ -0,0 +1,62 @@ +""" +Implemented following the guide at https://www.promptingguide.ai/models/phi-2#phi-2-usage +""" + +from nemo.collections.common.prompts.formatter import Modality, PromptFormatter + + +class Phi2QAPromptFormatter(PromptFormatter): + NAME = "phi2_qa" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"Instruct: |message|\nOutput: ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } + + +class Phi2ChatPromptFormatter(PromptFormatter): + NAME = "phi2_chat" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"Human: |message|\nAI: ", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } + + +class Phi2CodePromptFormatter(PromptFormatter): + NAME = "phi2_code" + OUTPUT_ROLE = "assistant" + TEMPLATE = { + "user": { + "template": f"|message|\n", + "slots": { + "message": Modality.Text, + }, + }, + OUTPUT_ROLE: { + "template": f"|message|", + "slots": { + "message": Modality.Text, + }, + }, + } diff --git a/nemo/collections/common/tokenizers/aggregate_tokenizer.py b/nemo/collections/common/tokenizers/aggregate_tokenizer.py index 9c003c37525a..66ec28ebda4d 100644 --- a/nemo/collections/common/tokenizers/aggregate_tokenizer.py +++ b/nemo/collections/common/tokenizers/aggregate_tokenizer.py @@ -15,6 +15,7 @@ from typing import Dict, List, Union import numpy as np +import torch from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.utils import logging @@ -124,7 +125,7 @@ def tokens_to_text(self, tokens, lang_id): return tokenizer.decode_pieces(tokens) def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): + if isinstance(ids, (np.ndarray, torch.Tensor)): ids = ids.tolist() tokens = [] @@ -224,6 +225,12 @@ def tokens_to_ids(self, tokens: Union[str, List[str]], langs: Union[str, List[st ids.append(self.token_to_id(token, lang_id)) return ids + def get_bos(self, lang_id: str) -> int: + return self.tokenizers_dict[lang_id].bos + self.token_id_offset[lang_id] + + def get_eos(self, lang_id: str) -> int: + return self.tokenizers_dict[lang_id].eos + self.token_id_offset[lang_id] + @property def vocab(self): return self.vocabulary diff --git 
a/nemo/collections/common/tokenizers/canary_tokenizer.py b/nemo/collections/common/tokenizers/canary_tokenizer.py index aed95c1f9312..6adcdd8cf734 100644 --- a/nemo/collections/common/tokenizers/canary_tokenizer.py +++ b/nemo/collections/common/tokenizers/canary_tokenizer.py @@ -24,7 +24,15 @@ __all__ = ['CanaryTokenizer'] # Default tokens for compatibility with Canary. -DEFAULT_TOKENS = ["<|nospeech|>", "", "<|endoftext|>", "<|startoftranscript|>", "<|pnc|>", "<|nopnc|>"] +CANARY_BOS = "<|startoftranscript|>" +CANARY_EOS = "<|endoftext|>" +CANARY_PAD = "" +CANARY_NOSPEECH = "<|nospeech|>" +CANARY_PNC = "<|pnc|>" +CANARY_NOPNC = "<|nopnc|>" +DEFAULT_TOKENS = [CANARY_NOSPEECH, CANARY_PAD, CANARY_EOS, CANARY_BOS, CANARY_PNC, CANARY_NOPNC] + +CANARY_SPECIAL_TOKENIZER = "spl_tokens" class CanaryTokenizer(AggregateTokenizer): @@ -37,26 +45,51 @@ def __init__(self, tokenizers: Dict): # for easy access of special tokens self.special_tokens = {} - for special in tokenizers['spl_tokens'].vocab: + for special in tokenizers[CANARY_SPECIAL_TOKENIZER].vocab: # Search for special prompting tokens - if (special.startswith("<|") and special.endswith("|>")) or special == "": - self.special_tokens[special] = self.token_to_id(special, lang_id='spl_tokens') + if (special.startswith("<|") and special.endswith("|>")) or special == CANARY_PAD: + self.special_tokens[special] = self.token_to_id(special, lang_id=CANARY_SPECIAL_TOKENIZER) @cached_property def eos_id(self) -> int: - return self.special_tokens["<|endoftext|>"] + return self.special_tokens[CANARY_EOS] @cached_property def bos_id(self) -> int: - return self.special_tokens["<|startoftranscript|>"] + return self.special_tokens[CANARY_BOS] @cached_property def nospeech_id(self) -> int: - return self.special_tokens["<|nospeech|>"] + return self.special_tokens[CANARY_NOSPEECH] @cached_property def pad_id(self) -> int: - return self.special_tokens[""] + return self.special_tokens[CANARY_PAD] + + def text_to_ids(self, text, lang_id) -> list[int]: + if lang_id == CANARY_SPECIAL_TOKENIZER: + return self._tokenize_special_prompt(text) + if text.endswith(CANARY_EOS): + return super().text_to_ids(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id] + return super().text_to_ids(text[-len(CANARY_EOS) :], lang_id) + + def _tokenize_special_prompt(self, text: str) -> list[int]: + """ + Tokenize the input special prompt of the following schema: + + <|startoftranscript|><|source_lang|><|taskname|><|target_lang|><|pnc|> + + Required because otherwise self.text_to_ids() returns a different result than what Canary had been trained with. + """ + ans = [] + assert text.count('>') == 5, f"Expected exactly 5 special tokens in Canary's prompt, got: {text}." 
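        # For example, a training-time prompt such as
        #     "<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>"
        # is peeled apart one "<|...|>" token at a time by the loop below and mapped
        # through self.special_tokens, yielding exactly five IDs (the concrete values
        # depend on the special-token vocabulary).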
+ assert text.startswith(CANARY_BOS), text + for _ in range(5): + token = text[: text.find(">") + 1] + ans.append(self.special_tokens[token]) + text = text[len(token) :] + assert len(text) == 0, text + return ans def spl_token_to_id(self, token): if token_id := self.special_tokens.get(f"<|{token}|>", None): diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index aed05673f6fa..4a47f0e49b1e 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -17,6 +17,7 @@ import numpy as np import sentencepiece +import torch from nemo.collections.common.parts.utils import if_exist from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -127,7 +128,7 @@ def tokens_to_text(self, tokens): return self.tokenizer.decode_pieces(tokens) def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): + if isinstance(ids, (np.ndarray, torch.Tensor)): ids = ids.tolist() if self.legacy: diff --git a/tests/collections/__init__.py b/tests/collections/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py index d250fbcf74a1..986df09deacb 100644 --- a/tests/collections/asr/test_asr_multitask_model_bpe.py +++ b/tests/collections/asr/test_asr_multitask_model_bpe.py @@ -80,9 +80,18 @@ def asr_model(test_data_dir): 'dir': None, 'type': 'agg', 'langs': { - 'spl_tokens': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "canary"), 'type': 'bpe',}, - 'en': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe',}, - 'de': {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe',}, + 'spl_tokens': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "canary"), + 'type': 'bpe', + }, + 'en': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), + 'type': 'wpe', + }, + 'de': { + 'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), + 'type': 'wpe', + }, }, 'custom_tokenizer': { '_target_': 'nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer', @@ -98,6 +107,9 @@ def asr_model(test_data_dir): modelConfig = DictConfig( { 'prompt_format': 'canary', + 'prompt_defaults': [ + {"role": "user", "slots": {"source_lang": "en", "target_lang": "en", "task": "asr", "pnc": "yes"}} + ], 'sample_rate': 16000, 'preprocessor': DictConfig(preprocessor), 'model_defaults': DictConfig(model_defaults), @@ -304,10 +316,9 @@ def test_transcribe_tensor(self, asr_model, test_data_dir): audio, sr = sf.read(audio_file, dtype='float32') # Numpy array test - with pytest.raises(NotImplementedError): - outputs = asr_model.transcribe(audio, batch_size=1) - # assert len(outputs) == 1 - # assert isinstance(outputs[0], str) + outputs = asr_model.transcribe(audio, batch_size=1) + assert len(outputs) == 1 + assert isinstance(outputs[0], str) @pytest.mark.unit def test_build_tokenizer(self, asr_model, test_data_dir): diff --git a/tests/collections/asr/test_custom_tokenizer.py b/tests/collections/asr/test_custom_tokenizer.py index 5a033045b709..61692061661f 100644 --- a/tests/collections/asr/test_custom_tokenizer.py +++ b/tests/collections/asr/test_custom_tokenizer.py @@ -67,7 +67,9 @@ class DummyModel(ASRBPEMixin, Serialization): "spl_tokens": {"dir": special_tokenizer_path, "type": "bpe"}, "en": {"dir": 
lang_tokenizer_path, "type": "bpe"}, }, - "custom_tokenizer": {"_target_": "nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer",}, + "custom_tokenizer": { + "_target_": "nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer", + }, } ) model._setup_aggregate_tokenizer(config) @@ -83,5 +85,11 @@ class DummyModel(ASRBPEMixin, Serialization): assert isinstance(tokenizer.tokenizers_dict["en"], SentencePieceTokenizer) assert tokenizer.tokenizers_dict["en"].vocab_size == 6 - assert tokenizer.text_to_ids("<|startoftranscript|>", lang_id="spl_tokens") == [13, 4] # "_" comes first + assert tokenizer.text_to_ids("<|startoftranscript|><|en|><|asr|><|en|><|pnc|>", lang_id="spl_tokens") == [ + 4, + 9, + 7, + 9, + 5, + ] assert tokenizer.text_to_ids("a", lang_id="en") == [14 + 1, 14 + 2] diff --git a/tests/collections/common/prompt_formatters/conftest.py b/tests/collections/common/prompt_formatters/conftest.py new file mode 100644 index 000000000000..e18f1072af24 --- /dev/null +++ b/tests/collections/common/prompt_formatters/conftest.py @@ -0,0 +1,51 @@ +import pytest + +from nemo.collections.common.tokenizers import CanaryTokenizer, SentencePieceTokenizer +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model + +# Note: We don't really define special tokens for this test so every 'special token' +# will be represented as a number of regular tokens. +TOKENIZER_TRAIN_TEXT = """ +Example system message. +Example user message. +Example assistant message. +TEST +[INST] +[/INST] + + +<> +<> +User: Assistant: +user model +Instruct Output +\n\n + +<| +|> +<|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|> +Feel free to add new tokens for your own tests!? +But know that if you do so, you may need to update the token IDs in the existing tests! +So, it might be a good idea to create a new tokenizer instead when adding new prompt formats. 
+""" + + +@pytest.fixture(scope="session") +def bpe_tokenizer(tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("bpe_tokenizer") + text_path = tmpdir / "text.txt" + text_path.write_text(TOKENIZER_TRAIN_TEXT) + create_spt_model(str(text_path), vocab_size=512, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir)) + return SentencePieceTokenizer(str(tmpdir / "tokenizer.model")) + + +@pytest.fixture(scope="session") +def canary_tokenizer(bpe_tokenizer, tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("spl_tokens") + spl_tokens = CanaryTokenizer.build_special_tokenizer(["transcribe", "en"], tmpdir) + return CanaryTokenizer( + tokenizers={ + "spl_tokens": spl_tokens, + "en": bpe_tokenizer, + } + ) diff --git a/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py new file mode 100644 index 000000000000..ff786766b246 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py @@ -0,0 +1,50 @@ +from nemo.collections.common.prompts.canary import CanaryPromptFormatter + + +def test_canary_prompt_formatter_training(canary_tokenizer): + formatter = CanaryPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "source_lang": "<|en|>", + "target_lang": "<|en|>", + "task": "<|transcribe|>", + "pnc": "<|pnc|>", + "prompt_language": "spl_tokens", + }, + }, + {"role": "assistant", "slots": {"text": "TEST", "prompt_language": "en"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [4, 8, 7, 8, 5, 11, 91, 30, 40, 3] + assert ans["context_ids"].tolist() == [4, 8, 7, 8, 5] + assert ans["answer_ids"].tolist() == [11, 91, 30, 40, 3] + assert ans["mask"].tolist() == [False] * 5 + [True] * 5 + # fmt: on + + +def test_canary_prompt_formatter_inference(canary_tokenizer): + formatter = CanaryPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "source_lang": "<|en|>", + "target_lang": "<|en|>", + "task": "<|transcribe|>", + "pnc": "<|pnc|>", + "prompt_language": "spl_tokens", + }, + }, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [4, 8, 7, 8, 5] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py new file mode 100644 index 000000000000..be1f6de1a873 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py @@ -0,0 +1,40 @@ +from nemo.collections.common.prompts.gemma import GemmaPromptFormatter + + +def test_gemma_prompt_formatter_training(bpe_tokenizer): + formatter = GemmaPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 26, + 18, 6, 60, 9, 7, 73, 61, 69, 1, 81, 20, 30, 104, 59, + 18, 26, 18, 6, 60, 9, 7] + assert ans["context_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 
26, + 18, 6, 60, 9, 7, 73, 61, 69] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, 104, 59, + 18, 26, 18, 6, 60, 9, 7] + assert ans["mask"].tolist() == [False] * 36 + [True] * 13 + # fmt: on + + +def test_gemma_prompt_formatter_inference(bpe_tokenizer): + formatter = GemmaPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [ 21, 53, 18, 26, 18, 6, 60, 9, 7, 75, 31, 1, 81, 20, + 30, 104, 59, 18, 26, 18, 6, 60, 9, 7, 21, 53, 18, 26, + 18, 6, 60, 9, 7, 73, 61, 69] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py new file mode 100644 index 000000000000..9636dd31c768 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py @@ -0,0 +1,63 @@ +from nemo.collections.common.prompts.llama import Llama2PromptFormatter + + +def test_llama2_prompt_formatter_training(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, -1] + assert ans["context_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, -1] + assert ans["mask"].tolist() == [False] * 16 + [True] * 5 + # fmt: on + + +def test_llama2_prompt_formatter_inference(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on + + +def test_llama2_prompt_formatter_training_with_system(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "system_and_user", "slots": {"system": "TEST", "message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, -1] + assert ans["context_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, -1] + assert ans["mask"].tolist() == [False] * 33 + [True] * 5 + # fmt: on + + +def test_llama2_prompt_formatter_inference_with_system(bpe_tokenizer): + formatter = Llama2PromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "system_and_user", "slots": {"system": "TEST", "message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert 
ans["input_ids"].tolist() == [-1, 54, 42, 49, 30, 50, 77, 13, 45, 13, 7, 7, 1, 81, 20, 30, 21, 66, 13, 45, 13, 7, 7, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py new file mode 100644 index 000000000000..edc00d426952 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py @@ -0,0 +1,32 @@ +from nemo.collections.common.prompts.mistral import MistralPromptFormatter + + +def test_mistral_prompt_formatter_training(bpe_tokenizer): + formatter = MistralPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + {"role": "assistant", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids", "answer_ids", "mask"} + # fmt: off + assert ans["input_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50, 1, 81, 20, 30, 66, 8, 7] + assert ans["context_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + assert ans["answer_ids"].tolist() == [1, 81, 20, 30, 66, 8, 7] + assert ans["mask"].tolist() == [False] * 18 + [True] * 7 + # fmt: on + + +def test_mistral_prompt_formatter_inference(bpe_tokenizer): + formatter = MistralPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"message": "TEST"}}, + ] + ) + assert set(ans) == {"input_ids", "context_ids"} + # fmt: off + assert ans["input_ids"].tolist() == ans["context_ids"].tolist() + assert ans["input_ids"].tolist() == [21, 8, 7, 54, 42, 49, 30, 50, 1, 81, 20, 30, 54, 72, 42, 49, 30, 50] + # fmt: on diff --git a/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py b/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py new file mode 100644 index 000000000000..26ade7da1415 --- /dev/null +++ b/tests/collections/common/prompt_formatters/test_prompt_formatter_api.py @@ -0,0 +1,147 @@ +import pytest + +from nemo.collections.common.prompts.canary import PromptFormatter +from nemo.collections.common.prompts.formatter import Modality + + +class _DummyPromptFormatter(PromptFormatter): + NAME = "_dummy_test_formatter" + TEMPLATE = { + "user": {"template": "|text|", "slots": {"text": Modality.Text}}, + "assistant": {"template": "|text|", "slots": {"text": Modality.Text}}, + } + OUTPUT_ROLE = "assistant" + + +def test_prompt_formatter_empty_dialog_exception(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError): + formatter.encode_dialog([]) + + +def test_prompt_formatter_inference(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "hi" + + +def test_prompt_formatter_training(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "assistant", "slots": {"text": "hello"}}, + ] + ) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "hi hello", recovered + + +def test_prompt_formatter_missing_role(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError, match="A turn must have have a 'role' key"): + formatter.encode_dialog([{"slots": 
{"text": "hi"}}]) + + +def test_prompt_formatter_missing_slots(bpe_tokenizer): + formatter = _DummyPromptFormatter(bpe_tokenizer) + with pytest.raises( + AssertionError, match="A turn for role user must have have a non-empty value under 'slots' key" + ): + formatter.encode_dialog([{"role": "user"}]) + with pytest.raises( + AssertionError, match="A turn for role user must have have a non-empty value under 'slots' key" + ): + formatter.encode_dialog([{"role": "user", "slots": {}}]) + + +def test_prompt_formatter_aggregate_tokenizer(canary_tokenizer): + # Note the 'canary_tokenizer' arg which is an aggregate tokenizer fixture. + formatter = _DummyPromptFormatter(canary_tokenizer) + ans = formatter.encode_dialog( + [ + { + "role": "user", + "slots": { + "text": "hi", + "prompt_language": "en", + }, + } + ] + ) + recovered = canary_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == " hi" + + +def test_prompt_formatter_aggregate_tokenizer_missing_prompt_language(canary_tokenizer): + # Note the 'canary_tokenizer' arg which is an aggregate tokenizer fixture. + formatter = _DummyPromptFormatter(canary_tokenizer) + + with pytest.raises(AssertionError, match="Missing key 'prompt_language' in slot_values"): + formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + + +class _DummyPreamblePromptFormatter(PromptFormatter): + NAME = "_dummy_preamble_test_formatter" + TEMPLATE = { + "preamble": {"template": "TEST"}, + "user": {"template": "|text|", "slots": {"text": Modality.Text}}, + "assistant": {"template": "|text|", "slots": {"text": Modality.Text}}, + } + OUTPUT_ROLE = "assistant" + + +def test_prompt_formatter_preamble_inference(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi", recovered + + +def test_prompt_formatter_premble_training(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "assistant", "slots": {"text": "hello"}}, + ] + ) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi hello" + + +def test_prompt_formatter_explicit_preamble(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + ans = formatter.encode_dialog([{"role": "preamble"}, {"role": "user", "slots": {"text": "hi"}}]) + recovered = bpe_tokenizer.ids_to_text(ans["input_ids"]) + assert recovered == "TEST hi" + + +def test_prompt_formatter_wrong_preamble_excpetions(bpe_tokenizer): + formatter = _DummyPreamblePromptFormatter(bpe_tokenizer) + with pytest.raises(AssertionError): + # Error: 2 preambles + formatter.encode_dialog( + [ + {"role": "preamble"}, + {"role": "preamble"}, + {"role": "user", "slots": {"text": "hi"}}, + ] + ) + with pytest.raises(AssertionError): + # Error: preamble not at the beginning + formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "preamble"}, + ] + ) + with pytest.raises(AssertionError): + # Error: preamble with slots + formatter.encode_dialog( + [ + {"role": "user", "slots": {"text": "hi"}}, + {"role": "preamble", "slots": {"abc": "abc"}}, + ] + ) From b0f3138a6be7fab3175deb8935f8492aeb1445bd Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Sat, 1 Jun 2024 09:41:45 -0700 Subject: [PATCH 43/47] support null/None truncation field (#9355) * support null/None truncation field 
Signed-off-by: arendu * Apply isort and black reformatting Signed-off-by: arendu * Fix truncation when truncation field is empty Signed-off-by: Cheng-Ping Hsieh * Fix final truncation if truncation_field is not enough Signed-off-by: Cheng-Ping Hsieh * Apply isort and black reformatting Signed-off-by: hsiehjackson * Fix Signed-off-by: Cheng-Ping Hsieh --------- Signed-off-by: arendu Signed-off-by: arendu Signed-off-by: Cheng-Ping Hsieh Signed-off-by: hsiehjackson Co-authored-by: arendu Co-authored-by: Cheng-Ping Hsieh Co-authored-by: hsiehjackson --- .../megatron/gpt_sft_dataset.py | 113 +++++++++++------- 1 file changed, 72 insertions(+), 41 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index faaa10606aa0..e16543a7568d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -101,7 +101,7 @@ def __init__( self.seed = seed self.label_key = label_key self.answer_only_loss = answer_only_loss - self.truncation_fields = truncation_field.split(',') + self.truncation_fields = truncation_field.split(',') if truncation_field is not None else [] self.pad_to_max_length = pad_to_max_length self.index_mapping_dir = index_mapping_dir self.prompt_template = prompt_template @@ -166,8 +166,9 @@ def _maybe_validate_prompt_template(self): ), f'{label_placeholder} must be at the end of prompt_template.' # Legacy checkpoints has self.truncation_fields = ['context'] and self.prompt_template_keys = ['input', 'output'] - if self.prompt_template_keys[0] == 'input' and self.truncation_fields[0] == 'context': - self.truncation_fields[0] = self.prompt_template_keys[0] + if len(self.truncation_fields) > 0: + if self.prompt_template_keys[0] == 'input' and self.truncation_fields[0] == 'context': + self.truncation_fields[0] = self.prompt_template_keys[0] assert set(self.truncation_fields).issubset( self.prompt_template_keys @@ -305,32 +306,61 @@ def _multiple_truncation(self, template_ids: List[List[int]], template_ids_keys: if total_ids > self.max_seq_length: truncation_length_total = total_ids - self.max_seq_length num_fields = len(self.truncation_fields) - # sorted equal divide length to each field - # examples: - # truncation_length_total = 3 - # num_fields = 11 - # truncation_length_list = [3,4,4] - truncation_length_list = [ - truncation_length_total // num_fields + (1 if i < truncation_length_total % num_fields else 0) - for i in range(num_fields)[::-1] - ] - - for i, (ids, key) in enumerate(zip(template_ids, template_ids_keys)): - if key in self.truncation_fields: - truncation_length = truncation_length_list.pop() - if len(ids) < truncation_length: - logging.warning(f'{key} is not long enough to truncate.') - truncation_length = len(ids) - - if self.truncation_method == 'left': - window_offset = truncation_length - elif self.truncation_method == 'right': - window_offset = 0 - else: - raise ValueError(f'{self.truncation_method} is not supported') + if num_fields > 0: + # sorted equal divide length to each field + # examples: + # truncation_length_total = 3 + # num_fields = 11 + # truncation_length_list = [3,4,4] + truncation_length_list = [ + truncation_length_total // num_fields + (1 if i < truncation_length_total % num_fields else 0) + for i in range(num_fields)[::-1] + ] - window_length = len(ids) - truncation_length - template_ids[i] = ids[window_offset : window_offset + 
window_length] + for i, (ids, key) in enumerate(zip(template_ids, template_ids_keys)): + if key in self.truncation_fields: + truncation_length = truncation_length_list.pop() + if len(ids) < truncation_length: + logging.warning(f'{key} is not long enough to truncate.') + truncation_length = len(ids) + + if self.truncation_method == 'left': + window_offset = truncation_length + elif self.truncation_method == 'right': + window_offset = 0 + else: + raise ValueError(f'{self.truncation_method} is not supported') + + window_length = len(ids) - truncation_length + template_ids[i] = ids[window_offset : window_offset + window_length] + else: + # If truncation_field is empty, we truncate template_ids (List[List[int]]) to make total ids < self.max_seq_length. + logging.warning( + f'`truncation_field` is empty, we truncate input from {self.truncation_method} based on truncation_method.' + ) + template_ids_lengths = [len(ids) for ids in template_ids] + if self.truncation_method == 'left': + iters = range(0, len(template_ids_lengths), 1) + elif self.truncation_method == 'right': + iters = range(len(template_ids_lengths) - 1, -1, -1) + else: + raise ValueError(f'{self.truncation_method} is not supported') + + # Iterate all lengths of template_ids. + for i in iters: + if template_ids_lengths[i] >= truncation_length_total: + template_ids_lengths[i] -= truncation_length_total + if self.truncation_method == 'left': + template_ids[i] = template_ids[i][-template_ids_lengths[i] :] + elif self.truncation_method == 'right': + template_ids[i] = template_ids[i][: template_ids_lengths[i]] + else: + raise ValueError(f'{self.truncation_method} is not supported') + break + else: + truncation_length_total -= template_ids_lengths[i] + template_ids_lengths[i] = 0 + template_ids[i] = [] context_ids = [i for ids in template_ids[:-1] for i in ids] label_ids = template_ids[-1] @@ -362,31 +392,30 @@ def _process_example(self, example): # these pad/eos tokens are placeholders for virtual tokens context_ids = [self.tokenizer.eos_id] * self.virtual_tokens + context_ids - input_ids = context_ids - answer_start_idx = len(input_ids) - # Adds bos token in the start if self.add_bos: context_ids = [self.tokenizer.bos_id] + context_ids - input_ids = [self.tokenizer.bos_id] + input_ids - answer_start_idx += 1 # Adds sep token between text/prompt and answer if self.add_sep: context_ids = context_ids + [self.sep_id] - input_ids = input_ids + [self.sep_id] - answer_start_idx += 1 - input_ids = input_ids + answer_ids + input_ids = context_ids + answer_ids # Only training need to consider eos token if self.add_eos: input_ids = input_ids + [self.tokenizer.eos_id] if len(input_ids) > self.max_seq_length: - logging.warning(f'Input ids length {len(input_ids)} exceed max sequence length {self.max_seq_length}') + # this only happens if tuncation_field is not enough to truncate. + # context_ids can be empty if we truncate contexts. + # answer_ids can be empty if we truncate answers. 
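+            # Illustrative example (numbers assumed): with max_seq_length=8 and 10 context tokens
+            # remaining after field truncation, context_ids and input_ids are both clamped to
+            # 8 tokens below, and answer_ids = input_ids[len(context_ids):] becomes empty.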
+ logging.warning( + f'After truncation, input ids length {len(input_ids)} still exceeds max sequence length {self.max_seq_length}' + ) + context_ids = context_ids[: self.max_seq_length] input_ids = input_ids[: self.max_seq_length] - answer_ids = input_ids[answer_start_idx:] + answer_ids = input_ids[len(context_ids) :] # store metadata in dataset, in case user may have keys required in the prediction json files metadata = {k: v for k, v in example.items() if k not in self.prompt_template_keys} @@ -396,7 +425,7 @@ def _process_example(self, example): processed_example = { 'input_ids': input_ids, - 'answer_start_idx': answer_start_idx, + 'answer_start_idx': len(context_ids), 'context_ids': context_ids, 'context_length': len(context_ids), 'answer_ids': answer_ids, @@ -426,7 +455,7 @@ def _collate_item(self, item, max_length, pad_id): return item def _build_loss_mask(self, processed_example): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" input_ids = processed_example['input_ids'] answer_start_idx = processed_example['answer_start_idx'] if self.answer_only_loss: @@ -641,7 +670,9 @@ def collate_fn(self, batch): else: attention_mask = [self._create_attention_mask(max_length) for _ in batch] processed_batch.update( - {'attention_mask': torch.stack(attention_mask),} + { + 'attention_mask': torch.stack(attention_mask), + } ) return processed_batch From 9218c3aab7af7c2d7f3d6e45c0b027bafe25eba8 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 3 Jun 2024 09:53:39 -0700 Subject: [PATCH 44/47] cicd_remove_commented_code (#9364) --- .github/workflows/cicd-main.yml | 40 +-------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b924cf975b18..29e84b933f14 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -43,33 +43,11 @@ jobs: docker container prune --filter "until=24h" --force docker image prune -a --filter "until=24h" --force -# checkout-repository: -# runs-on: self-hosted-azure -# container: -# image: nvcr.io/nvidia/pytorch:24.02-py3 -# volumes: -# - ${{ github.workspace }}:/workspace -# steps: -# - name: Checkout repository -# uses: actions/checkout@v4 -# with: -# path: ${{ github.run_id }} - cicd-test-container-setup: needs: [cicd-cluster-clean] runs-on: self-hosted-azure-builder if: ${{ github.event.label.name == 'Run CICD' }} - # uses: actions/cache@v2 - #container: -# image: nvcr.io/nvidia/pytorch:24.02-py3 -# options: -# # --user 0:128 -# --device=/dev/nvidia0 -# --gpus all -# --shm-size=8g -# --env TRANSFORMERS_OFFLINE=0 -# --env HYDRA_FULL_ERROR=1 steps: - name: Checkout repository uses: actions/checkout@v4 @@ -114,23 +92,7 @@ jobs: # These checks are not crucial exit 0 ' - - # - name: Build and push to local registry - # uses: docker/build-push-action@v5 - # with: - # context: . - # push: true - # tags: nemoci.azurecr.io/name/app:latest - - # - name: Inspect - # run: | - # docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest - - #- name: Post-workflow execution - # uses: gacts/run-and-post-run@v1 - # with: - # post: | - # chmod -R 777 . 
+ ### \'\' OPTIONAL_L0_Unit_Tests_GPU: From 48a2668821e86b4e514c9b04f16d5a7c7e51fd70 Mon Sep 17 00:00:00 2001 From: paul-gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:22:51 -0700 Subject: [PATCH 45/47] NeVa token fusion (#9245) * token fusion via mlp downsampling + media_type default fix Signed-off-by: paul-gibbons * inference update Signed-off-by: paul-gibbons * adapter fix Signed-off-by: paul-gibbons * config refactor, remove image_token_len dependency, transpose mlp_downsample height and weight Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * removing image_token_len in text generation strategy Signed-off-by: paul-gibbons * fix patch_dim text generation Signed-off-by: paul-gibbons * crop-size fix Signed-off-by: paul-gibbons * fixing RGB reversal bug Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * crop_size default -> None in text_generation_strategy Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * patch_dim padding for mlp_downsample Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * patch_dim padding update Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * updating h/w patch_dim naming convention Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons --------- Signed-off-by: paul-gibbons Signed-off-by: paul-gibbons Co-authored-by: paul-gibbons --- .../neva/conf/llava_config.yaml | 1 + .../multimodal_llm/neva/conf/neva_config.yaml | 2 +- .../multimodal_llm/neva/conf/neva_peft.yaml | 1 + .../neva/conf/video_neva_config.yaml | 2 +- .../multimodal/data/neva/neva_dataset.py | 59 ++++++++++++++----- nemo/collections/multimodal/parts/utils.py | 15 ++--- .../megatron/adapters/parallel_adapters.py | 53 ++++++++++++++--- .../common/text_generation_strategy.py | 30 ++++++++-- 8 files changed, 127 insertions(+), 36 deletions(-) diff --git a/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml index 68d554efb806..b47c719fef1d 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/llava_config.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "openai/clip-vit-large-patch14" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index b9904981a5db..9ec6e51bb004 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 @@ -189,7 +190,6 @@ model: is_multimodal: True media_type: image # currently supported: image sep_image_conv_front: False - image_token_len: 256 conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py` image_folder: null image_aspect_ratio: 'square' diff --git 
a/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml index bde6718faf1a..5dfcec776b69 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml @@ -74,6 +74,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [224, 224] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 diff --git a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml index e2ba8494f2cd..8341ff857202 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml @@ -75,6 +75,7 @@ model: from_pretrained: "" # path or name from_hf: True patch_dim: 14 + crop_size: [336, 336] hidden_size: 1024 # could be found from model but tricky in code vision_select_layer: -2 # default to the last layer class_token_length: 1 @@ -194,7 +195,6 @@ model: num_frames: 8 # selects the number of frames to use from the video sep_token_between_frames: False # TODO: allow usage of separator tokens between frames sep_image_conv_front: False - image_token_len: 256 conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py` image_folder: null video_folder: null diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 70afc5b4a19a..07b5ad1a32df 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -145,25 +145,26 @@ def open_video(self, file_name): cap = decord.VideoReader(f) return self.flatten_frames(cap) else: + decord.bridge.set_bridge("torch") cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) return self.flatten_frames(cap) return None def flatten_frames(self, cap): if self.data_cfg['splice_single_frame'] == 'first': - frame = cap[0].asnumpy()[:, :, ::-1] + frame = cap[0].asnumpy() return Image.fromarray(frame).convert('RGB') elif self.data_cfg['splice_single_frame'] == 'middle': - frame = cap[len(cap) // 2].asnumpy()[:, :, ::-1] + frame = cap[len(cap) // 2].asnumpy() return Image.fromarray(frame).convert('RGB') elif self.data_cfg['splice_single_frame'] == 'last': - frame = cap[-1].asnumpy()[:, :, ::-1] + frame = cap[-1].asnumpy() return Image.fromarray(frame).convert('RGB') else: if self.data_cfg['num_frames'] == -1: frames = [] for frame in cap: - rgb_frame = frame.asnumpy()[:, :, ::-1] + rgb_frame = frame.asnumpy() img = Image.fromarray(rgb_frame).convert('RGB') frames.append(img) return frames @@ -171,10 +172,7 @@ def flatten_frames(self, cap): num_frames = min(len(cap), self.data_cfg['num_frames']) indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) frames = [] - for i in indices: - rgb_frame = cap[i].asnumpy()[:, :, ::-1] - img = Image.fromarray(rgb_frame).convert('RGB') - frames.append(img) + frames = cap.get_batch(indices) while len(frames) < self.data_cfg['num_frames']: frames.append(frames[-1]) @@ -262,9 +260,13 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in return sources num_patches = image_token_len + if media_type == 'video': num_patches *= multimodal_cfg['num_frames'] + if multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': 
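+        # DownSampleBlock fuses each 2x2 grid of neighboring patches into a single token,
+        # so the number of media placeholder tokens fed to the LLM shrinks by a factor of 4.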
+ num_patches //= 4 + if multimodal_cfg['use_im_start_end']: replace_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] * num_patches else: @@ -922,9 +924,19 @@ def expand2square(pil_img, background_color): media_tensors = torch.tensor([]) if images: media_tensors = torch.stack(images) - cur_token_len = (media_tensors[0].shape[1] // 14) * ( - media_tensors[0].shape[2] // 14 - ) # FIXME: 14 is hardcoded patch size + patch_dim = self.multimodal_cfg['patch_dim'] + + height_num_patches = media_tensors[0].shape[1] // patch_dim + width_num_patches = media_tensors[0].shape[2] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + cur_token_len = height_num_patches * width_num_patches + sources = preprocess_multimodal( copy.deepcopy(sources), self.multimodal_cfg, @@ -978,9 +990,19 @@ def expand2square(pil_img, background_color): media_tensors = frames if videos: media_tensors = torch.stack(videos) - cur_token_len = (media_tensors[0].shape[-1] // 14) * ( - media_tensors[0].shape[-2] // 14 - ) # FIXME: 14 is hardcoded patch size + patch_dim = self.multimodal_cfg['patch_dim'] + + height_num_patches = media_tensors[0].shape[-2] // patch_dim + width_num_patches = media_tensors[0].shape[-1] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + cur_token_len = height_num_patches * width_num_patches + sources = preprocess_multimodal( copy.deepcopy(sources), self.multimodal_cfg, @@ -1190,11 +1212,15 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: add_extra_token = 1 if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False): add_extra_token = 0 - crop_size = data_cfg.get("crop_size", (224, 224)) + crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224)) if mm_cfg.vision_encoder.from_hf: image_processor = CLIPImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) + assert crop_size == ( + image_processor.crop_size['height'], + image_processor.crop_size['width'], + ), f"Crop size {crop_size} does not match the HuggingFace CLIP model's crop size {(image_processor.crop_size['height'], image_processor.crop_size['width'])}" else: # TODO(yuya): Fix this hard-code for our own CLIP image_processor = image_transform( @@ -1212,8 +1238,8 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: sep_image_conv_front=data_cfg.sep_image_conv_front, model_type=mm_cfg.llm.get("model_type", "nvgpt"), conv_template=data_cfg.get("conv_template", "nvgpt"), + patch_dim=model_cfg.mm_cfg.vision_encoder.patch_dim, crop_size=crop_size, - image_token_len=data_cfg.image_token_len, image_folder=data_cfg.get('image_folder', None), video_folder=data_cfg.get('video_folder', None), image_aspect_ratio=data_cfg.image_aspect_ratio, @@ -1223,6 +1249,7 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: context_length=model_cfg.encoder_seq_length, media_type=data_cfg.get('media_type', 'image'), num_frames=data_cfg.get('num_frames', -1), + mm_mlp_adapter_type=model_cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), ), data_cfg=dict( splice_single_frame=data_cfg.get('splice_single_frame', None), diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 70dd2174a2b7..8f2549b8fcd0 100644 --- 
a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -15,6 +15,7 @@ import tempfile from typing import Any, Callable, Tuple +import decord import numpy as np import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -469,23 +470,23 @@ def expand2square(pil_img, background_color): # add video processor for video neva def video_processor(maybe_video_path): - from decord import VideoReader if isinstance(maybe_video_path, str): - vr = VideoReader(maybe_video_path) + decord.bridge.set_bridge("torch") + vr = decord.VideoReader(maybe_video_path) if neva_cfg.data.splice_single_frame == 'first': - frames = [Image.fromarray(vr[0].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[0].asnumpy()).convert('RGB')] elif neva_cfg.data.splice_single_frame == 'middle': - frames = [Image.fromarray(vr[len(vr) // 2].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[len(vr) // 2].asnumpy()).convert('RGB')] elif neva_cfg.data.splice_single_frame == 'last': - frames = [Image.fromarray(vr[-1].asnumpy()[:, :, ::-1]).convert('RGB')] + frames = [Image.fromarray(vr[-1].asnumpy()).convert('RGB')] else: if neva_cfg.data.num_frames == -1: - frames = [Image.fromarray(frame.asnumpy()[:, :, ::-1]).convert('RGB') for frame in vr] + frames = [Image.fromarray(frame.asnumpy()).convert('RGB') for frame in vr] else: num_frames = min(len(vr), neva_cfg.data.num_frames) indices = np.linspace(0, len(vr) - 1, num_frames, dtype=int) - frames = [Image.fromarray(vr[i].asnumpy()[:, :, ::-1]).convert('RGB') for i in indices] + frames = vr.get_batch(indices) while len(frames) < neva_cfg.data.num_frames: frames.append(frames[-1]) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 51510f1b881e..541ca9c28f3d 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -275,7 +275,9 @@ def _get_init_fn(self, init_method: str): raise NotImplementedError("out_init_method should be zero, normal, kaiming or xavier") return init_fn - def adapter_unfreeze(self,): + def adapter_unfreeze( + self, + ): """ Can be customized to allow for selective training of only some params in the PEFT. 
""" @@ -402,7 +404,7 @@ class LoraQAdapter(ParallelLinearAdapter): class LoraDenseAttentionAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -411,7 +413,7 @@ class LoraDenseAttentionAdapter(ParallelLinearAdapter): class LoraHto4HAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -420,7 +422,7 @@ class LoraHto4HAdapter(ParallelLinearAdapter): class Lora4HtoHAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -688,14 +690,20 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table(self,): + def clear_inference_table( + self, + ): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table(self,): + def get_inference_table( + self, + ): return self.inference_table.data - def inner_forward(self,): + def inner_forward( + self, + ): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) @@ -890,6 +898,29 @@ class LoraKQVAdapterWeightTyingConfig(ParallelLinearAdapterWeightTyingConfig): _target_: str = "{0}.{1}".format(LoraKQVAdapterWeightTying.__module__, LoraKQVAdapterWeightTying.__name__) +class DownSampleBlock(nn.Module): + def forward(self, x): + vit_embeds = x + h = w = int(vit_embeds.shape[3] ** 0.5) + vit_embeds = vit_embeds.reshape(*vit_embeds.shape[:3], h, w, -1) + vit_embeds = self.flat_square(vit_embeds) + vit_embeds = vit_embeds.reshape(*vit_embeds.shape[:3], -1, vit_embeds.shape[-1]) + return vit_embeds + + def flat_square(self, x): + b, T, F, h, w, c = x.size() + if w % 2 == 1: + x = torch.cat([x, torch.zeros((b, T, F, h, 1, c), dtype=x.dtype).to(x.device)], dim=4) + b, T, F, h, w, c = x.size() + if h % 2 == 1: + x = torch.cat([x, torch.zeros((b, T, F, 1, w, c), dtype=x.dtype).to(x.device)], dim=3) + b, T, F, h, w, c = x.size() + x = x.view(b, T, F, h, int(w / 2), int(c * 2)) + x = x.permute(0, 1, 2, 4, 3, 5).contiguous() + x = x.view(b, T, F, int(h / 2), int(w / 2), int(c * 4)) + return x + + class MultimodalProjectorAdapter(nn.Module, AdapterModuleUtil): def __init__(self, adapter_type: str, in_features: int, out_features: int, bias: bool, **kwargs) -> None: super().__init__() @@ -898,6 +929,14 @@ def __init__(self, adapter_type: str, in_features: int, out_features: int, bias: self.mm_projector = torch.nn.Linear(in_features, out_features, bias) elif adapter_type == 'identity': self.mm_projector = lambda x: x + elif adapter_type == 'mlp_downsample': + self.mm_projector = torch.nn.Sequential( + DownSampleBlock(), + torch.nn.LayerNorm(in_features * 4), + torch.nn.Linear(in_features * 4, out_features, bias), + 
torch.nn.GELU(), + torch.nn.Linear(out_features, out_features, bias), + ) else: mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', adapter_type) if mlp_gelu_match: diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 44a80465c34b..e8e2859e439f 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -20,7 +20,7 @@ from typing import List, Set, Tuple import torch - +from transformers import CLIPImageProcessor from nemo.collections.nlp.modules.common.lm_utils import pad_batch from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids @@ -533,7 +533,6 @@ class NevaModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) self.forward_model = self.model.model - self.num_media_latents = model.cfg.data.get("image_token_len", 576) self.tokenizer = self.model.tokenizer self.image_paths = [] self.cfg = self.model.cfg @@ -545,8 +544,10 @@ def __init__(self, model): sep_image_conv_front=self.data_cfg.sep_image_conv_front, conv_template=self.data_cfg.get("conv_template", "nvgpt"), model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), - image_token_len=self.data_cfg.image_token_len, - image_folder=self.data_cfg.image_folder, + patch_dim=self.cfg.mm_cfg.vision_encoder.patch_dim, + crop_size=self.cfg.mm_cfg.vision_encoder.get("crop_size", None), + image_folder=self.data_cfg.get('image_folder', None), + video_folder=self.data_cfg.get('video_folder', None), image_aspect_ratio=self.data_cfg.image_aspect_ratio, use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), image_processor=None, @@ -554,7 +555,28 @@ def __init__(self, model): context_length=self.cfg.encoder_seq_length, media_type=getattr(self.data_cfg, 'media_type', 'image'), num_frames=getattr(self.data_cfg, 'num_frames', 1), + mm_mlp_adapter_type=getattr(self.cfg.mm_cfg, 'mm_mlp_adapter_type', 'linear'), ) + if self.multimodal_cfg['crop_size'] is None: + image_processor = CLIPImageProcessor.from_pretrained( + self.cfg.mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 + ) + self.multimodal_cfg['crop_size'] = ( + image_processor.crop_size['height'], + image_processor.crop_size['width'], + ) + + patch_dim = self.multimodal_cfg['patch_dim'] + height_num_patches = self.multimodal_cfg['crop_size'][0] // patch_dim + width_num_patches = self.multimodal_cfg['crop_size'][1] // patch_dim + + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + if height_num_patches % 2 != 0: + height_num_patches += 1 + if width_num_patches % 2 != 0: + width_num_patches += 1 + + self.num_media_latents = height_num_patches * width_num_patches def clip_max_len(self, maxlen: int) -> int: """clip the max len based on the LM model max sequence length""" From bd014d9d71a258da6c69c80df8244a9598c752f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Mon, 3 Jun 2024 14:40:16 -0400 Subject: [PATCH 46/47] Fix prompt formatter's defaults=None case in multi-task model (#9366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Żelasko --- nemo/collections/asr/models/aed_multitask_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/models/aed_multitask_models.py 
b/nemo/collections/asr/models/aed_multitask_models.py index 880f8bb3a004..edb591921782 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -133,7 +133,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): prompt_cls = PromptFormatter.resolve(self.prompt_format) self.prompt = prompt_cls( tokenizer=self.tokenizer, - defaults=OmegaConf.to_container(cfg.get("prompt_defaults")), + defaults=OmegaConf.to_container(pd) if (pd := cfg.get("prompt_defaults")) is not None else None, ) # Setup audio preprocessor From a0488f63fbfb555f05461dcf235f9a58559a99eb Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:28:09 -0700 Subject: [PATCH 47/47] Update Gemma conversion script (#9365) * Update Gemma conversion script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 --- .../checkpoint_converters/convert_gemma_jax_to_nemo.py | 3 ++- .../checkpoint_converters/convert_gemma_pyt_to_nemo.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py index c35906dc78c1..1cbeeb41c66d 100644 --- a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py @@ -14,6 +14,7 @@ """ Requires to install: `pip install orbax jax flax jaxlib` +Requires to clone: https://github.com/google-deepmind/gemma.git Required to set: `export PYTHONPATH=/path/to/google/gemma_jax:$PYTHONPATH` python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_jax_to_nemo.py \ --input_name_or_path /path/to/gemma/checkpoints/jax/7b \ @@ -27,8 +28,8 @@ import jax import torch +from gemma.params import load_params, nest_params, param_remapper from omegaconf import OmegaConf -from params import load_params, nest_params, param_remapper from transformer import TransformerConfig from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel diff --git a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py index 583ee7893c0f..d14e5f7de551 100644 --- a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py @@ -14,6 +14,7 @@ """ Requires to install: `pip install fairscale==0.4.13 immutabledict==4.1.0 tensorstore==0.1.45` +Requires to clone: https://github.com/google/gemma_pytorch.git Required to set: `export PYTHONPATH=/path/to/google/gemma_pytorchh:$PYTHONPATH` python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_pyt_to_nemo.py \ --input_name_or_path /path/to/gemma/checkpoints/pyt/7b.ckpt \ @@ -26,9 +27,9 @@ from argparse import ArgumentParser import torch -from model.config import get_config_for_2b, get_config_for_7b -from model.model import CausalLM -from model.tokenizer import Tokenizer +from gemma.config import get_config_for_2b, get_config_for_7b +from gemma.model import CausalLM +from gemma.tokenizer import Tokenizer from omegaconf import OmegaConf from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -152,7 +153,8 @@ def adjust_tensor_shapes(model, nemo_state_dict): # [(head_num + 2 * num_query_groups) * head_size, hidden_size] # -> [head_num, head_size, 
hidden_size], 2 * [num_query_groups, head_size, hidden_size]
         q_weight, k_weight, v_weight = qkv_weight.split(
-            [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0,
+            [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size],
+            dim=0,
         )
         q_weight = q_weight.reshape(head_num, head_size, hidden_size)
         k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size)
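         # Illustrative shapes (assuming the 2B checkpoint config: head_num=8, num_query_groups=1, head_size=256):
         # q_weight -> [8, 256, hidden_size]; k_weight and v_weight -> [1, 256, hidden_size] each.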