Merge branch 'r2.0.0rc1' into aot/peft_fix
cuichenx authored Jul 8, 2024
2 parents 6436015 + 66c960e commit e8372d9
Showing 101 changed files with 7,164 additions and 866 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
@@ -36,7 +36,6 @@ on:
jobs:
main:
runs-on: ${{ inputs.RUNNER }}
timeout-minutes: ${{ inputs.TIMEOUT }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
@@ -54,6 +53,7 @@ jobs:
uses: actions/checkout@v4
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
run: |
set +e
(
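The net effect of this template change, shown as a minimal hedged sketch (workflow inputs and the test body are elided; only names that appear in the hunks above are used): the timeout now bounds only the main step, so earlier steps such as checkout do not count against it.

jobs:
  main:
    runs-on: ${{ inputs.RUNNER }}
    outputs:
      conclusion: ${{ steps.main.conclusion }}
      log: ${{ steps.main.outputs.log }}
    steps:
      - uses: actions/checkout@v4
      - id: main
        name: Run main script
        timeout-minutes: ${{ inputs.TIMEOUT }}  # moved here from the job level
        run: |
          set +e
          # ... test body elided, as in the hunk above ...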
75 changes: 75 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -3488,6 +3488,80 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4433,6 +4507,7 @@ jobs:
- L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
20 changes: 18 additions & 2 deletions Dockerfile.ci
@@ -32,9 +32,9 @@ EOF
WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG MCORE_TAG=0bc3547702464501feefeb5523b7a17e591b21fa
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -61,6 +61,22 @@ git checkout ${MCORE_TAG} && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Mamba dependency installation
git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
python setup.py install && \
cd .. && \
rm -rf mamba

git clone https://github.com/Dao-AILab/causal-conv1d && \
cd causal-conv1d && \
git checkout v1.2.2.post1 && \
python setup.py install && \
cd .. && \
rm -rf causal-conv1d

EOF

# Copy over NeMo code
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_recognition/api.rst
@@ -6,6 +6,6 @@ Model Classes
-------------
.. autoclass:: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel
:show-inheritance:
:members: setup_finetune_model, get_embedding, verify_speakers
:members: setup_finetune_model, get_embedding, verify_speakers, verify_speakers_batch


8 changes: 7 additions & 1 deletion docs/source/asr/speaker_recognition/results.rst
@@ -91,14 +91,20 @@ Speaker Verification Inference

Speaker Verification is a task of verifying if two utterances are from the same speaker or not.

We provide a helper function to verify the audio files and return True if two provided audio files are from the same speaker, False otherwise.
We provide helper functions to verify audio files (a single pair or a batch of pairs) and return True if the provided pair of audio files is from the same speaker, False otherwise.

The audio files should be 16 kHz, mono-channel WAV files.

.. code-block:: python
from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
decision = speaker_model.verify_speakers('path/to/one/audio_file','path/to/other/audio_file')
decisions = speaker_model.verify_speakers_batch([
('/path/to/audio_0_0', '/path/to/audio_0_1'),
('/path/to/audio_1_0', '/path/to/audio_1_1'),
('/path/to/audio_2_0', '/path/to/audio_2_1'),
('/path/to/audio_3_0', '/path/to/audio_3_1')
], batch_size=4, device='cuda')
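Assuming, as the prose above implies, that verify_speakers_batch returns one boolean decision per input pair, a minimal usage sketch (all file paths are placeholders) could look like this:

from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")

# Placeholder paths to 16 kHz, mono-channel WAV files; each tuple is one pair to compare.
pairs = [
    ("/path/to/audio_0_0.wav", "/path/to/audio_0_1.wav"),
    ("/path/to/audio_1_0.wav", "/path/to/audio_1_1.wav"),
]

# Assumption: one boolean per pair, True meaning both files come from the same speaker.
decisions = speaker_model.verify_speakers_batch(pairs, batch_size=2, device="cuda")
for (first, second), same_speaker in zip(pairs, decisions):
    print(f"{first} vs {second}: {'same' if same_speaker else 'different'} speaker")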
NGC Pretrained Checkpoints
8 changes: 0 additions & 8 deletions examples/multimodal/convert_ckpt_to_nemo.py
@@ -165,14 +165,6 @@ def convert(local_rank, rank, world_size, args):
model = MegatronControlNet.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'kosmos':
model = MegatronKosmosModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
elif args.model_type == 'neva':
model = MegatronNevaModel.load_from_checkpoint(
checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
)
else:
raise ValueError(f"Unrecognized model_type {args.model_type}.")

@@ -1,3 +1,50 @@
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 32
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
check_val_every_n_epoch: null
limit_val_batches: 50
limit_test_batches: 500
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
gradient_clip_val: 1.0
benchmark: False
enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_clip
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
resume_from_checkpoint: ${model.resume_from_checkpoint}
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
ema:
enable: False
decay: 0.9999
validate_original_weights: False
every_n_steps: 1
cpu_offload: False

model:
precision: 32
# specify micro_batch_size, global_batch_size, and model parallelism
@@ -19,6 +66,9 @@ model:
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue

mcore_gpt: False
transformer_engine: False

vision:
precision: 32
# vision configs
@@ -135,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: True

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
@@ -68,6 +68,8 @@ model:
# numerical results as the naïve method.
local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix)
gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue
mcore_gpt: True
transformer_engine: True

vision:
precision: ${trainer.precision}
@@ -183,7 +185,6 @@ model:
bias_activation_fusion: False
megatron_legacy: False

transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
@@ -6,7 +6,7 @@ trainer:
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16
precision: 32 # 16, 32, or bf16

model:
restore_from_path: null # Path to a trained ViT .nemo file