diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
new file mode 100644
index 000000000000..31e9452d0fe5
--- /dev/null
+++ b/.github/workflows/_test_template.yml
@@ -0,0 +1,58 @@
+name: ~test template
+
+on:
+  workflow_call:
+    inputs:
+      RUNNER:
+        type: string
+        description: Runner to use for test
+        required: true
+      TIMEOUT:
+        type: number
+        description: Max runtime of test in minutes
+        required: false
+        default: 10
+      SCRIPT:
+        type: string
+        description: Test script to execute
+        required: true
+      AFTER_SCRIPT:
+        type: string
+        description: Script to run after main test
+        required: false
+        default: ":"
+      IS_OPTIONAL:
+        type: boolean
+        description: Failure will not cancel other tests if set to true
+        required: false
+        default: false
+    outputs:
+      conclusion:
+        description: Conclusion of main test step
+        value: ${{ jobs.main.outputs.conclusion }}
+
+jobs:
+  main:
+    runs-on: ${{ inputs.RUNNER }}
+    timeout-minutes: ${{ inputs.TIMEOUT }}
+    outputs:
+      conclusion: ${{ steps.main.conclusion }}
+    container:
+      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
+      options:
+        --device=/dev/nvidia0
+        --gpus all
+        --shm-size=8g
+        --env TRANSFORMERS_OFFLINE=0
+        --env HYDRA_FULL_ERROR=1
+        --volume /mnt/datadrive/TestData:/home/TestData
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - id: main
+        run: ${{ inputs.SCRIPT }}
+      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
+        if: failure() && inputs.IS_OPTIONAL == false
+      - name: after_script
+        if: always() && inputs.AFTER_SCRIPT != ':'
+        run: ${{ inputs.AFTER_SCRIPT }}
\ No newline at end of file
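For orientation, this is roughly how a job consumes the template above (a minimal sketch, not part of this diff: the job names `Example_Test` and `report`, the script bodies, and the cleanup path are hypothetical; the input names, their defaults, and the `conclusion` output come from `_test_template.yml`):

jobs:
  Example_Test:
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure   # passed through to runs-on in the template
      TIMEOUT: 20                 # minutes; the template defaults to 10 when omitted
      IS_OPTIONAL: true           # a failure then skips the cancel-workflow step
      SCRIPT: |
        pytest -m "not pleasefixme" tests/example    # hypothetical test selection
      AFTER_SCRIPT: |
        rm -rf /tmp/example_results    # runs in the always() after_script step

  report:
    needs: [Example_Test]
    runs-on: ubuntu-latest
    steps:
      - run: echo "main test concluded: ${{ needs.Example_Test.outputs.conclusion }}"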
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 29e84b933f14..815b8b5d69be 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -97,193 +97,88 @@ jobs:

   OPTIONAL_L0_Unit_Tests_GPU:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: "L0: Unit Tests GPU"
-        run: |
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
         NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
-    #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-    #  if: "failure()"
-
+      IS_OPTIONAL: true

   L0_Unit_Tests_CPU:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-cpu
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: "L0: Unit Tests CPU"
-        run: |
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-cpu
+      TIMEOUT: 80
+      SCRIPT: |
         CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"

   L0_Setup_Test_Data_And_Models:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python -m tests.setup --save_dir /home/TestData/nlp
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python -m tests.setup --save_dir /home/TestData/nlp

-## - name: L2: Multimodal Imagen Train
+  ## - name: L2: Multimodal Imagen Train

   # L2: Community LLM Checkpoints tests
   L2_Community_LLM_Checkpoints_tests_Llama:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
-          --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
-          --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-          --precision=16
-      - name: Cleanup
-        if: "always()"
-        run: |
-          rm -rf /home/TestData/nlp/megatron_llama/model_weights
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
+        --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
+        --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        --precision=16
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_llama/model_weights

   L2_Community_LLM_Checkpoints_tests_Llama3:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-      - run: |
-          CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
-          --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \
-          --output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \
-          --precision=16
-          rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo
-      - name: Cleanup
-        if: "always()"
-        run: |
-          rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
+        --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \
+        --output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \
+        --precision=16
+      AFTER_SCRIPT: |
+        rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo
+        rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights

   L2_Community_LLM_Checkpoints_tests_StarCoder:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }};
-          python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \
-          --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \
-          --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}
-      - name: Cleanup
-        if: "always()"
-        run: |
-          rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo;
-          rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/
-          rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }};
+        python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \
+        --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \
+        --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo;
+        rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/
+        rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights

   L2_Community_LLM_Checkpoints_tests_Falcon:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \
-          --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \
-          --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
-          rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
-      - name: Cleanup
-        if: "always()"
-        run: |
-          rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \
+        --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \
+        --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
+        rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights
+
   # this test is using a 7B model which is too large for GitHub CI
   # replace the model in this test with a toy model or move the test
   # to the nightly CI
@@ -313,93 +208,51 @@ jobs:

   L2_PTQ_Llama2_Export_Only:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/nlp/language_modeling/megatron_quantization.py \
-          model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-          quantization.algorithm=null \
-          model_save=/home/TestData/nlp/megatron_llama/ci_baseline
-
-          rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_quantization.py \
+        model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        quantization.algorithm=null \
+        model_save=/home/TestData/nlp/megatron_llama/ci_baseline
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_llama/ci_baseline

   L2_PTQ_Llama2_FP8:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/nlp/language_modeling/megatron_quantization.py \
-          model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-          tensor_model_parallel_size=2 \
-          trainer.devices=2 \
-          quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
-          quantization.algorithm=fp8 \
-          quantization.num_calib_size=8 \
-          inference.batch_size=2 \
-          export.inference_tensor_parallel=2 \
-          model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
-
-          rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_quantization.py \
+        model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        tensor_model_parallel_size=2 \
+        trainer.devices=2 \
+        quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
+        quantization.algorithm=fp8 \
+        quantization.num_calib_size=8 \
+        inference.batch_size=2 \
+        export.inference_tensor_parallel=2 \
+        model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo

   L2_PTQ_Llama2_INT8_SQ:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/nlp/language_modeling/megatron_quantization.py \
-          model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-          quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
-          quantization.algorithm=int8_sq \
-          quantization.num_calib_size=8 \
-          inference.batch_size=2 \
-          model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
-
-          rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_quantization.py \
+        model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
+        quantization.algorithm=int8_sq \
+        quantization.num_calib_size=8 \
+        inference.batch_size=2 \
+        model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo

   # TODO: investigate int4_awq stuck issues and restore the test
   #L2_PTQ_Llama2_INT4_AWQ:
@@ -437,274 +290,172 @@ jobs:

   # L2: ASR dev run
   ASR_dev_run_Speech_to_Text:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_ctc/speech_to_text_ctc.py \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_results
-          rm -rf examples/asr/speech_to_text_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_ctc/speech_to_text_ctc.py \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_results

   ASR_dev_run_Speech_to_Text_WPE_-_CitriNet:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
-          --config-path="../conf/citrinet/" --config-name="config_bpe" \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
-          model.tokenizer.type="wpe" \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results
-          rm -rf examples/asr/speech_to_text_wpe_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
+        --config-path="../conf/citrinet/" --config-name="config_bpe" \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
+        model.tokenizer.type="wpe" \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_wpe_results

   ASR_dev_run_Speech_Pre-training_-_CitriNet:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/speech_pretraining/speech_pre_training.py \
-          --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_pre_training_results
-          rm -rf examples/asr/speech_pre_training_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/speech_pretraining/speech_pre_training.py \
+        --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_pre_training_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_pre_training_results

   ASR_dev_run_Speech_To_Text_Finetuning:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/speech_to_text_finetune.py \
-          --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
-          model.tokenizer.update_tokenizer=False \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_finetuning_results
-          rm -rf examples/asr/speech_finetuning_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/speech_to_text_finetune.py \
+        --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
+        model.tokenizer.update_tokenizer=False \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_finetuning_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_finetuning_results

   OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/speech_to_text_finetune.py \
-          --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
-          ~model.train_ds.hf_data_cfg \
-          model.train_ds.num_workers=1 \
-          model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
-          model.train_ds.streaming=true \
-          +model.train_ds.hf_data_cfg.path="librispeech_asr" \
-          +model.train_ds.hf_data_cfg.name=null \
-          +model.train_ds.hf_data_cfg.split="test.clean" \
-          +model.train_ds.hf_data_cfg.streaming=true \
-          ~model.validation_ds.hf_data_cfg \
-          model.validation_ds.streaming=true \
-          +model.validation_ds.hf_data_cfg.path="librispeech_asr" \
-          +model.validation_ds.hf_data_cfg.name=null \
-          +model.validation_ds.hf_data_cfg.split="test.clean" \
-          +model.validation_ds.hf_data_cfg.streaming=true \
-          ~model.test_ds \
-          init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
-          model.tokenizer.update_tokenizer=False \
-          model.optim.sched.warmup_steps=0 \
-          +model.optim.sched.max_steps=3 \
-          trainer.max_epochs=null \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_finetuning_results
-          rm -rf examples/asr/speech_finetuning_results
-    #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-    #  if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |-
+        python examples/asr/speech_to_text_finetune.py \
+        --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
+        ~model.train_ds.hf_data_cfg \
+        model.train_ds.num_workers=1 \
+        model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
+        model.train_ds.streaming=true \
+        +model.train_ds.hf_data_cfg.path="librispeech_asr" \
+        +model.train_ds.hf_data_cfg.name=null \
+        +model.train_ds.hf_data_cfg.split="test.clean" \
+        +model.train_ds.hf_data_cfg.streaming=true \
+        ~model.validation_ds.hf_data_cfg \
+        model.validation_ds.streaming=true \
+        +model.validation_ds.hf_data_cfg.path="librispeech_asr" \
+        +model.validation_ds.hf_data_cfg.name=null \
+        +model.validation_ds.hf_data_cfg.split="test.clean" \
+        +model.validation_ds.hf_data_cfg.streaming=true \
+        ~model.test_ds \
+        init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
+        model.tokenizer.update_tokenizer=False \
+        model.optim.sched.warmup_steps=0 \
+        +model.optim.sched.max_steps=3 \
+        trainer.max_epochs=null \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_finetuning_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_finetuning_results
+      IS_OPTIONAL: true

   ASR_dev_run_Speech_to_Text_WPE_-_Conformer:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
-          --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
-          model.tokenizer.type="wpe" \
-          model.train_ds.batch_size=4 \
-          model.validation_ds.batch_size=4 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results
-          rm -rf examples/asr/speech_to_text_wpe_conformer_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
+        --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
+        model.tokenizer.type="wpe" \
+        model.train_ds.batch_size=4 \
+        model.validation_ds.batch_size=4 \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_wpe_conformer_results

   # L2: ASR dev run - part two
   ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
-          --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
-          model.tokenizer.type="wpe" \
-          model.encoder.d_model=144 \
-          model.train_ds.batch_size=4 \
-          model.validation_ds.batch_size=4 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results
-          rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + model.encoder.d_model=144 \ + model.train_ds.batch_size=4 \ + model.validation_ds.batch_size=4 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results L2_Speech_to_Text_EMA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=2 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + +exp_manager.ema.enable=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results + # L2_Speech_to_Text_AED: # needs: [cicd-test-container-setup] @@ -758,530 +509,315 @@ jobs: # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results - rm -rf 
-          rm -rf examples/speaker_tasks/recognition/speaker_recognition_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/speaker_tasks/recognition/speaker_reco.py \
+        model.train_ds.batch_size=10 \
+        model.validation_ds.batch_size=2 \
+        model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \
+        model.decoder.num_classes=2 \
+        trainer.max_epochs=10 \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results
+      AFTER_SCRIPT: |
+        rm -rf examples/speaker_tasks/recognition/speaker_recognition_results

   L2_Speaker_dev_run_Speaker_Diarization:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \
-          model.diarizer.speaker_embeddings.model_path=titanet_large \
-          model.train_ds.batch_size=5 \
-          model.validation_ds.batch_size=5 \
-          model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
-          model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
-          model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results
-          rm -rf examples/speaker_tasks/diarization/speaker_diarization_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \
+        model.diarizer.speaker_embeddings.model_path=titanet_large \
+        model.train_ds.batch_size=5 \
+        model.validation_ds.batch_size=5 \
+        model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
+        model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \
+        model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results
+      AFTER_SCRIPT: |
+        rm -rf examples/speaker_tasks/diarization/speaker_diarization_results

   L2_Speaker_dev_run_Speech_to_Label:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/speech_classification/speech_to_label.py \
-          model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
-          model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
-          model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
-          ~model.preprocessor.window_size \
-          ~model.preprocessor.window_stride \
-          ~model.preprocessor.window \
-          ~model.preprocessor.n_mels \
-          ~model.preprocessor.n_mfcc \
-          ~model.preprocessor.n_fft \
-          exp_manager.exp_dir=examples/asr/speech_to_label_results
-          rm -rf examples/asr/speech_to_label_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/speech_classification/speech_to_label.py \
+        model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
+        model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
+        model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
+        ~model.preprocessor.window_size \
+        ~model.preprocessor.window_stride \
+        ~model.preprocessor.window \
+        ~model.preprocessor.n_mels \
+        ~model.preprocessor.n_mfcc \
+        ~model.preprocessor.n_fft \
+        exp_manager.exp_dir=examples/asr/speech_to_label_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_label_results

   L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \
-          diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
-          diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
-          diarizer.speaker_embeddings.parameters.save_embeddings=True \
-          diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \
-          diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \
-          diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \
-          diarizer.asr.model_path=QuartzNet15x5Base-En \
-          diarizer.asr.parameters.asr_based_vad=True \
-          diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results
-          rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \
+        diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
+        diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
+        diarizer.speaker_embeddings.parameters.save_embeddings=True \
+        diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \
+        diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \
+        diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \
+        diarizer.asr.model_path=QuartzNet15x5Base-En \
+        diarizer.asr.parameters.asr_based_vad=True \
+        diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results
+      AFTER_SCRIPT: |
+        rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results

   L2_Speaker_dev_run_Clustering_Diarizer_Inference:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \
-          diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
-          diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
-          diarizer.speaker_embeddings.parameters.save_embeddings=True \
-          diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \
-          diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \
-          diarizer.speaker_embeddings.parameters.multiscale_weights=null \
-          diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
-          diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results
-          rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \
+        diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
+        diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \
+        diarizer.speaker_embeddings.parameters.save_embeddings=True \
+        diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \
+        diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \
+        diarizer.speaker_embeddings.parameters.multiscale_weights=null \
+        diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
+        diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results
+      AFTER_SCRIPT: |
+        rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results

   L2_Speaker_dev_run_Neural_Diarizer_Inference:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \
-          diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
-          diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \
-          diarizer.speaker_embeddings.parameters.save_embeddings=True \
-          diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
-          diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results
-          rm -rf examples/speaker_tasks/diarization/neural_diarizer_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \
+        diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \
+        diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \
+        diarizer.speaker_embeddings.parameters.save_embeddings=True \
+        diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \
+        diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results
+      AFTER_SCRIPT: |
+        rm -rf examples/speaker_tasks/diarization/neural_diarizer_results

   L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python tools/speech_data_simulator/multispeaker_simulator.py \
-          --config-path=conf --config-name=data_simulator.yaml \
-          data_simulator.random_seed=42 \
-          data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \
-          data_simulator.outputs.output_dir=./test_simulator \
-          data_simulator.session_config.num_sessions=2 \
-          data_simulator.session_config.session_length=60
-          rm -rf ./test_simulator
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python tools/speech_data_simulator/multispeaker_simulator.py \
+        --config-path=conf --config-name=data_simulator.yaml \
+        data_simulator.random_seed=42 \
+        data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \
+        data_simulator.outputs.output_dir=./test_simulator \
+        data_simulator.session_config.num_sessions=2 \
+        data_simulator.session_config.session_length=60
+      AFTER_SCRIPT: |
+        rm -rf ./test_simulator

   # L2: ASR Multi-dataloader dev run
   L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_ctc/speech_to_text_ctc.py \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          +trainer.num_sanity_val_steps=1 \
-          exp_manager.exp_dir=examples/asr/speech_to_text_results
-          rm -rf examples/asr/speech_to_text_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_ctc/speech_to_text_ctc.py \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        trainer.max_epochs=1 \
+        trainer.max_steps=1 \
+        +trainer.num_sanity_val_steps=1 \
+        exp_manager.exp_dir=examples/asr/speech_to_text_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_results

   L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/speech_classification/speech_to_label.py \
-          model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
-          model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          +trainer.num_sanity_val_steps=1 \
-          model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
-          ~model.preprocessor.window_size \
-          ~model.preprocessor.window_stride \
-          ~model.preprocessor.window \
-          ~model.preprocessor.n_mels \
-          ~model.preprocessor.n_mfcc \
-          ~model.preprocessor.n_fft \
-          exp_manager.exp_dir=examples/asr/speech_to_label_results
-          rm -rf examples/asr/speech_to_label_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/speech_classification/speech_to_label.py \
+        model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \
+        model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        trainer.max_epochs=1 \
+        trainer.max_steps=1 \
+        +trainer.num_sanity_val_steps=1 \
+        model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \
+        ~model.preprocessor.window_size \
+        ~model.preprocessor.window_stride \
+        ~model.preprocessor.window \
+        ~model.preprocessor.n_mels \
+        ~model.preprocessor.n_mfcc \
+        ~model.preprocessor.n_fft \
+        exp_manager.exp_dir=examples/asr/speech_to_label_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_label_results

   # L2: ASR Adapters
   L2_ASR_Adapters_Linear_Adapters:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_adapters/train_asr_adapter.py \
-          model.pretrained_model="stt_en_conformer_ctc_small" \
-          model.adapter.adapter_name="an4" \
-          model.adapter.linear.in_features=176 \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          trainer.max_steps=5 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results
-          rm -rf examples/asr/speech_to_text_adapters_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_adapters/train_asr_adapter.py \
+        model.pretrained_model="stt_en_conformer_ctc_small" \
+        model.adapter.adapter_name="an4" \
+        model.adapter.linear.in_features=176 \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        trainer.max_steps=5 \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_adapters_results

   L2_ASR_Adapters_RelPos_MHA_Adapters:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/asr_adapters/train_asr_adapter.py \
-          model.pretrained_model="stt_en_conformer_ctc_small" \
-          model.adapter.adapter_name="encoder:an4" \
-          model.adapter.adapter_type="tiny_attn" \
-          model.adapter.tiny_attn.n_feat=176 \
-          model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-          model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-          trainer.max_steps=5 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=True \
-          exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results
-          rm -rf examples/asr/speech_to_text_adapters_mha_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/asr/asr_adapters/train_asr_adapter.py \
+        model.pretrained_model="stt_en_conformer_ctc_small" \
+        model.adapter.adapter_name="encoder:an4" \
+        model.adapter.adapter_type="tiny_attn" \
+        model.adapter.tiny_attn.n_feat=176 \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        trainer.max_steps=5 \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results
+      AFTER_SCRIPT: |
+        rm -rf examples/asr/speech_to_text_adapters_mha_results

   # L2: Speech Transcription
   L2_Speech_Transcription_Speech_to_Text_Transcribe:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/asr/transcribe_speech.py \
-          pretrained_name="QuartzNet15x5Base-En" \
-          audio_dir="/home/TestData/an4_transcribe/test_subset/" \
-          output_filename="stt_test_res.json" \
-          amp=true
-          rm -rf stt_test_res.json
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/asr/transcribe_speech.py \
+        pretrained_name="QuartzNet15x5Base-En" \
+        audio_dir="/home/TestData/an4_transcribe/test_subset/" \
+        output_filename="stt_test_res.json" \
+        amp=true
+      AFTER_SCRIPT: |
+        rm -rf stt_test_res.json

   # L2: Transducer alignment
   L2_Transducer_alignment_Running_pytest:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1

   # L2: Segmentation Tool
   L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd tools/ctc_segmentation && \
-          TIME=`date +"%Y-%m-%d-%T"` && \
-          /bin/bash run_segmentation.sh \
-          --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \
-          --DATA_DIR=/home/TestData/ctc_segmentation/eng \
-          --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \
-          --LANGUAGE=en \
-          --USE_NEMO_NORMALIZATION="TRUE" && \
-          python /home/TestData/ctc_segmentation/verify_alignment.py \
-          -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \
-          -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \
-          rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        cd tools/ctc_segmentation && \
+        TIME=`date +"%Y-%m-%d-%T"` && \
+        /bin/bash run_segmentation.sh \
+        --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \
+        --DATA_DIR=/home/TestData/ctc_segmentation/eng \
+        --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \
+        --LANGUAGE=en \
+        --USE_NEMO_NORMALIZATION="TRUE" && \
+        python /home/TestData/ctc_segmentation/verify_alignment.py \
+        -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \
+        -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt;
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}

   L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd tools/ctc_segmentation && \
-          TIME=`date +"%Y-%m-%d-%T"` && \
-          /bin/bash run_segmentation.sh \
-          --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
-          --DATA_DIR=/home/TestData/ctc_segmentation/ru \
-          --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
-          --LANGUAGE=ru \
-          --ADDITIONAL_SPLIT_SYMBOLS=";" && \
-          python /home/TestData/ctc_segmentation/verify_alignment.py \
-          -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
-          -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt && \
-          rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        cd tools/ctc_segmentation && \
+        TIME=`date +"%Y-%m-%d-%T"` && \
+        /bin/bash run_segmentation.sh \
+        --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
+        --DATA_DIR=/home/TestData/ctc_segmentation/ru \
+        --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
+        --LANGUAGE=ru \
+        --ADDITIONAL_SPLIT_SYMBOLS=";" && \
+        python /home/TestData/ctc_segmentation/verify_alignment.py \
+        -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
+        -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt;
+
+        rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}

   # L2: G2P Models
   L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/tts/g2p && \
-          TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
-          python g2p_train_and_evaluate.py \
-          train_manifest=/home/TestData/g2p/g2p.json \
-          validation_manifest=/home/TestData/g2p/g2p.json \
-          model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
-          model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \
-          trainer.max_epochs=1 \
-          model.max_source_len=64 \
-          trainer.devices=1 \
-          do_training=True \
-          do_testing=True \
-          exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \
-          +exp_manager.use_datetime_version=False\
-          +exp_manager.version=test \
-          --config-name=g2p_conformer_ctc && \
-          python g2p_inference.py \
# L2: G2P Models L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=1 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/tts/g2p && \ + TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ + python g2p_train_and_evaluate.py \ + train_manifest=/home/TestData/g2p/g2p.json \ + validation_manifest=/home/TestData/g2p/g2p.json \ + model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ + model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ + trainer.max_epochs=1 \ + model.max_source_len=64 \ + trainer.devices=1 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ + +exp_manager.use_datetime_version=False \ + +exp_manager.version=test \ + --config-name=g2p_conformer_ctc && \ + python g2p_inference.py \ + pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ + manifest_filepath=/home/TestData/g2p/g2p.json \ + phoneme_field=text # TODO: pleasefixme @redoctopus # - name: ByT5G2P training, evaluation and inference @@ -1311,43 +847,28 @@ jobs: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/tts/g2p && \ + TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ + python g2p_heteronym_classification_train_and_evaluate.py \ + train_manifest=/home/TestData/g2p/manifest.json \ + validation_manifest=/home/TestData/g2p/manifest.json \ + test_manifest=/home/TestData/g2p/manifest.json \ + model.wordids=/home/TestData/g2p/wordids.tsv \ + trainer.max_epochs=1 \ + model.max_seq_length=64 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR} \ + +exp_manager.use_datetime_version=False \ + +exp_manager.version=test && \ + python g2p_heteronym_classification_inference.py \ + manifest=/home/TestData/g2p/manifest.json \ +
pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ + output_manifest=preds.json # L2: Dialogue Classification @@ -1395,328 +916,217 @@ jobs: L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ + model.dataset.task_name=debug_sample \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.dataset.num_tasks=6 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-cased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_outputs L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ + model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ + model.dataset.task=assistant \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_intent_classification_outputs L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ + model.dataset.task=zero_shot \ + model.dataset.prompt_template="This example is" \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume 
/mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=megatron \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python 
dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_bart_outputs L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_dialogue_nearest_neighbour_classification_outputs # L2: Dialogue Generation L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - 
model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender_s2s \ + model.dataset.task=ms_marco \ + model.library=huggingface \ + model.dataset.debug_mode=True \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender_s2s L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ + model.dataset.task_name=debug_sample \ + model.dataset.task=sgd_generation \ + model.dataset.input_field=utterance+system_actions \ + model.dataset.output_field=system_utterance \ + model.dataset.use_cache=false \ + model.dataset.system_utterance=next_turn \ + model.dataset.debug_mode=True \ + model.dataset.prompt_template=slots_values \ + model.library=huggingface \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + 
trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null + AFTER_SCRIPT: | + # after_script runs from the workspace root, not examples/nlp/dialogue + rm -rf examples/nlp/dialogue/sgd_answer_extender_s2s # - name: L2: Dialogue Generation Part 2 # when { @@ -1752,82 +1162,54 @@ jobs: # L2: COPY L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender \ + model.library=huggingface \ + model.dataset.task=ms_marco \ + model.dataset.debug_mode=True \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=gpt2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses:
./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/duplex_text_normalization && \ + python duplex_text_normalization_train.py \ + data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ + mode=tn \ + lang=en \ + tagger_model.do_training=false \ + decoder_model.transformer=t5-small \ + data.validation_ds.batch_size=2 \ + data.train_ds.use_cache=false \ + data.validation_ds.use_cache=false \ + data.test_ds.batch_size=2 \ + data.train_ds.decoder_data_augmentation=false \ + data.train_ds.num_workers=2 \ + decoder_trainer.devices=[0,1] \ + decoder_trainer.accelerator="gpu" \ + data.train_ds.use_tarred_dataset=true \ + +decoder_trainer.fast_dev_run=true \ + decoder_exp_manager.create_checkpoint_callback=false \ + data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ + data.test_ds.use_cache=false \ + data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv # Runs out of memory on the 12G TITAN V (GPU 0 on main CI) # TODO: add when megatron bert is supported again in NeMo @@ -1860,345 +1242,221 @@ jobs: # L2: BERT Text Classification L2_BERT_Text_Classification_with_BERT_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/text_classification && \ + python text_classification_with_bert.py \ + model.dataset.num_classes=6 \ + model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ + model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.train_ds.batch_size=10 \ + model.dataset.max_seq_length=50 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run 
because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.train_ds.batch_size=2 \ + 
model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BART_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BART_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - 
model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + 
model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - rm -rf checkpoints - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/retail \ + 
model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints + AFTER_SCRIPT: | + # cleanup runs from the workspace root, so use the full path + rm -rf examples/nlp/intent_slot_classification/checkpoints L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - rm -rf checkpoints2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python multi_label_intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/new_multiatis \ + model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints2 + AFTER_SCRIPT: | + rm -rf examples/nlp/intent_slot_classification/checkpoints2 # TODO: add when megatron-bert is supported again # stage('L2: Model Parallel Size 2 Megatron Text Classification') { @@ -2309,350 +1567,246 @@ jobs: # L2: Parallel NLP Examples 2 L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + pretrained_model=ner_en_bert \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.train_ds.batch_size=2 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.class_balancing="weighted_loss" \ + exp_manager.exp_dir=null L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 -
--device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + pretrained_model=punctuation_en_bert \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=null; + + rm -rf "${data_dir}" L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ + exp_manager.exp_dir=null + L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env 
HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_evaluate.py \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.dataset.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + model.test_ds.ds_item="${data_dir}" \ + ~model.train_ds \ + ~model.validation_ds \ + +model.test_ds.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; + + rm -rf "${data_dir}" + L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - 
model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir}" \ + model.validation_ds.ds_item="${tmp_data_dir}" \ + model.test_ds.ds_item="${tmp_data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=true && \ + tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ + mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ + rm -rf "${tmp_data_dir}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir_2}" \ + model.validation_ds.ds_item="${tmp_data_dir_2}" \ + model.test_ds.ds_item="${tmp_data_dir_2}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + exp_manager=null; + + rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ + "${tmp_data_dir_2}" \ + "${output_dir}" # Punctuation & Capitalization tarred dataset: Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - 
data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ + /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ + "${data_dir}"/ && \ + usual_data=${data_dir}/wmt_wiki_10000 && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tarred_data=${output_dir}/train_tarred && \ + tokens_in_batch=2000 && \ + max_seq_length=512 && \ + lm_model=distilbert-base-uncased && \ + python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ + --text ${usual_data}/input.txt \ + --labels ${usual_data}/labels.txt \ + --output_dir ${tarred_data} \ + --tokens_in_batch ${tokens_in_batch} \ + --max_seq_length 512 \ + --lines_per_dataset_fragment 2000 \ + --num_batches_per_tarfile 5 \ + --tar_file_prefix punctuation_capitalization \ + --tokenizer_name ${lm_model} \ + --use_fast_tokenizer \ + --pad_label O \ + --n_jobs 3 && \ + echo "Number of tarred files in dataset:" && \ + ls ${tarred_data}/*.tar | wc -l && \ + echo "Label id files in dataset:" && \ + ls ${tarred_data}/*.csv && \ + metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.train_ds.ds_item=${tarred_data} \ + 
model.language_model.pretrained_model_name=${lm_model} \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.tar_metadata_file=${metadata_file} \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir=${output_dir}/output; + + rm -rf "${output_dir}" "${data_dir}" # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + work_dir="$(mktemp -d -p "$(pwd)")" && \ + label_vocab_dir="${work_dir}/labels" && \ + mkdir -p ${label_vocab_dir} && \ + data_dir="${work_dir}/data" && \ + mkdir -p "${data_dir}" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ + output_dir="${work_dir}/output" && \ + mkdir -p "${output_dir}" && \ + punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ + 
capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ + printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ + printf "O\nU\n" > "${capit_label_vocab}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ + model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ + model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=false && \ + python punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + ~model.train_ds \ + ~model.validation_ds \ + model.test_ds.ds_item="${data_dir}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + exp_manager=null && \ + rm -rf "${work_dir}" + # TODO: pleasefixme # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: # needs: [cicd-test-container-setup] @@ -2719,683 +1873,501 @@ jobs: # Punctuation & Capitalization inference Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + output_dir="$(mktemp -d -p "$(pwd)")" && \ + python examples/nlp/token_classification/punctuate_capitalize_infer.py \ + --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ + --output_text "${output_dir}/iwslt_inference_result.txt" \ + --max_seq_length 92 \ + --step 8 \ + --margin 16 \ + --pretrained_name punctuation_en_bert \ + --batch_size 32; + rm -rf "${output_dir}" + # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - 
--gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - - rm -f /home/TestData/nlp/wikitext-2/*.pkl - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + --config-name=bert_pretraining_from_text_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=true \ + model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ + model.train_ds.batch_size=32 \ + model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ + model.validation_ds.batch_size=32 \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ + model.optim.lr=0.01 \ + model.optim.sched.warmup_ratio=0.1 \ + model.tokenizer.tokenizer_name=sentencepiece \ + model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ + model.mask_prob=0.15 \ + model.short_seq_prob=0.1 \ + exp_manager.exp_dir=PretrainingBERTFromText; + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/wikitext-2/*.pkl + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText L2_Pretraining_BERT_from_Preprocessed: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - 
model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + --config-name=bert_pretraining_from_preprocessed_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=false \ + +trainer.max_epochs=1 \ + +trainer.limit_val_batches=0 \ + +trainer.limit_train_batches=1 \ + model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ + model.train_ds.batch_size=8 \ + model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ + model.optim.lr=0.875e-4 \ + model.optim.weight_decay=0.01 \ + model.optim.sched.warmup_ratio=0.01 \ + exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ + exp_manager.create_checkpoint_callback=False + + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" # L2: Entity Linking L2_Entity_Linking_Self_Alignment_Pretraining_BERT: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. 
\ + trainer.val_check_interval=3 \ + model.raw_data=None \ + model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ + model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ + model.train_ds.batch_size=8 \ + model.validation_ds.batch_size=8 \ + exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container # L2: NMT Attention is All You Need Training L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - 
+trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True - - rm -rf examples/nlp/machine_translation/nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=false \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=2 \ + +trainer.limit_val_batches=1 \ + +trainer.max_steps=2 \ + trainer.precision=16 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true + + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true \ + +exp_manager.resume_if_exists=True + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_results L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - 
--shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.pre_ln=true \ + model.decoder.pre_ln=true \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - 
model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ + model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null # L2: NMT Attention is All You Need Inference L2_NMT_Attention_is_All_You_Need_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python nmt_transformer_infer.py \ + --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ + --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ + --target_lang en \ + --source_lang de # L2: NMT Attention is All You Need Finetuning L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus 
all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=1 \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true - - rm -rf examples/nlp/machine_translation/nmt_finetune - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt_finetune.py \ + model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + trainer.devices=1 \ + ~trainer.max_epochs \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ + +exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ + +exp_manager.checkpoint_callback_params.mode=max \ + +exp_manager.checkpoint_callback_params.save_best_model=true + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_finetune # L2: NMT Tarred Dataset Creation L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 
\ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_training=false \ + model.preproc_out_dir=$PWD/preproc_out_dir \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.n_preproc_jobs=2 \ + model.train_ds.lines_per_dataset_fragment=500 \ + model.train_ds.num_batches_per_tarfile=10 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.vocab_size=2000 \ + model.decoder_tokenizer.vocab_size=2000 \ + ~model.test_ds \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python create_tarred_parallel_dataset.py \ + --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + --out_dir $PWD/out_dir \ + --encoder_tokenizer_vocab_size=2000 \ + --decoder_tokenizer_vocab_size=2000 \ + --tokens_in_batch=1000 \ + --lines_per_dataset_fragment=500 \ + --num_batches_per_tarfile=10 \ + --n_preproc_jobs=2 L2_Megatron_NMT_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g 
- --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - 
model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - rm -rf examples/nlp/machine_translation/megatron_nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/machine_translation/megatron_nmt_training.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + +trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.train_ds.num_workers=1 \ + model.validation_ds.num_workers=1 \ + ~model.test_ds \ + model.train_ds.dataset_type=text_memmap \ + model.encoder_tokenizer.library=sentencepiece \ + model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + model.decoder_tokenizer.library=sentencepiece \ + model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model + # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error + # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() + 
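# Resume run: the command below reuses the exp_dir populated by the first run, with val_check_interval=1 as explained in the note above. + 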
python examples/nlp/machine_translation/megatron_nmt_training.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.train_ds.num_workers=1 \ + model.validation_ds.num_workers=1 \ + ~model.test_ds \ + model.train_ds.dataset_type=text_memmap \ + model.encoder_tokenizer.library=sentencepiece \ + model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + model.decoder_tokenizer.library=sentencepiece \ + model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/megatron_nmt_results L2_Megatron_BART_Perceiver_MIM_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - 
model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - rm -rf examples/nlp/language_modeling/megatron_mim_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + 
trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.arch=perceiver \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.data.data_impl=text_mmap \ + model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ + model.data.splits_string='"800,100,100"' \ + model.data.whole_word_masking=False \ + model.tokenizer.library=sentencepiece \ + model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + ++model.hiddens.enc_output_name=z \ + ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ + ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ + ++model.hiddens.loss.mim.cls_name=a_mim \ + ++model.hiddens.loss.mim.loss_weight=0.5 + # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error + # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.arch=perceiver \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.data.data_impl=text_mmap \ + model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ + model.data.splits_string='"800,100,100"' \ + model.data.whole_word_masking=False \ + model.tokenizer.library=sentencepiece \ + model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + ++model.hiddens.enc_output_name=z \ + 
++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ + ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ + ++model.hiddens.loss.mim.cls_name=a_mim \ + ++model.hiddens.loss.mim.loss_weight=0.5 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/megatron_mim_results # stage('L2: NMT Bottleneck Fallback') { # when { @@ -3608,372 +2580,304 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf 
examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - 
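# Editor's note on the recurring activation-checkpointing overrides (my
# reading of the NeMo/Megatron options, not something this diff defines):
#
#   model.activations_checkpoint_method=block        # checkpoint a fixed block of leading layers
#   model.activations_checkpoint_num_layers=1        # how many layers that block covers
#   model.activations_checkpoint_granularity=full    # recompute whole layers (the GPT jobs below add this)
#
# The CI models are tiny (8 layers, hidden size 256), so these flags are here
# to exercise the recompute path rather than to save memory.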
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + 
trainer.max_steps=20 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - - rm -rf examples/nlp/language_modeling/mcore_retro_results - - uses: 
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=10 + + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=20 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/mcore_retro_results L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True + uses: 
./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.accumulate_grad_batches=1 \ + trainer.limit_val_batches=2 \ + exp_manager.resume_if_exists=True \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ + model.data.data_prefix= \ + model.data.knn_index= \ + model.data.retrieval_prefix= \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.chunk_size=32 \ + model.enc_num_layers=2 \ + model.dec_num_layers=2 \ + model.enc_cross_attention=[1] \ + model.dec_cross_attention=[1] \ + +model.data.mock=True python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ @@ -4005,10 +2909,8 @@ jobs: model.enc_cross_attention=[1] \ model.dec_cross_attention=[1] \ +model.data.mock=True - - rm -rf examples/nlp/language_modeling/retro_legacy_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/retro_legacy_results # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: # needs: [cicd-test-container-setup] @@ -4106,224 +3008,183 @@ jobs: L2_BioMegatron_Bert_NER_Task: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - rm -rf examples/nlp/language_modeling/token_classification_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_train.py \ + exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ + trainer.max_epochs=1 \ + model.dataset.data_dir=/home/TestData/nlp/ner \ + model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ + model.tokenizer.tokenizer_name=null + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/token_classification_results L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume 
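# Editor's note: every job converted in this PR reduces to the same shape, so
# adding a new test means filling in the template inputs. A minimal sketch
# (the job name and script below are hypothetical, not part of this change):
#
#   L2_My_New_Test:
#     needs: [cicd-test-container-setup]
#     uses: ./.github/workflows/_test_template.yml
#     with:
#       RUNNER: self-hosted-azure
#       SCRIPT: |
#         python -m tests.my_new_test
#       AFTER_SCRIPT: |
#         rm -rf /tmp/my_new_test_outputs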
/mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + 
trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - 
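# Editor's note: the paired invocations in the TP2 job above are the recurring
# pretrain-then-resume check. Run 1 trains to trainer.max_steps=3 and leaves
# checkpoints under exp_manager.exp_dir; run 2 repeats the config and should
# restore the step-3 checkpoint instead of starting over. Only these overrides
# differ between the two runs:
#
#   trainer.max_steps=6                 # continue past the first run's 3 steps
#   exp_manager.resume_if_exists=True   # pick up the checkpoint left in exp_dir
#   model.optim.sched.warmup_steps=2    # was 1; presumably exercises scheduler restore
#   model.optim.sched.constant_steps=2  # was 1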
steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: 
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.precision=16 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -4419,683 +3280,529 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - 
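# Editor's note: the rope job above and the alibi/kerple jobs in this hunk
# differ from the base TP2 pretraining test almost solely in one override, so
# each positional-encoding path gets its own smoke test:
#
#   model.position_embedding_type=rope    # plus model.rotary_percentage=0.5
#   model.position_embedding_type=alibi
#   model.position_embedding_type=kerple
#
# My reading (not stated in this diff): rotary_percentage=0.5 applies the
# rotary embedding to half of each attention head's dimensions.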
#model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + 
#model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - 
#model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + 
#model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - 
model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + 
exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - #@athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and test file until we have support for multiple dataloaders in lightning 2.0 L2_Megatron_GPT_Finetuning_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - 
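# Editor's note: across these conversions the trailing rm -rf lines move out
# of the test body into the template's AFTER_SCRIPT input, separating cleanup
# from the commands under test (and, presumably, letting cleanup run even when
# the main script fails, which the old inline form did not guarantee). The
# typical pairing:
#
#   SCRIPT: |
#     python examples/nlp/language_modeling/megatron_gpt_pretraining.py ...
#   AFTER_SCRIPT: |
#     rm -rf examples/nlp/language_modeling/gpt_pretrain_results
#     rm -rf examples/nlp/language_modeling/gpt_index_mappings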
trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=32 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + 
model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - - python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ - model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] - - rm -rf /home/TestData/nlp/megatron_ir/working_dir - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + 
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir L2_Megatron_GPT_PEFT_Lora_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.peft.peft_scheme=lora \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 L2_Megatron_GPT_PEFT_Lora_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' - - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ 
+ model.peft.peft_scheme='lora' \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 L2_Megatron_GPT_Eval: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ + prompts=['How to fix GPU memory? A:'] \
+ tensor_model_parallel_size=1 \ + inference.tokens_to_generate=32 \ + trainer.precision=32 L2_Megatron_GPT_Eval_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + server=False \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=2 \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.precision=32 L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ - model.data.test_ds.names=[test] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=examples/nlp/language_modeling/out.jsonl && \ - rm -rf examples/nlp/language_modeling/out.jsonl - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ + model.peft.restore_from_path=null \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ + model.data.test_ds.names=[test] \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=30 \ + model.data.test_ds.max_seq_length=6000 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ +
inference.outfile_path=examples/nlp/language_modeling/out.jsonl + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/out.jsonl # TODO: Add this test back. Test was failing on CI machines due to HW error # - name: L2: Megatron GPT Convert from Megatron-LM checkpoint and Eval @@ -5127,1149 +3834,883 @@ jobs: # L2_Megatron_Change_Partitions L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 2 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 2 + AFTER_SCRIPT: | + rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 4 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 1 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 4 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 1 + AFTER_SCRIPT: | + rm 
/home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - 
model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + 
model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - 
+model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + 
model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results 
\ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: 
./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + 
+model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf 
examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
-        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=10 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=10 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
-        model.pipeline_model_parallel_split_rank=1 \
-        model.seq_length=256 \
-        model.encoder.num_layers=4 \
-        model.decoder.num_layers=1 \
-        model.encoder.num_moe_experts=4 \
-        model.decoder.num_moe_experts=4 \
-        model.encoder.moe_frequency=3 \
-        model.decoder.moe_frequency=1 \
-        model.encoder.hidden_size=64 \
-        model.decoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.ffn_hidden_size=2048 \
-        model.encoder.activation=gelu \
-        model.encoder.activations_checkpoint_method=block \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.encoder.transformer_block_type=pre_ln \
-        model.decoder.transformer_block_type=post_ln \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
-        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
-
-        rm -rf examples/nlp/language_modeling/t5_pretrain_results
-        rm -rf examples/nlp/language_modeling/t5_index_mappings
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+        trainer.devices=2 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=10 \
+        trainer.limit_val_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=10 \
+        trainer.precision=16 \
+        trainer.gradient_clip_val=1.0 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+        model.pipeline_model_parallel_split_rank=1 \
+        model.seq_length=256 \
+        model.encoder.num_layers=4 \
+        model.decoder.num_layers=1 \
+        model.encoder.num_moe_experts=4 \
+        model.decoder.num_moe_experts=4 \
+        model.encoder.moe_frequency=3 \
+        model.decoder.moe_frequency=1 \
+        model.encoder.hidden_size=64 \
+        model.decoder.hidden_size=64 \
+        model.encoder.num_attention_heads=8 \
+        model.decoder.num_attention_heads=8 \
+        model.decoder.ffn_hidden_size=2048 \
+        model.encoder.activation=gelu \
+        model.encoder.activations_checkpoint_method=block \
+        model.encoder.activations_checkpoint_num_layers=1 \
+        model.encoder.transformer_block_type=pre_ln \
+        model.decoder.transformer_block_type=post_ln \
+        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
+        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/t5_pretrain_results
+        rm -rf examples/nlp/language_modeling/t5_index_mappings

   L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=10 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=10 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
-        model.tensor_model_parallel_size=2 \
-        model.seq_length=128 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation=swiglu \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method=block \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.encoder.transformer_block_type=normformer \
-        model.encoder.headscale=True \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation=geglu \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method=block \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.decoder.transformer_block_type=normformer \
-        model.decoder.headscale=False \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
-        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
-
-        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=1 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=10 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
-        exp_manager.resume_if_exists=True \
-        model.tensor_model_parallel_size=2 \
-        model.seq_length=128 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation=swiglu \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method=block \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.encoder.transformer_block_type=normformer \
-        model.encoder.headscale=True \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation=geglu \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method=block \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.decoder.transformer_block_type=normformer \
-        model.decoder.headscale=False \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
-        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
-
-        rm -rf examples/nlp/language_modeling/t5_pretrain_results
-        rm -rf examples/nlp/language_modeling/t5_index_mappings
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
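+        # Two runs on purpose: the second restarts from the first run's
+        # checkpoints (exp_manager.resume_if_exists=True) to exercise resume.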
+        python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \
+        trainer.devices=2 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=10 \
+        trainer.limit_val_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=10 \
+        trainer.precision=16 \
+        trainer.gradient_clip_val=1.0 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+        model.tensor_model_parallel_size=2 \
+        model.seq_length=128 \
+        model.encoder.num_layers=4 \
+        model.encoder.hidden_size=64 \
+        model.encoder.num_attention_heads=8 \
+        model.encoder.activation=swiglu \
+        model.encoder.bias_activation_fusion=False \
+        model.encoder.activations_checkpoint_method=block \
+        model.encoder.activations_checkpoint_num_layers=1 \
+        model.encoder.transformer_block_type=normformer \
+        model.encoder.headscale=True \
+        model.decoder.num_layers=4 \
+        model.decoder.hidden_size=64 \
+        model.decoder.num_attention_heads=8 \
+        model.decoder.activation=geglu \
+        model.decoder.bias_activation_fusion=False \
+        model.decoder.activations_checkpoint_method=block \
+        model.decoder.activations_checkpoint_num_layers=1 \
+        model.decoder.transformer_block_type=normformer \
+        model.decoder.headscale=False \
+        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
+        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
+
+        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+        trainer.devices=2 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=1 \
+        trainer.limit_val_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=10 \
+        trainer.precision=16 \
+        trainer.gradient_clip_val=1.0 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+        exp_manager.resume_if_exists=True \
+        model.tensor_model_parallel_size=2 \
+        model.seq_length=128 \
+        model.encoder.num_layers=4 \
+        model.encoder.hidden_size=64 \
+        model.encoder.num_attention_heads=8 \
+        model.encoder.activation=swiglu \
+        model.encoder.bias_activation_fusion=False \
+        model.encoder.activations_checkpoint_method=block \
+        model.encoder.activations_checkpoint_num_layers=1 \
+        model.encoder.transformer_block_type=normformer \
+        model.encoder.headscale=True \
+        model.decoder.num_layers=4 \
+        model.decoder.hidden_size=64 \
+        model.decoder.num_attention_heads=8 \
+        model.decoder.activation=geglu \
+        model.decoder.bias_activation_fusion=False \
+        model.decoder.activations_checkpoint_method=block \
+        model.decoder.activations_checkpoint_num_layers=1 \
+        model.decoder.transformer_block_type=normformer \
+        model.decoder.headscale=False \
+        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \
+        model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/t5_pretrain_results
+        rm -rf examples/nlp/language_modeling/t5_index_mappings

   L2_Megatron_T5_Eval:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_t5_eval.py \
-        --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
-        --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \
-        --tensor_model_parallel_size 1
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_t5_eval.py \
+        --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
+        --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \
+        --tensor_model_parallel_size 1

   L2_Megatron_BART_Pretraining_and_Resume_Training_TP2:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=2 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=3 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
-        model.tensor_model_parallel_size=2 \
-        model.seq_length=128 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation='reglu' \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method='block' \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation='reglu' \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method='block' \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
-
-        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=2 \
-        trainer.limit_val_batches=5 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=6 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
-        exp_manager.resume_if_exists=True \
-        model.tensor_model_parallel_size=2 \
-        model.seq_length=128 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation='reglu' \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method='block' \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation='reglu' \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method='block' \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
-
-        rm -rf examples/nlp/language_modeling/bart_pretrain_results
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+        trainer.devices=2 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=2 \
+        trainer.limit_val_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=3 \
+        trainer.precision=16 \
+        trainer.gradient_clip_val=1.0 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
+        model.tensor_model_parallel_size=2 \
+        model.seq_length=128 \
+        model.encoder.num_layers=4 \
+        model.encoder.hidden_size=64 \
+        model.encoder.num_attention_heads=8 \
+        model.encoder.activation='reglu' \
+        model.encoder.bias_activation_fusion=False \
+        model.encoder.activations_checkpoint_method='block' \
+        model.encoder.activations_checkpoint_num_layers=1 \
+        model.decoder.num_layers=4 \
+        model.decoder.hidden_size=64 \
+        model.decoder.num_attention_heads=8 \
+        model.decoder.activation='reglu' \
+        model.decoder.bias_activation_fusion=False \
+        model.decoder.activations_checkpoint_method='block' \
+        model.decoder.activations_checkpoint_num_layers=1 \
+        model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
+
+        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+        trainer.devices=2 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=2 \
+        trainer.limit_val_batches=5 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=6 \
+        trainer.precision=16 \
+        trainer.gradient_clip_val=1.0 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
+        exp_manager.resume_if_exists=True \
+        model.tensor_model_parallel_size=2 \
+        model.seq_length=128 \
+        model.encoder.num_layers=4 \
+        model.encoder.hidden_size=64 \
+        model.encoder.num_attention_heads=8 \
+        model.encoder.activation='reglu' \
+        model.encoder.bias_activation_fusion=False \
+        model.encoder.activations_checkpoint_method='block' \
+        model.encoder.activations_checkpoint_num_layers=1 \
+        model.decoder.num_layers=4 \
+        model.decoder.hidden_size=64 \
+        model.decoder.num_attention_heads=8 \
+        model.decoder.activation='reglu' \
+        model.decoder.bias_activation_fusion=False \
+        model.decoder.activations_checkpoint_method='block' \
+        model.decoder.activations_checkpoint_num_layers=1 \
+        model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'
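+      # The template runs AFTER_SCRIPT under if: always(), so this cleanup
+      # happens even when SCRIPT fails.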
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/bart_pretrain_results

   L2_Megatron_BART_Pretraining_and_Resume_Training_PP2:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=10 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=10 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
-        model.pipeline_model_parallel_size=2 \
-        model.pipeline_model_parallel_split_rank=1 \
-        model.seq_length=256 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation=geglu \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method=block \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation=geglu \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method=block \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.data.respect_document_boundaries=False \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]
-
-        python examples/nlp/language_modeling/megatron_bart_pretraining.py \
-        trainer.devices=2 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=1 \
-        trainer.limit_val_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=10 \
-        trainer.precision=16 \
-        trainer.gradient_clip_val=1.0 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
-        exp_manager.resume_if_exists=True \
-        model.pipeline_model_parallel_size=2 \
-        model.pipeline_model_parallel_split_rank=1 \
-        model.seq_length=256 \
-        model.encoder.num_layers=4 \
-        model.encoder.hidden_size=64 \
-        model.encoder.num_attention_heads=8 \
-        model.encoder.activation=geglu \
-        model.encoder.bias_activation_fusion=False \
-        model.encoder.activations_checkpoint_method=block \
-        model.encoder.activations_checkpoint_num_layers=1 \
-        model.decoder.num_layers=4 \
-        model.decoder.hidden_size=64 \
-        model.decoder.num_attention_heads=8 \
-        model.decoder.activation=geglu \
-        model.decoder.bias_activation_fusion=False \
-        model.decoder.activations_checkpoint_method=block \
-        model.decoder.activations_checkpoint_num_layers=1 \
-        model.data.respect_document_boundaries=False \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]
-
-        rm -rf examples/nlp/language_modeling/bart_pretrain_results
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
"failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bart_pretrain_results # L2: Megatron T5 GLUE/XNLI Finetuning # TODO(Oktai15): update it in 1.8.0 version L2_Megatron_T5_GLUE_RTE: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
-        python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
-        trainer.devices=1 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=1 \
-        +trainer.limit_val_batches=2 \
-        +trainer.limit_test_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=2 \
-        trainer.precision=16 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \
-        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
-        model.pipeline_model_parallel_size=1 \
-        model.pipeline_model_parallel_split_rank=0 \
-        model.data.train_ds.task_name=rte \
-        model.data.train_ds.global_batch_size=4 \
-        model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=2 \
-        model.data.validation_ds.micro_batch_size=2 \
-        model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
-        model.data.validation_ds.task_name=rte \
-        model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv
-
-        rm -rf examples/nlp/language_modeling/t5_glue_results
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
+        trainer.devices=1 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=1 \
+        +trainer.limit_val_batches=2 \
+        +trainer.limit_test_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=2 \
+        trainer.precision=16 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \
+        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
+        model.pipeline_model_parallel_size=1 \
+        model.pipeline_model_parallel_split_rank=0 \
+        model.data.train_ds.task_name=rte \
+        model.data.train_ds.global_batch_size=4 \
+        model.data.train_ds.micro_batch_size=2 \
+        model.data.validation_ds.global_batch_size=2 \
+        model.data.validation_ds.micro_batch_size=2 \
+        model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
+        model.data.validation_ds.task_name=rte \
+        model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/t5_glue_results
+
   L2_Megatron_T5_GLUE_XNLI:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
-        -cn megatron_t5_config_finetune_glue_xnli \
-        trainer.devices=1 \
-        trainer.accelerator=gpu \
-        trainer.log_every_n_steps=1 \
-        trainer.val_check_interval=1 \
-        +trainer.limit_val_batches=2 \
-        +trainer.limit_test_batches=2 \
-        trainer.accumulate_grad_batches=1 \
-        trainer.max_steps=2 \
-        trainer.precision=16 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \
-        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
-        model.pipeline_model_parallel_size=1 \
-        model.pipeline_model_parallel_split_rank=0 \
-        model.data.train_ds.global_batch_size=4 \
-        model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=2 \
-        model.data.validation_ds.micro_batch_size=2 \
-        model.data.test_ds.global_batch_size=2 \
-        model.data.test_ds.micro_batch_size=2 \
-        model.data.train_ds.task_name=rte \
-        model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
-        model.data.validation_ds.task_name=xnli \
-        model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \
-        model.data.test_ds.task_name=xnli \
-        model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv
-
-        rm -rf examples/nlp/language_modeling/t5_xnli_results
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
+        -cn megatron_t5_config_finetune_glue_xnli \
+        trainer.devices=1 \
+        trainer.accelerator=gpu \
+        trainer.log_every_n_steps=1 \
+        trainer.val_check_interval=1 \
+        +trainer.limit_val_batches=2 \
+        +trainer.limit_test_batches=2 \
+        trainer.accumulate_grad_batches=1 \
+        trainer.max_steps=2 \
+        trainer.precision=16 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \
+        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
+        model.pipeline_model_parallel_size=1 \
+        model.pipeline_model_parallel_split_rank=0 \
+        model.data.train_ds.global_batch_size=4 \
+        model.data.train_ds.micro_batch_size=2 \
+        model.data.validation_ds.global_batch_size=2 \
+        model.data.validation_ds.micro_batch_size=2 \
+        model.data.test_ds.global_batch_size=2 \
+        model.data.test_ds.micro_batch_size=2 \
+        model.data.train_ds.task_name=rte \
+        model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
+        model.data.validation_ds.task_name=xnli \
+        model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \
+        model.data.test_ds.task_name=xnli \
+        model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/t5_xnli_results
+
   L2_Megatron_T5_PEFT_Lora_TP2:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        rm -rf /home/TestData/nlp/t5_lora_tuning_tp2
-
-        python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
-        trainer.devices=2 \
-        trainer.log_every_n_steps=1 \
-        trainer.max_epochs=9999 \
-        trainer.max_steps=3 \
-        trainer.val_check_interval=3 \
-        ++trainer.limit_val_batches=2 \
-        trainer.precision=16 \
-        exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \
-        model.pipeline_model_parallel_size=1 \
-        model.tensor_model_parallel_size=2 \
-        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
-        model.peft.peft_scheme=lora \
-        model.answer_only_loss=True \
-        model.micro_batch_size=1 \
-        model.global_batch_size=1 \
-        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
-        model.data.train_ds.concat_sampling_probabilities=[1.0] \
-        model.data.train_ds.num_workers=0 \
-        model.data.validation_ds.num_workers=0 \
-        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
-        model.data.validation_ds.names=[quarel]
-
-        python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
-        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
-        model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
-        model.peft.restore_from_ckpt_name=null \
-        model.peft.restore_from_hparams_path=null \
-        model.tensor_model_parallel_size=2 \
-        trainer.devices=2 \
-        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
-        model.data.test_ds.names=[quarel4] \
-        model.global_batch_size=2 \
-        model.micro_batch_size=1 \
-        model.data.test_ds.tokens_to_generate=10 \
-        model.data.test_ds.write_predictions_to_file=True \
-        model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \
-        inference.greedy=True \
-        inference.repetition_penalty=1.0 \
-        inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl
-
-        rm -rf /home/TestData/nlp/t5_lora_tuning_tp2
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        rm -rf /home/TestData/nlp/t5_lora_tuning_tp2
+
+        python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
+        trainer.devices=2 \
+        trainer.log_every_n_steps=1 \
+        trainer.max_epochs=9999 \
+        trainer.max_steps=3 \
+        trainer.val_check_interval=3 \
+        ++trainer.limit_val_batches=2 \
+        trainer.precision=16 \
+        exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \
+        model.pipeline_model_parallel_size=1 \
+        model.tensor_model_parallel_size=2 \
+        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
+        model.peft.peft_scheme=lora \
+        model.answer_only_loss=True \
+        model.micro_batch_size=1 \
+        model.global_batch_size=1 \
+        model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
+        model.data.train_ds.concat_sampling_probabilities=[1.0] \
+        model.data.train_ds.num_workers=0 \
+        model.data.validation_ds.num_workers=0 \
+        model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
+        model.data.validation_ds.names=[quarel]
+
+        python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
+        model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
+        model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
+        model.peft.restore_from_ckpt_name=null \
+        model.peft.restore_from_hparams_path=null \
+        model.tensor_model_parallel_size=2 \
+        trainer.devices=2 \
+        model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
+        model.data.test_ds.names=[quarel4] \
+        model.global_batch_size=2 \
+        model.micro_batch_size=1 \
+        model.data.test_ds.tokens_to_generate=10 \
+        model.data.test_ds.write_predictions_to_file=True \
+        model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \
+        inference.greedy=True \
+        inference.repetition_penalty=1.0 \
+        inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl
+      AFTER_SCRIPT: |
+        rm -rf /home/TestData/nlp/t5_lora_tuning_tp2

   # L2: Megatron Mock Data Generation
   L2_Megatron_Mock_Data_Generation_MockGPTDataset:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
-        trainer.max_steps=10 \
-        trainer.limit_val_batches=7 \
-        trainer.val_check_interval=10 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
-        model.mcore_gpt=True \
-        model.data.data_impl=mock \
-        model.data.data_prefix=[]
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Megatron_Mock_Data_Generation_MockT5Dataset:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
         trainer.max_steps=10 \
-        trainer.limit_val_batches=3 \
+        trainer.limit_val_batches=7 \
         trainer.val_check_interval=10 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+        exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+        model.mcore_gpt=True \
         model.data.data_impl=mock \
         model.data.data_prefix=[]
-        rm -rf examples/nlp/language_modeling/t5_pretrain_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+
+  L2_Megatron_Mock_Data_Generation_MockT5Dataset:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+        trainer.max_steps=10 \
+        trainer.limit_val_batches=3 \
+        trainer.val_check_interval=10 \
+        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+        model.data.data_impl=mock \
+        model.data.data_prefix=[]
+      AFTER_SCRIPT: |
+        rm -rf examples/nlp/language_modeling/t5_pretrain_results

   # L2: TTS Fast dev runs 1
   L2_TTS_Fast_dev_runs_1_Tacotron_2:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/tts/tacotron2.py \
-        train_dataset=/home/TestData/an4_dataset/an4_train.json \
-        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
-        trainer.devices=1 \
-        trainer.accelerator="gpu" \
-        +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
-        trainer.strategy=auto \
-        model.decoder.decoder_rnn_dim=256 \
-        model.decoder.attention_rnn_dim=1024 \
-        model.decoder.prenet_dim=128 \
-        model.postnet.postnet_n_convolutions=3 \
-        model.train_ds.dataloader_params.batch_size=4 \
-        model.train_ds.dataloader_params.num_workers=0 \
-        model.validation_ds.dataloader_params.batch_size=4 \
-        model.validation_ds.dataloader_params.num_workers=0 \
-        ~model.text_normalizer \
-        ~model.text_normalizer_call_kwargs \
-        ~trainer.check_val_every_n_epoch
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/tts/tacotron2.py \
+        train_dataset=/home/TestData/an4_dataset/an4_train.json \
+        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+        trainer.devices=1 \
+        trainer.accelerator="gpu" \
+        +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
+        trainer.strategy=auto \
+        model.decoder.decoder_rnn_dim=256 \
+        model.decoder.attention_rnn_dim=1024 \
+        model.decoder.prenet_dim=128 \
+        model.postnet.postnet_n_convolutions=3 \
+        model.train_ds.dataloader_params.batch_size=4 \
+        model.train_ds.dataloader_params.num_workers=0 \
+        model.validation_ds.dataloader_params.batch_size=4 \
+        model.validation_ds.dataloader_params.num_workers=0 \
+        ~model.text_normalizer \
+        ~model.text_normalizer_call_kwargs \
+        ~trainer.check_val_every_n_epoch

   L2_TTS_Fast_dev_runs_1_WaveGlow:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/tts/waveglow.py \
-        train_dataset=/home/TestData/an4_dataset/an4_train.json \
-        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
-        trainer.devices="[0]" \
-        +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
-        trainer.strategy=auto \
-        model.train_ds.dataloader_params.batch_size=4 \
-        model.train_ds.dataloader_params.num_workers=0 \
-        model.validation_ds.dataloader_params.batch_size=4 \
-        model.validation_ds.dataloader_params.num_workers=0 \
-        model.waveglow.n_flows=4 \
-        model.waveglow.n_wn_layers=2 \
-        model.waveglow.n_wn_channels=32 \
-        ~trainer.check_val_every_n_epoch
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/tts/waveglow.py \
+        train_dataset=/home/TestData/an4_dataset/an4_train.json \
+        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+        trainer.devices="[0]" \
+        +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \
+        trainer.strategy=auto \
+        model.train_ds.dataloader_params.batch_size=4 \
+        model.train_ds.dataloader_params.num_workers=0 \
+        model.validation_ds.dataloader_params.batch_size=4 \
+        model.validation_ds.dataloader_params.num_workers=0 \
+        model.waveglow.n_flows=4 \
+        model.waveglow.n_wn_layers=2 \
+        model.waveglow.n_wn_channels=32 \
+        ~trainer.check_val_every_n_epoch

   L2_TTS_Fast_dev_runs_1_FastPitch:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/tts/fastpitch.py \
-        --config-name fastpitch_align_v1.05 \
-        train_dataset=/home/TestData/an4_dataset/an4_train.json \
-        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
-        sup_data_path=/home/TestData/an4_dataset/beta_priors \
-        trainer.devices="[0]" \
-        +trainer.limit_train_batches=1 \
-        +trainer.limit_val_batches=1 \
-        trainer.max_epochs=1 \
-        trainer.strategy=auto \
-        model.pitch_mean=212.35873413085938 \
-        model.pitch_std=68.52806091308594 \
-        model.train_ds.dataloader_params.batch_size=4 \
-        model.train_ds.dataloader_params.num_workers=0 \
-        model.validation_ds.dataloader_params.batch_size=4 \
-        model.validation_ds.dataloader_params.num_workers=0 \
-        model.symbols_embedding_dim=64 \
-        model.input_fft.d_inner=384 \
-        model.input_fft.n_layer=2 \
-        model.output_fft.d_inner=384 \
-        model.output_fft.n_layer=2 \
-        ~trainer.check_val_every_n_epoch \
-        ~model.text_normalizer \
-        ~model.text_normalizer_call_kwargs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/tts/fastpitch.py \
+        --config-name fastpitch_align_v1.05 \
+        train_dataset=/home/TestData/an4_dataset/an4_train.json \
+        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+        sup_data_path=/home/TestData/an4_dataset/beta_priors \
+        trainer.devices="[0]" \
+        +trainer.limit_train_batches=1 \
+        +trainer.limit_val_batches=1 \
+        trainer.max_epochs=1 \
+        trainer.strategy=auto \
+        model.pitch_mean=212.35873413085938 \
+        model.pitch_std=68.52806091308594 \
+        model.train_ds.dataloader_params.batch_size=4 \
+        model.train_ds.dataloader_params.num_workers=0 \
+        model.validation_ds.dataloader_params.batch_size=4 \
+        model.validation_ds.dataloader_params.num_workers=0 \
+        model.symbols_embedding_dim=64 \
+        model.input_fft.d_inner=384 \
+        model.input_fft.n_layer=2 \
+        model.output_fft.d_inner=384 \
+        model.output_fft.n_layer=2 \
+        ~trainer.check_val_every_n_epoch \
+        ~model.text_normalizer \
+        ~model.text_normalizer_call_kwargs

   # OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS:
   #   needs: [cicd-test-container-setup]
@@ -6315,78 +4756,50 @@ jobs:
   L2_TTS_Fast_dev_runs_1_Mixer-TTS:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/tts/mixer_tts.py \
-        train_dataset=/home/TestData/an4_dataset/an4_train.json \
-        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
-        sup_data_path=/home/TestData/an4_dataset/sup_data \
-        trainer.devices="[0]" \
-        +trainer.limit_train_batches=1 \
-        +trainer.limit_val_batches=1 \
-        trainer.max_epochs=1 \
-        trainer.strategy=auto \
-        model.pitch_mean=212.35873413085938 \
-        model.pitch_std=68.52806091308594 \
-        model.train_ds.dataloader_params.batch_size=4 \
-        model.train_ds.dataloader_params.num_workers=0 \
-        model.validation_ds.dataloader_params.batch_size=4 \
-        model.validation_ds.dataloader_params.num_workers=0 \
-        ~trainer.check_val_every_n_epoch \
-        ~model.text_normalizer \
-        ~model.text_normalizer_call_kwargs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/tts/mixer_tts.py \
+        train_dataset=/home/TestData/an4_dataset/an4_train.json \
+        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+        sup_data_path=/home/TestData/an4_dataset/sup_data \
+        trainer.devices="[0]" \
+        +trainer.limit_train_batches=1 \
+        +trainer.limit_val_batches=1 \
+        trainer.max_epochs=1 \
+        trainer.strategy=auto \
+        model.pitch_mean=212.35873413085938 \
+        model.pitch_std=68.52806091308594 \
+        model.train_ds.dataloader_params.batch_size=4 \
+        model.train_ds.dataloader_params.num_workers=0 \
+        model.validation_ds.dataloader_params.batch_size=4 \
+        model.validation_ds.dataloader_params.num_workers=0 \
+        ~trainer.check_val_every_n_epoch \
+        ~model.text_normalizer \
+        ~model.text_normalizer_call_kwargs

   L2_TTS_Fast_dev_runs_1_Hifigan:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        python examples/tts/hifigan.py \
-        train_dataset=/home/TestData/an4_dataset/an4_train.json \
-        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
-        trainer.devices="[0]" \
-        +trainer.limit_train_batches=1 \
-        +trainer.limit_val_batches=1 \
-        +trainer.max_epochs=1 \
-        trainer.strategy=auto \
-        model.train_ds.dataloader_params.batch_size=4 \
-        model.train_ds.dataloader_params.num_workers=0 \
-        model.validation_ds.dataloader_params.batch_size=4 \
-        model.validation_ds.dataloader_params.num_workers=0 \
-        model.generator.upsample_initial_channel=64 \
-        +model.debug=true \
-        ~trainer.check_val_every_n_epoch
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/tts/hifigan.py \
+        train_dataset=/home/TestData/an4_dataset/an4_train.json \
+        validation_datasets=/home/TestData/an4_dataset/an4_val.json \
+        trainer.devices="[0]" \
+        +trainer.limit_train_batches=1 \
+        +trainer.limit_val_batches=1 \
+        +trainer.max_epochs=1 \
+        trainer.strategy=auto \
+        model.train_ds.dataloader_params.batch_size=4 \
+        model.train_ds.dataloader_params.num_workers=0 \
+        model.validation_ds.dataloader_params.batch_size=4 \
+        model.validation_ds.dataloader_params.num_workers=0 \
+        model.generator.upsample_initial_channel=64 \
+        +model.debug=true \
+        ~trainer.check_val_every_n_epoch

   # L2: NeRF
   # L2_NeRF_DreamFusion:
@@ -6419,30 +4832,18 @@ jobs:
   Speech_Checkpoints_tests:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 20
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-        CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \
-        pretrained_name=QuartzNet15x5Base-En \
-        dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
-        batch_size=64 \
-        tolerance=0.1012
-        rm -f examples/asr/evaluation_transcripts.json
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \
+        pretrained_name=QuartzNet15x5Base-En \
+        dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
+        batch_size=64 \
+        tolerance=0.1012
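+      # This eval needs more than the template's default TIMEOUT of 10 minutes.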
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ + pretrained_name=QuartzNet15x5Base-En \ + dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ + batch_size=64 \ + tolerance=0.1012 + TIMEOUT: 20 + AFTER_SCRIPT: | + rm -f examples/asr/evaluation_transcripts.json Nemo_CICD_Test: needs: @@ -6561,16 +4962,27 @@ jobs: - Speech_Checkpoints_tests if: always() runs-on: ubuntu-latest - steps: - # This should depend on all the tests so we block/unblock based on all tests passing - - if: ${{ contains(needs.*.result, 'success') }} + steps: + - if: ${{ always() }} + id: pipeline-conclusion + run: | + # Slack notifications are send only on test failure (not cancelled): + FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }} + echo "FAILED=$FAILED" >> $GITHUB_OUTPUT + + # Mark as successful if no job was cancelled: + SUCCESS=${{ !contains(needs.*.result, 'cancelled') }} + echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT + + # This should depend on all the tests so we block/unblock based on all tests passing + - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }} run: exit 0 - - if: ${{ contains(needs.*.result, 'failure') }} + - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} name: Checkout repository uses: actions/checkout@v4 - - if: ${{ contains(needs.*.result, 'failure') }} + - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} run: | source .github/scripts/slackHelper.sh @@ -6578,3 +4990,7 @@ jobs: PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" + + - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} + run: | + exit 1 \ No newline at end of file