Skip to content

Commit

Permalink
Merge branch 'main' into cherry-pick-main-2a2d98514fe80b8fc435d1818ef…
Browse files Browse the repository at this point in the history
…769df88b19920

Signed-off-by: Daniel Galvez <[email protected]>
  • Loading branch information
galv authored Jun 4, 2024
2 parents ef7a598 + a0488f6 commit d305a99
Show file tree
Hide file tree
Showing 174 changed files with 4,675 additions and 4,430 deletions.
23 changes: 23 additions & 0 deletions .github/scripts/slackHelper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

#######################################
# Post a "CI/CD failure" notification to Slack through an incoming webhook.
#
# The message is a single Block Kit "section" block whose mrkdwn text links
# to the failing pipeline run.
#
# Arguments:
#   $1 - Slack webhook URL to POST the message to (required).
#   $2 - URL of the pipeline run; rendered as the "NeMo CI" link target.
# Outputs:
#   Whatever curl prints (response body / progress) on stdout and stderr.
#   An error message on stderr if the webhook URL is missing.
# Returns:
#   1 if the webhook URL is empty; otherwise curl's exit status.
#######################################
sendSlackMessage() {
  # 'local' keeps these from clobbering same-named variables in the script
  # that sources this helper.
  local WEBHOOK_URL="${1:-}"
  local PIPELINE_URL="${2:-}"

  # Fail early with a clear message instead of letting curl complain about a
  # missing URL (e.g. when the webhook secret is not configured).
  if [[ -z "$WEBHOOK_URL" ]]; then
    printf 'sendSlackMessage: webhook URL is required\n' >&2
    return 1
  fi

  # Escape backslashes first, then double quotes, so the URL cannot break out
  # of the JSON string it is embedded in.
  local pipeline_url_json
  pipeline_url_json=${PIPELINE_URL//\\/\\\\}
  pipeline_url_json=${pipeline_url_json//\"/\\\"}

  # The trailing line break of the message is sent as an escaped \n because
  # raw control characters are not allowed inside JSON strings.
  # The webhook URL is quoted so it always reaches curl as a single argument.
  curl -X POST -H "Content-type: application/json" --data "{
  \"blocks\": [
    {
      \"type\": \"section\",
      \"text\": {
        \"type\": \"mrkdwn\",
        \"text\": \"🚨 *CI/CD failure at <${pipeline_url_json}|NeMo CI>*:\\n\"
      }
    }
  ]
}" "$WEBHOOK_URL"
}
76 changes: 24 additions & 52 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,33 +43,11 @@ jobs:
docker container prune --filter "until=24h" --force
docker image prune -a --filter "until=24h" --force
# checkout-repository:
# runs-on: self-hosted-azure
# container:
# image: nvcr.io/nvidia/pytorch:24.02-py3
# volumes:
# - ${{ github.workspace }}:/workspace
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# with:
# path: ${{ github.run_id }}

cicd-test-container-setup:
needs: [cicd-cluster-clean]
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' }}
# uses: actions/cache@v2
#container:
# image: nvcr.io/nvidia/pytorch:24.02-py3
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand Down Expand Up @@ -114,26 +92,10 @@ jobs:
# These checks are not crucial
exit 0
'
### \'\'
# - name: Build and push to local registry
# uses: docker/build-push-action@v5
# with:
# context: .
# push: true
# tags: nemoci.azurecr.io/name/app:latest

# - name: Inspect
# run: |
# docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest

#- name: Post-workflow execution
# uses: gacts/run-and-post-run@v1
# with:
# post: |
# chmod -R 777 .

L0_Unit_Tests_GPU:
OPTIONAL_L0_Unit_Tests_GPU:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
Expand All @@ -152,8 +114,8 @@ jobs:
- name: "L0: Unit Tests GPU"
run: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"


L0_Unit_Tests_CPU:
Expand Down Expand Up @@ -289,9 +251,6 @@ jobs:
run: |
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo;
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/
- name: Cleanup
if: "always()"
run: |
rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
Expand Down Expand Up @@ -328,7 +287,7 @@ jobs:
# this test is using a 7B model which is too large for GitHub CI
# replace the model in this test with a toy model or move the test
# to the nightly CI
# L2_Community_LLM_Checkpoints_tests_Baichuan2:
# OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
Expand Down Expand Up @@ -6485,15 +6444,14 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"


Nemo_CICD_Test:
needs:
- L0_Unit_Tests_GPU
needs:
#- OPTIONAL_L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
- L2_Community_LLM_Checkpoints_tests_Falcon
#- L2_Community_LLM_Checkpoints_tests_Baichuan2
#- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2
- ASR_dev_run_Speech_to_Text
- ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
- ASR_dev_run_Speech_Pre-training_-_CitriNet
Expand Down Expand Up @@ -6601,8 +6559,22 @@ jobs:
- L2_TTS_Fast_dev_runs_1_Mixer-TTS
- L2_TTS_Fast_dev_runs_1_Hifigan
- Speech_Checkpoints_tests

if: always()
runs-on: ubuntu-latest
steps:
# This should depend on all the tests so we block/unblock based on all tests passing
- run: exit 0
- if: ${{ contains(needs.*.result, 'success') }}
run: exit 0

- if: ${{ contains(needs.*.result, 'failure') }}
name: Checkout repository
uses: actions/checkout@v4

- if: ${{ contains(needs.*.result, 'failure') }}
run: |
source .github/scripts/slackHelper.sh
WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }}
PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL"
Loading

0 comments on commit d305a99

Please sign in to comment.