From f1267cd7fa7f0aac665da8acadbbe86db5b9cb0b Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 12 Dec 2024 23:09:01 -0500
Subject: [PATCH] Add GPU runner for linux-aarch64 (#289)

* add linux-aarch64 GPU runner

* fix test skip condition

* check system

* try to run on bare image

* fix

* change to ubuntu container

* Update gh-build-and-test.yml

* Update action.yml

* fix apt install
---
Review note (ignored by `git am`): the "Install zstd" step was changed to
`apt-get ... -y` so it cannot stall or abort on a confirmation prompt in the
non-interactive CI container. The post-image blob id on the action.yml
`index` line predates that change; plain `git apply`/`git am` do not need it.
Context-line indentation was reconstructed -- confirm against the target tree.

 .github/actions/test/action.yml         |  7 +++++++
 .github/workflows/gh-build-and-test.yml | 12 +++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml
index 66468bd1..079dd039 100644
--- a/.github/actions/test/action.yml
+++ b/.github/actions/test/action.yml
@@ -14,6 +14,13 @@ runs:
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: nvidia-smi
 
+    # The cache action needs zstd; install it non-interactively
+    - name: Install zstd
+      shell: bash --noprofile --norc -xeuo pipefail {0}
+      run: |
+        apt-get update
+        apt-get install -y zstd
+
     - name: Download bindings build artifacts
       uses: actions/download-artifact@v4
       with:
diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index 06f6a168..dac1ff48 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -76,17 +76,19 @@ jobs:
   test:
     # TODO: improve the name once a separate test matrix is defined
     name: Test (CUDA ${{ inputs.cuda-version }})
-    # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up
+    # TODO: enable testing once win-64 GPU runners are up
     if: ${{ (github.repository_owner == 'nvidia') &&
-            startsWith(inputs.host-platform, 'linux-x64') }}
+            startsWith(inputs.host-platform, 'linux') }}
     permissions:
       id-token: write # This is required for configure-aws-credentials
       contents: read  # This is required for actions/checkout
-    runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }}
-    # TODO: use a different (nvidia?) container, or just run on bare image
+    runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') ||
+                 (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') }}
+    # Our self-hosted runners require a container
+    # TODO: use a different (nvidia?) container
     container:
       options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g
-      image: condaforge/miniforge3:latest
+      image: ubuntu:22.04
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
     needs: