From c3533b670c7010b347c0edf947ef95951d9801d9 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 13 Mar 2023 12:21:47 -0700 Subject: [PATCH] [T145005253] Migrate CPU and ROCm jobs to Docker containers - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR --- .github/scripts/setup_env.bash | 200 +++++++++++------- .github/workflows/fbgemm_ci.yml | 5 + .github/workflows/fbgemm_gpu_ci.yml | 19 +- .github/workflows/fbgemm_gpu_lint.yml | 5 + .github/workflows/fbgemm_nightly_build.yml | 6 +- .../workflows/fbgemm_nightly_build_cpu.yml | 18 ++ .github/workflows/fbgemm_release_build.yml | 5 + .../workflows/fbgemm_release_build_cpu.yml | 17 ++ 8 files changed, 194 insertions(+), 81 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 29f0aac1ed..ccdac79097 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -318,7 +318,7 @@ print_ec2_info () { ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ setup_miniconda () { @@ -403,6 +403,11 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" @@ -558,6 +563,28 @@ install_pytorch_pip () { echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" } + +################################################################################ +# CUDA Setup Functions +################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." + install_system_packages nvidia-driver-latest-dkms +} + install_cuda () { local env_name="$1" local cuda_version="$2" @@ -609,6 +636,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." + print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -665,9 +772,17 @@ install_rocm_ubuntu () { echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -766,82 +881,6 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { - local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" - return 1 - else - echo "################################################################################" - echo "# Install cuDNN" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz - - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." - print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" - - echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" -} - ################################################################################ # Combination Functions @@ -883,7 +922,7 @@ create_conda_pytorch_environment () { ################################################################################ -# Build Functions +# FBGEMM_GPU Build Functions ################################################################################ prepare_fbgemm_gpu_build () { @@ -902,6 +941,11 @@ prepare_fbgemm_gpu_build () { echo "" fi + if [[ "${GITHUB_WORKSPACE}" ]]; then + # https://github.com/actions/checkout/issues/841 + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + fi + echo "[BUILD] Running git submodules update ..." git submodule sync git submodule update --init --recursive diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..977b443a2b 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,6 +13,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-posix: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..bd62f23761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -13,9 +13,17 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: runs-on: ${{ matrix.os }} + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +33,18 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] + os: [ linux.12xlarge ] + container-image: [ "ubuntu:20.04" ] python-version: [ "3.10" ] rocm-version: [ "5.3" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -74,7 +89,7 @@ jobs: print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a print_exec conda run -n $BUILD_ENV python setup.py build develop - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..1ff7203108 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -14,6 +14,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: run_pylint: runs-on: ubuntu-latest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 4cdb10aaa8..24866808a0 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -30,6 +30,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: @@ -117,7 +122,6 @@ jobs: cuda-version-publish: [ "11.7.1" ] needs: build_artifact - steps: - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 72a0af01e7..1125b17a0d 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -30,10 +30,19 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -48,6 +57,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -93,6 +105,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -107,6 +122,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 5e3d369fe4..def6002a76 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -22,6 +22,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index a652c89854..c7fb53cabd 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -22,10 +22,18 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -40,6 +48,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -85,6 +96,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -99,6 +113,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: