From c3533b670c7010b347c0edf947ef95951d9801d9 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Mon, 13 Mar 2023 12:21:47 -0700
Subject: [PATCH] [T145005253] Migrate CPU and ROCm jobs to Docker containers

- Migrate CPU and ROCm jobs to run on top of Docker containers instead
of bare metal instances

- Update GitHub workflow configuration to cancel previous jobs for a PR
if a new commit is pushed to the PR
---
 .github/scripts/setup_env.bash                | 200 +++++++++++-------
 .github/workflows/fbgemm_ci.yml               |   5 +
 .github/workflows/fbgemm_gpu_ci.yml           |  19 +-
 .github/workflows/fbgemm_gpu_lint.yml         |   5 +
 .github/workflows/fbgemm_nightly_build.yml    |   6 +-
 .../workflows/fbgemm_nightly_build_cpu.yml    |  18 ++
 .github/workflows/fbgemm_release_build.yml    |   5 +
 .../workflows/fbgemm_release_build_cpu.yml    |  17 ++
 8 files changed, 194 insertions(+), 81 deletions(-)

diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash
index 29f0aac1ed..ccdac79097 100755
--- a/.github/scripts/setup_env.bash
+++ b/.github/scripts/setup_env.bash
@@ -318,7 +318,7 @@ print_ec2_info () {
 
 
 ################################################################################
-# Environment Setup and Install Functions
+# Miniconda Setup Functions
 ################################################################################
 
 setup_miniconda () {
@@ -403,6 +403,11 @@ create_conda_environment () {
   echo "[SETUP] Successfully created Conda environment: ${env_name}"
 }
 
+
+################################################################################
+# PyTorch Setup Functions
+################################################################################
+
 install_pytorch_conda () {
   local env_name="$1"
   local pytorch_version="$2"
@@ -558,6 +563,28 @@ install_pytorch_pip () {
   echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}"
 }
 
+
+################################################################################
+# CUDA Setup Functions
+################################################################################
+
+install_nvidia_drivers_centos () {
+  echo "################################################################################"
+  echo "# Install NVIDIA Drivers"
+  echo "#"
+  echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
+  echo "################################################################################"
+  echo ""
+  # Takes no arguments: installs the EPEL 7 release RPM, adds NVIDIA's RHEL 7 CUDA repo, then expires the yum cache
+  echo "[SETUP] Adding NVIDIA repos to yum ..."
+  print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+  print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
+  print_exec sudo yum clean expire-cache
+  # NOTE(review): install_system_packages is defined outside this hunk; presumably it wraps the system package manager
+  echo "[SETUP] Installing NVIDIA drivers ..."
+  install_system_packages nvidia-driver-latest-dkms
+}
+
 install_cuda () {
   local env_name="$1"
   local cuda_version="$2"
@@ -609,6 +636,86 @@ install_cuda () {
   echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
 }
 
+# Download cuDNN for CUDA_VERSION into INSTALL_PATH (wiped first) and export
+# CUDNN_INCLUDE_DIR / CUDNN_LIBRARY to Conda env ENV_NAME; returns 1 on failure
+install_cudnn () {
+  local env_name="$1"
+  local install_path="$2"
+  local cuda_version="$3"
+  if [ "$cuda_version" == "" ]; then
+    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
+    echo "Example:"
+    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
+    return 1
+  else
+    echo "################################################################################"
+    echo "# Install cuDNN"
+    echo "#"
+    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
+    echo "################################################################################"
+    echo ""
+  fi
+
+  # Install cuDNN manually
+  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
+  local cudnn_packages=(
+    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
+    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
+    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
+    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
+  )
+
+  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
+  # shellcheck disable=SC2206
+  local cuda_version_arr=(${cuda_version//./ })
+  # Fetch the major and minor version to concat
+  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"
+
+  # Get the URL
+  local cudnn_url="${cudnn_packages[cuda_concat_version]}"
+  if [ "$cudnn_url" == "" ]; then
+    # Default to cuDNN for 11.7 if no CUDA version fits
+    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
+    cudnn_url="${cudnn_packages[117]}"
+  fi
+
+  # Clear the install path, then make it absolute so the exported paths work from any cwd
+  rm -rf "$install_path"
+  mkdir -p "$install_path" || return 1
+  install_path=$(cd -- "$install_path" > /dev/null && pwd) || return 1
+
+  # Work in a scratch directory through explicit paths (no cd), so that a
+  # failure can never strand the caller in a different working directory
+  local tmp_dir status=0
+  tmp_dir=$(mktemp -d) || return 1
+
+  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
+  (exec_with_retries wget -q "$cudnn_url" -O "${tmp_dir}/cudnn.tar.xz") || status=1
+  if [ "$status" -eq 0 ]; then
+    echo "[INSTALL] Unpacking cuDNN ..."
+    tar -xvf "${tmp_dir}/cudnn.tar.xz" -C "$tmp_dir" || status=1
+  fi
+  if [ "$status" -eq 0 ]; then
+    echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
+    mv "${tmp_dir}"/cudnn-linux-*/include "$install_path" || status=1
+    mv "${tmp_dir}"/cudnn-linux-*/lib "$install_path" || status=1
+  fi
+
+  # Always delete the scratch directory, then surface any failure to the caller
+  rm -rf "$tmp_dir"
+  [ "$status" -eq 0 ] || return 1
+
+  # Export the environment variables to the Conda environment
+  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
+  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
+
+  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
+}
+
+################################################################################
+# ROCm Setup Functions
+################################################################################
+
 install_rocm_ubuntu () {
   local env_name="$1"
   local rocm_version="$2"
@@ -665,9 +772,17 @@ install_rocm_ubuntu () {
   echo "[INSTALL] Cleaning up ..."
   print_exec rm -f "${package_name}"
 
+  echo "[INFO] Check ROCM GPU info ..."
+  print_exec rocm-smi
+
   echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
 }
 
+
+################################################################################
+# Build Tools Setup Functions
+################################################################################
+
 install_cxx_compiler () {
   local env_name="$1"
   local use_system_package_manager="$2"
@@ -766,82 +881,6 @@ install_build_tools () {
   echo "[INSTALL] Successfully installed all the build tools"
 }
 
-install_cudnn () {
-  local env_name="$1"
-  local install_path="$2"
-  local cuda_version="$3"
-  if [ "$cuda_version" == "" ]; then
-    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
-    echo "Example:"
-    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
-    return 1
-  else
-    echo "################################################################################"
-    echo "# Install cuDNN"
-    echo "#"
-    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
-    echo "################################################################################"
-    echo ""
-  fi
-
-  # Install cuDNN manually
-  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
-  local cudnn_packages=(
-    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
-    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
-    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
-    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
-  )
-
-  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
-  # shellcheck disable=SC2206
-  local cuda_version_arr=(${cuda_version//./ })
-  # Fetch the major and minor version to concat
-  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"
-
-  # Get the URL
-  local cudnn_url="${cudnn_packages[cuda_concat_version]}"
-  if [ "$cudnn_url" == "" ]; then
-    # Default to cuDNN for 11.7 if no CUDA version fits
-    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
-    cudnn_url="${cudnn_packages[117]}"
-  fi
-
-  # Clear the install path
-  rm -rf "$install_path"
-  mkdir -p "$install_path"
-
-  # Create temporary directory
-  # shellcheck disable=SC2155
-  local tmp_dir=$(mktemp -d)
-  cd "$tmp_dir" || return 1
-
-  # Download cuDNN
-  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
-  (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1
-
-  # Unpack the tarball
-  echo "[INSTALL] Unpacking cuDNN ..."
-  tar -xvf cudnn.tar.xz
-
-  # Copy the includes and libs over to the install path
-  echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
-  rm -rf "${install_path:?}/include"
-  rm -rf "${install_path:?}/lib"
-  mv cudnn-linux-*/include "$install_path"
-  mv cudnn-linux-*/lib "$install_path"
-
-  # Delete the temporary directory
-  cd - || return 1
-  rm -rf "$tmp_dir"
-
-  # Export the environment variables to the Conda environment
-  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
-  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
-
-  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
-}
-
 
 ################################################################################
 # Combination Functions
@@ -883,7 +922,7 @@ create_conda_pytorch_environment () {
 
 
 ################################################################################
-# Build Functions
+# FBGEMM_GPU Build Functions
 ################################################################################
 
 prepare_fbgemm_gpu_build () {
@@ -902,6 +941,11 @@ prepare_fbgemm_gpu_build () {
     echo ""
   fi
 
+  if [[ "${GITHUB_WORKSPACE}" ]]; then
+    # https://github.com/actions/checkout/issues/841
+    git config --global --add safe.directory "${GITHUB_WORKSPACE}"
+  fi
+
   echo "[BUILD] Running git submodules update ..."
   git submodule sync
   git submodule update --init --recursive
diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml
index f6bae56123..977b443a2b 100644
--- a/.github/workflows/fbgemm_ci.yml
+++ b/.github/workflows/fbgemm_ci.yml
@@ -13,6 +13,11 @@ on:
     branches:
       - main
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   build-posix:
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml
index 8e021c4451..bd62f23761 100644
--- a/.github/workflows/fbgemm_gpu_ci.yml
+++ b/.github/workflows/fbgemm_gpu_ci.yml
@@ -13,9 +13,17 @@ on:
     branches:
       - main
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   build_and_test_amd:
     runs-on: ${{ matrix.os }}
+    container:
+      image: ${{ matrix.container-image }}
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -25,11 +33,18 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ ubuntu-20.04 ]
+        os: [ linux.12xlarge ]
+        container-image: [ "ubuntu:20.04" ]
         python-version: [ "3.10" ]
         rocm-version: [ "5.3" ]
 
     steps:
+    - name: Setup Build Container
+      run: |
+        apt update -y
+        apt install -y binutils git sudo wget
+        git config --global --add safe.directory '*'
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -74,7 +89,7 @@ jobs:
         print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
         print_exec conda run -n $BUILD_ENV python setup.py build develop
 
-    - name: Test FBGEMM_GPU-ROCM Nightly installation
+    - name: Test FBGEMM_GPU-ROCM Nightly Installation
       timeout-minutes: 10
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
 
diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml
index dc2b6344ce..1ff7203108 100644
--- a/.github/workflows/fbgemm_gpu_lint.yml
+++ b/.github/workflows/fbgemm_gpu_lint.yml
@@ -14,6 +14,11 @@ on:
     branches:
       - main
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   run_pylint:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml
index 4cdb10aaa8..24866808a0 100644
--- a/.github/workflows/fbgemm_nightly_build.yml
+++ b/.github/workflows/fbgemm_nightly_build.yml
@@ -30,6 +30,11 @@ on:
   #
   workflow_dispatch:
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts and upload to GHA
   build_artifact:
diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml
index 72a0af01e7..1125b17a0d 100644
--- a/.github/workflows/fbgemm_nightly_build_cpu.yml
+++ b/.github/workflows/fbgemm_nightly_build_cpu.yml
@@ -30,10 +30,19 @@ on:
   #
   workflow_dispatch:
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -48,6 +57,9 @@ jobs:
         python-version: [ "3.8", "3.9", "3.10" ]
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -93,6 +105,9 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -107,6 +122,9 @@ jobs:
     needs: build_artifact
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml
index 5e3d369fe4..def6002a76 100644
--- a/.github/workflows/fbgemm_release_build.yml
+++ b/.github/workflows/fbgemm_release_build.yml
@@ -22,6 +22,11 @@ on:
   #
   workflow_dispatch:
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts and upload to GHA
   build_artifact:
diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml
index a652c89854..c7fb53cabd 100644
--- a/.github/workflows/fbgemm_release_build_cpu.yml
+++ b/.github/workflows/fbgemm_release_build_cpu.yml
@@ -22,10 +22,18 @@ on:
   #
   workflow_dispatch:
 
+concurrency:
+  # Cancel in-progress runs of this workflow for the same PR (or the same ref, for non-PR events) when a new run starts
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -40,6 +48,9 @@ jobs:
         python-version: [ "3.8", "3.9", "3.10" ]
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -85,6 +96,9 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -99,6 +113,9 @@ jobs:
     needs: build_artifact
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with: