Skip to content

Commit

Permalink
[T145005253] Migrate CPU and ROCm jobs to Docker containers
Browse files Browse the repository at this point in the history
- Migrate CPU and ROCm jobs to run on top of Docker containers instead
of bare metal instances

- Update GitHub workflow configuration to cancel previous jobs for a PR
if a new commit is pushed to the PR
  • Loading branch information
q10 committed Mar 14, 2023
1 parent 04bbea5 commit c3533b6
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 81 deletions.
200 changes: 122 additions & 78 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ print_ec2_info () {


################################################################################
# Environment Setup and Install Functions
# Miniconda Setup Functions
################################################################################

setup_miniconda () {
Expand Down Expand Up @@ -403,6 +403,11 @@ create_conda_environment () {
echo "[SETUP] Successfully created Conda environment: ${env_name}"
}


################################################################################
# PyTorch Setup Functions
################################################################################

install_pytorch_conda () {
local env_name="$1"
local pytorch_version="$2"
Expand Down Expand Up @@ -558,6 +563,28 @@ install_pytorch_pip () {
echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}"
}


################################################################################
# CUDA Setup Functions
################################################################################

# Registers the EPEL and NVIDIA CUDA (RHEL 7) repositories with yum and then
# installs the `nvidia-driver-latest-dkms` package.
#
# Takes no arguments. Relies on the `print_exec` and `install_system_packages`
# helpers defined elsewhere in this script. The return status is that of the
# final `install_system_packages` call.
install_nvidia_drivers_centos () {
  local banner_rule="################################################################################"
  local epel_release_rpm="https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm"
  local nvidia_cuda_repo="https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo"

  # Section banner, timestamped in UTC with millisecond precision
  printf '%s\n' \
    "${banner_rule}" \
    "# Install NVIDIA Drivers" \
    "#" \
    "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" \
    "${banner_rule}" \
    ""

  # Make the driver packages visible to yum, then refresh the cached metadata
  printf '%s\n' "[SETUP] Adding NVIDIA repos to yum ..."
  print_exec sudo yum install -y "${epel_release_rpm}"
  print_exec sudo yum-config-manager --add-repo "${nvidia_cuda_repo}"
  print_exec sudo yum clean expire-cache

  printf '%s\n' "[SETUP] Installing NVIDIA drivers ..."
  install_system_packages nvidia-driver-latest-dkms
}

install_cuda () {
local env_name="$1"
local cuda_version="$2"
Expand Down Expand Up @@ -609,6 +636,86 @@ install_cuda () {
echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

# Downloads the cuDNN archive that matches CUDA_VERSION, places its `include/`
# and `lib/` directories under INSTALL_PATH, and records their locations in the
# Conda environment ENV_NAME as CUDNN_INCLUDE_DIR and CUDNN_LIBRARY.
#
# Arguments:
#   $1 - ENV_NAME:      Conda environment that receives the two variables
#   $2 - INSTALL_PATH:  destination directory; it is wiped and recreated. A
#                       relative path is resolved against the current directory.
#   $3 - CUDA_VERSION:  dotted version such as 11.7 or 11.7.1; versions without
#                       a known cuDNN package fall back to the CUDA 11.7 package
# Returns:
#   1 on a usage error or when the download, unpack or move step fails.
#
# The caller's working directory is never changed, and the temporary download
# directory is removed on every exit path after it has been created.
install_cudnn () {
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$cuda_version" == "" ] || [ "$install_path" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Install cuDNN manually
  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
  #
  # NOTE: This is a sparse *indexed* array keyed by the integer formed from the
  # concatenated CUDA major and minor versions (e.g. 11.7 => 117).
  local cudnn_packages=(
    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  # Fetch the major and minor version to concat
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Get the URL (the subscript is evaluated arithmetically, so the variable
  # name is dereferenced to its numeric value)
  local cudnn_url="${cudnn_packages[cuda_concat_version]}"
  if [ "$cudnn_url" == "" ]; then
    # Default to cuDNN for 11.7 if no CUDA version fits
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Anchor a relative install path to the current directory so that the values
  # stored in the Conda environment stay valid from any working directory
  case "$install_path" in
    /*) ;;
    *) install_path="${PWD}/${install_path}" ;;
  esac

  # Clear the install path
  rm -rf "${install_path:?}"
  mkdir -p "$install_path" || return 1

  # Create temporary directory (declared separately so that a mktemp failure
  # is not masked by the exit status of `local`)
  local tmp_dir
  tmp_dir=$(mktemp -d) || return 1
  local archive="${tmp_dir}/cudnn.tar.xz"

  # Download cuDNN
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  if ! (exec_with_retries wget -q "$cudnn_url" -O "$archive"); then
    rm -rf "$tmp_dir"
    return 1
  fi

  # Unpack the tarball inside the temporary directory; -C is used instead of
  # `cd` so the caller's working directory is left untouched
  echo "[INSTALL] Unpacking cuDNN ..."
  if ! tar -xvf "$archive" -C "$tmp_dir"; then
    echo "[INSTALL] Failed to unpack the cuDNN archive"
    rm -rf "$tmp_dir"
    return 1
  fi

  # Move the includes and libs over to the (freshly created) install path
  echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
  if ! mv "${tmp_dir}"/cudnn-linux-*/include "${tmp_dir}"/cudnn-linux-*/lib "$install_path"; then
    echo "[INSTALL] Failed to move the cuDNN files to ${install_path}"
    rm -rf "$tmp_dir"
    return 1
  fi

  # Delete the temporary directory
  rm -rf "$tmp_dir"

  # Export the environment variables to the Conda environment
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}

################################################################################
# ROCm Setup Functions
################################################################################

install_rocm_ubuntu () {
local env_name="$1"
local rocm_version="$2"
Expand Down Expand Up @@ -665,9 +772,17 @@ install_rocm_ubuntu () {
echo "[INSTALL] Cleaning up ..."
print_exec rm -f "${package_name}"

echo "[INFO] Check ROCM GPU info ..."
print_exec rocm-smi

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
}


################################################################################
# Build Tools Setup Functions
################################################################################

install_cxx_compiler () {
local env_name="$1"
local use_system_package_manager="$2"
Expand Down Expand Up @@ -766,82 +881,6 @@ install_build_tools () {
echo "[INSTALL] Successfully installed all the build tools"
}

# Downloads the cuDNN archive that matches CUDA_VERSION, places its `include/`
# and `lib/` directories under INSTALL_PATH, and records their locations in the
# Conda environment ENV_NAME as CUDNN_INCLUDE_DIR and CUDNN_LIBRARY.
#
# Arguments:
#   $1 - ENV_NAME:      Conda environment that receives the two variables
#   $2 - INSTALL_PATH:  destination directory; it is wiped and recreated. A
#                       relative path is resolved against the current directory.
#   $3 - CUDA_VERSION:  dotted version such as 11.7 or 11.7.1; versions without
#                       a known cuDNN package fall back to the CUDA 11.7 package
# Returns:
#   1 on a usage error or when the download, unpack or move step fails.
#
# The caller's working directory is never changed, and the temporary download
# directory is removed on every exit path after it has been created.
install_cudnn () {
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$cuda_version" == "" ] || [ "$install_path" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Install cuDNN manually
  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
  #
  # NOTE: This is a sparse *indexed* array keyed by the integer formed from the
  # concatenated CUDA major and minor versions (e.g. 11.7 => 117).
  local cudnn_packages=(
    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  # Fetch the major and minor version to concat
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Get the URL (the subscript is evaluated arithmetically, so the variable
  # name is dereferenced to its numeric value)
  local cudnn_url="${cudnn_packages[cuda_concat_version]}"
  if [ "$cudnn_url" == "" ]; then
    # Default to cuDNN for 11.7 if no CUDA version fits
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Anchor a relative install path to the current directory so that the values
  # stored in the Conda environment stay valid from any working directory
  case "$install_path" in
    /*) ;;
    *) install_path="${PWD}/${install_path}" ;;
  esac

  # Clear the install path
  rm -rf "${install_path:?}"
  mkdir -p "$install_path" || return 1

  # Create temporary directory (declared separately so that a mktemp failure
  # is not masked by the exit status of `local`)
  local tmp_dir
  tmp_dir=$(mktemp -d) || return 1
  local archive="${tmp_dir}/cudnn.tar.xz"

  # Download cuDNN
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  if ! (exec_with_retries wget -q "$cudnn_url" -O "$archive"); then
    rm -rf "$tmp_dir"
    return 1
  fi

  # Unpack the tarball inside the temporary directory; -C is used instead of
  # `cd` so the caller's working directory is left untouched
  echo "[INSTALL] Unpacking cuDNN ..."
  if ! tar -xvf "$archive" -C "$tmp_dir"; then
    echo "[INSTALL] Failed to unpack the cuDNN archive"
    rm -rf "$tmp_dir"
    return 1
  fi

  # Move the includes and libs over to the (freshly created) install path
  echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
  if ! mv "${tmp_dir}"/cudnn-linux-*/include "${tmp_dir}"/cudnn-linux-*/lib "$install_path"; then
    echo "[INSTALL] Failed to move the cuDNN files to ${install_path}"
    rm -rf "$tmp_dir"
    return 1
  fi

  # Delete the temporary directory
  rm -rf "$tmp_dir"

  # Export the environment variables to the Conda environment
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}


################################################################################
# Combination Functions
Expand Down Expand Up @@ -883,7 +922,7 @@ create_conda_pytorch_environment () {


################################################################################
# Build Functions
# FBGEMM_GPU Build Functions
################################################################################

prepare_fbgemm_gpu_build () {
Expand All @@ -902,6 +941,11 @@ prepare_fbgemm_gpu_build () {
echo ""
fi

if [[ "${GITHUB_WORKSPACE}" ]]; then
# https://github.com/actions/checkout/issues/841
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
fi

echo "[BUILD] Running git submodules update ..."
git submodule sync
git submodule update --init --recursive
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
build-posix:
runs-on: ${{ matrix.os }}
Expand Down
19 changes: 17 additions & 2 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,17 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
build_and_test_amd:
runs-on: ${{ matrix.os }}
container:
image: ${{ matrix.container-image }}
options: --user root
defaults:
run:
shell: bash
Expand All @@ -25,11 +33,18 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-20.04 ]
os: [ linux.12xlarge ]
container-image: [ "ubuntu:20.04" ]
python-version: [ "3.10" ]
rocm-version: [ "5.3" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils git sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand Down Expand Up @@ -74,7 +89,7 @@ jobs:
print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
print_exec conda run -n $BUILD_ENV python setup.py build develop
- name: Test FBGEMM_GPU-ROCM Nightly installation
- name: Test FBGEMM_GPU-ROCM Nightly Installation
timeout-minutes: 10
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_gpu_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
run_pylint:
runs-on: ubuntu-latest
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/fbgemm_nightly_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ on:
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
Expand Down Expand Up @@ -117,7 +122,6 @@ jobs:
cuda-version-publish: [ "11.7.1" ]
needs: build_artifact

steps:
- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand Down
18 changes: 18 additions & 0 deletions .github/workflows/fbgemm_nightly_build_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,19 @@ on:
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
# https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
runs-on: ${{ matrix.os }}
container:
image: amazonlinux:2023
options: --user root
defaults:
run:
shell: bash
Expand All @@ -48,6 +57,9 @@ jobs:
python-version: [ "3.8", "3.9", "3.10" ]

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand Down Expand Up @@ -93,6 +105,9 @@ jobs:
# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
runs-on: ${{ matrix.os }}
container:
image: amazonlinux:2023
options: --user root
defaults:
run:
shell: bash
Expand All @@ -107,6 +122,9 @@ jobs:
needs: build_artifact

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand Down
Loading

0 comments on commit c3533b6

Please sign in to comment.