Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[T145005253][T143174754] Add Comprehensive Build Instructions and Isolate CPU and ROCm Builds #1639

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 132 additions & 81 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@
print_exec () {
echo "+ $*"
echo ""
"$@"
if "$@"; then
local retcode=0
else
local retcode=$?
fi
echo ""
return $retcode
}

exec_with_retries () {
Expand Down Expand Up @@ -205,7 +210,7 @@ run_python_test () {
echo "################################################################################"
fi

if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
echo "[TEST] Python test suite PASSED: ${python_test_file}"
else
echo "[TEST] Python test suite FAILED: ${python_test_file}"
Expand Down Expand Up @@ -313,7 +318,7 @@ print_ec2_info () {


################################################################################
# Environment Setup and Install Functions
# Miniconda Setup Functions
################################################################################

setup_miniconda () {
Expand Down Expand Up @@ -398,6 +403,11 @@ create_conda_environment () {
echo "[SETUP] Successfully created Conda environment: ${env_name}"
}


################################################################################
# PyTorch Setup Functions
################################################################################

install_pytorch_conda () {
local env_name="$1"
local pytorch_version="$2"
Expand Down Expand Up @@ -553,6 +563,28 @@ install_pytorch_pip () {
echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}"
}


################################################################################
# CUDA Setup Functions
################################################################################

install_nvidia_drivers_centos () {
  # Register the EPEL 7 and NVIDIA CUDA (rhel7) yum repositories, then install
  # the `nvidia-driver-latest-dkms` package via install_system_packages.
  #
  # Arguments: none.
  # Outputs:   a banner with a UTC timestamp plus [SETUP] progress lines.
  # Returns:   the status of the final install_system_packages call; the
  #            statuses of the individual yum steps are not checked.
  local banner_rule="################################################################################"

  printf '%s\n' \
    "${banner_rule}" \
    "# Install NVIDIA Drivers" \
    "#" \
    "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" \
    "${banner_rule}" \
    ""

  printf '%s\n' "[SETUP] Adding NVIDIA repos to yum ..."
  print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
  print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
  # Drop cached repo metadata so the repositories added above are picked up
  print_exec sudo yum clean expire-cache

  printf '%s\n' "[SETUP] Installing NVIDIA drivers ..."
  install_system_packages nvidia-driver-latest-dkms
}

install_cuda () {
local env_name="$1"
local cuda_version="$2"
Expand Down Expand Up @@ -604,6 +636,86 @@ install_cuda () {
echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

install_cudnn () {
  # Download a cuDNN archive matching the given CUDA version, place its
  # `include` and `lib` directories under INSTALL_PATH, and record their
  # locations as CUDNN_INCLUDE_DIR / CUDNN_LIBRARY in the Conda environment.
  #
  # Arguments:
  #   $1 - env_name:     Conda environment that receives the config vars
  #   $2 - install_path: directory to (re)create and install cuDNN into;
  #                      any existing content is deleted
  #   $3 - cuda_version: CUDA version such as 11.7 or 11.7.1; only the major
  #                      and minor components select the archive
  # Returns: 0 on success, 1 on bad usage or when any step fails.
  #
  # The caller's working directory is never changed, and the temporary
  # download directory is removed on both success and failure.
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$install_path" == "" ] || [ "$cuda_version" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Install cuDNN manually
  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
  # Indexed (sparse) array keyed by <major><minor> of the CUDA version
  local cudnn_packages=(
    [115]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    [116]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    [117]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    [118]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  # Fetch the major and minor version to concat
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Get the URL.  Only purely numeric keys are looked up (forced to base 10)
  # so that odd version strings are never evaluated as arithmetic expressions.
  local cudnn_url=""
  if [[ "$cuda_concat_version" =~ ^[0-9]+$ ]]; then
    cudnn_url="${cudnn_packages[$((10#${cuda_concat_version}))]}"
  fi
  if [ "$cudnn_url" == "" ]; then
    # Default to cuDNN for 11.7 if no CUDA version fits
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Clear the install path
  rm -rf "${install_path:?}" || return 1
  mkdir -p "$install_path" || return 1

  # Create temporary directory (declared separately so a mktemp failure is
  # not masked by `local`)
  local tmp_dir
  tmp_dir=$(mktemp -d) || return 1

  # All work below addresses the temporary directory by path instead of
  # cd-ing into it, so a relative install_path keeps working and an early
  # exit cannot strand the caller in a deleted directory.
  local status=0

  # Download cuDNN
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  (exec_with_retries wget -q "$cudnn_url" -O "${tmp_dir}/cudnn.tar.xz") || status=1

  if [ "$status" -eq 0 ]; then
    # Unpack the tarball
    echo "[INSTALL] Unpacking cuDNN ..."
    tar -xvf "${tmp_dir}/cudnn.tar.xz" -C "$tmp_dir" || status=1
  fi

  if [ "$status" -eq 0 ]; then
    # Move the includes and libs over to the (freshly created) install path
    echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
    mv "$tmp_dir"/cudnn-linux-*/include "$install_path" || status=1
    mv "$tmp_dir"/cudnn-linux-*/lib "$install_path" || status=1
  fi

  # Delete the temporary directory regardless of the outcome
  rm -rf "$tmp_dir"

  if [ "$status" -ne 0 ]; then
    echo "[INSTALL] Failed to download or unpack cuDNN from ${cudnn_url}"
    return 1
  fi

  # Export the environment variables to the Conda environment
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" || return 1

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}

################################################################################
# ROCm Setup Functions
################################################################################

install_rocm_ubuntu () {
local env_name="$1"
local rocm_version="$2"
Expand Down Expand Up @@ -652,15 +764,25 @@ install_rocm_ubuntu () {
(exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1

echo "[INSTALL] Installing HIP-relevant packages ..."
install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
install_system_packages hipify-clang miopen-hip miopen-hip-dev

# There is no need to install these packages for ROCm
# install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev

echo "[INSTALL] Cleaning up ..."
print_exec rm -f "${package_name}"

echo "[INFO] Check ROCM GPU info ..."
print_exec rocm-smi

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
}


################################################################################
# Build Tools Setup Functions
################################################################################

install_cxx_compiler () {
local env_name="$1"
local use_system_package_manager="$2"
Expand Down Expand Up @@ -759,82 +881,6 @@ install_build_tools () {
echo "[INSTALL] Successfully installed all the build tools"
}

install_cudnn () {
  # Download a cuDNN archive matching the given CUDA version, place its
  # `include` and `lib` directories under INSTALL_PATH, and record their
  # locations as CUDNN_INCLUDE_DIR / CUDNN_LIBRARY in the Conda environment.
  #
  # Arguments:
  #   $1 - env_name:     Conda environment that receives the config vars
  #   $2 - install_path: directory to (re)create and install cuDNN into;
  #                      any existing content is deleted
  #   $3 - cuda_version: CUDA version such as 11.7 or 11.7.1; only the major
  #                      and minor components select the archive
  # Returns: 0 on success, 1 on bad usage or when any step fails.
  #
  # The caller's working directory is never changed, and the temporary
  # download directory is removed on both success and failure.
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$install_path" == "" ] || [ "$cuda_version" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Install cuDNN manually
  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
  # Indexed (sparse) array keyed by <major><minor> of the CUDA version
  local cudnn_packages=(
    [115]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    [116]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    [117]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    [118]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  # Fetch the major and minor version to concat
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Get the URL.  Only purely numeric keys are looked up (forced to base 10)
  # so that odd version strings are never evaluated as arithmetic expressions.
  local cudnn_url=""
  if [[ "$cuda_concat_version" =~ ^[0-9]+$ ]]; then
    cudnn_url="${cudnn_packages[$((10#${cuda_concat_version}))]}"
  fi
  if [ "$cudnn_url" == "" ]; then
    # Default to cuDNN for 11.7 if no CUDA version fits
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Clear the install path
  rm -rf "${install_path:?}" || return 1
  mkdir -p "$install_path" || return 1

  # Create temporary directory (declared separately so a mktemp failure is
  # not masked by `local`)
  local tmp_dir
  tmp_dir=$(mktemp -d) || return 1

  # All work below addresses the temporary directory by path instead of
  # cd-ing into it, so a relative install_path keeps working and an early
  # exit cannot strand the caller in a deleted directory.
  local status=0

  # Download cuDNN
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  (exec_with_retries wget -q "$cudnn_url" -O "${tmp_dir}/cudnn.tar.xz") || status=1

  if [ "$status" -eq 0 ]; then
    # Unpack the tarball
    echo "[INSTALL] Unpacking cuDNN ..."
    tar -xvf "${tmp_dir}/cudnn.tar.xz" -C "$tmp_dir" || status=1
  fi

  if [ "$status" -eq 0 ]; then
    # Move the includes and libs over to the (freshly created) install path
    echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
    mv "$tmp_dir"/cudnn-linux-*/include "$install_path" || status=1
    mv "$tmp_dir"/cudnn-linux-*/lib "$install_path" || status=1
  fi

  # Delete the temporary directory regardless of the outcome
  rm -rf "$tmp_dir"

  if [ "$status" -ne 0 ]; then
    echo "[INSTALL] Failed to download or unpack cuDNN from ${cudnn_url}"
    return 1
  fi

  # Export the environment variables to the Conda environment
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" || return 1

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}


################################################################################
# Combination Functions
Expand Down Expand Up @@ -876,7 +922,7 @@ create_conda_pytorch_environment () {


################################################################################
# Build Functions
# FBGEMM_GPU Build Functions
################################################################################

prepare_fbgemm_gpu_build () {
Expand All @@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () {
echo ""
fi

if [[ "${GITHUB_WORKSPACE}" ]]; then
# https://github.com/actions/checkout/issues/841
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
fi

echo "[BUILD] Running git submodules update ..."
git submodule sync
git submodule update --init --recursive
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
build-posix:
runs-on: ${{ matrix.os }}
Expand Down
19 changes: 17 additions & 2 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,17 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
build_and_test_amd:
runs-on: ${{ matrix.os }}
container:
image: ${{ matrix.container-image }}
options: --user root
defaults:
run:
shell: bash
Expand All @@ -25,11 +33,18 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-20.04 ]
os: [ linux.12xlarge ]
container-image: [ "ubuntu:20.04" ]
python-version: [ "3.10" ]
rocm-version: [ "5.3" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils git sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand Down Expand Up @@ -74,7 +89,7 @@ jobs:
print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
print_exec conda run -n $BUILD_ENV python setup.py build develop
- name: Test FBGEMM_GPU-ROCM Nightly installation
- name: Test FBGEMM_GPU-ROCM Nightly Installation
timeout-minutes: 10
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_gpu_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ on:
branches:
- main

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
run_pylint:
runs-on: ubuntu-latest
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/fbgemm_nightly_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ on:
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
Expand Down
Loading