From 79850c57777d661fd7b55115a00ece81c283bfb0 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Wed, 8 Mar 2023 14:45:10 -0800
Subject: [PATCH 1/3] [T145005253] Remove `.post0` from autogenerated version

- Remove `.post0` suffix from the autogenerated package version
---
 fbgemm_gpu/setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py
index 6b8ebbb570..2b34cb240a 100644
--- a/fbgemm_gpu/setup.py
+++ b/fbgemm_gpu/setup.py
@@ -7,6 +7,7 @@
 import argparse
 import os
 import random
+import re
 import subprocess
 import sys
@@ -38,8 +39,9 @@ def generate_package_version(package_name: str):
     print(
         f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}"
     )
-    # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0)
-    version = gitversion.version_from_git().split("+")[0]
+    # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0)
+    # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0)
+    version = re.sub(r"\.post0$", "", gitversion.version_from_git().split("+")[0])
     print(f"[SETUP.PY] Setting the package version: {version}")
     return version

From da2abb0834984748b7bdc83f7db3cbc243a378ef Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Thu, 9 Mar 2023 18:08:10 -0800
Subject: [PATCH 2/3] [T143174754] Document the FBGEMM_GPU build process

- Document the full FBGEMM_GPU OSS build process in a separate Markdown file
- Remove installation of packages not needed for ROCm builds
---
 .github/scripts/setup_env.bash       |  13 +-
 fbgemm_gpu/docs/BuildInstructions.md | 430 +++++++++++++++++++++++++++
 fbgemm_gpu/docs/README.md            |   2 +-
 3 files changed, 441 insertions(+), 4 deletions(-)
 create mode 100644 fbgemm_gpu/docs/BuildInstructions.md

diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash
index 4f1c808598..29f0aac1ed 100755
--- a/.github/scripts/setup_env.bash
+++ b/.github/scripts/setup_env.bash
@@ -13,8 +13,13 @@
 print_exec () {
   echo "+ $*"
   echo ""
-  "$@"
+  if "$@"; then
+    local retcode=0
+  else
+    local retcode=$?
+  fi
   echo ""
+  return $retcode
 }

 exec_with_retries () {
@@ -205,7 +210,7 @@ run_python_test () {
     echo "################################################################################"
   fi

-  if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+  if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
   else
     echo "[TEST] Python test suite FAILED: ${python_test_file}"
@@ -652,9 +657,11 @@ install_rocm_ubuntu () {
   (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1

   echo "[INSTALL] Installing HIP-relevant packages ..."
-  install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
   install_system_packages hipify-clang miopen-hip miopen-hip-dev

+  # There is no need to install these packages for ROCm
+  # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
+
   echo "[INSTALL] Cleaning up ..."
  print_exec rm -f "${package_name}"

diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md
new file mode 100644
index 0000000000..a90a059b40
--- /dev/null
+++ b/fbgemm_gpu/docs/BuildInstructions.md
@@ -0,0 +1,430 @@
# FBGEMM_GPU Build Instructions

The most up-to-date instructions are embedded in
[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for
building FBGEMM_GPU are as follows:

1. Set up an isolated environment for building (Miniconda)
1. Install the relevant build tools (C/C++ compiler)
1. Set up for either a CUDA, ROCm, or CPU build
1. Install PyTorch
1. Run the build


## Set Up an Isolated Build Environment

### Install Miniconda

Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
environment is recommended for reproducible builds:

```sh
# Set the Miniconda prefix directory
miniconda_prefix=$HOME/miniconda

# Download the Miniconda installer
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh

# Run the installer
bash miniconda.sh -b -p "$miniconda_prefix" -u

# Load the shortcuts
. ~/.bashrc

# Run updates
conda update -n base -c defaults -y conda
```

From here on out, all installation commands will be run against or inside the
Conda environment.


### Set Up the Conda Environment

Create a Conda environment with the specified Python version:

```sh
env_name=
python_version=3.10

# Create the environment
conda create -y --name "${env_name}" python="${python_version}"

# Upgrade PIP and the pyOpenSSL package
conda run -n "${env_name}" pip install --upgrade pip
conda run -n "${env_name}" python -m pip install "pyOpenSSL>22.1.0"
```

## Install the Build Tools

### C/C++ Compiler

Install the GCC toolchain. Note that GCC (as opposed to LLVM, for example) is
required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and
`g++` being in the path.

```sh
conda install -n "${env_name}" -y gxx_linux-64=9.3.0
```

While newer versions of GCC can be used, binaries compiled under newer versions
of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS
Stream 8, because the compiled library will reference symbols from versions of
`GLIBCXX` that the system's `libstdc++.so.6` does not support. To see which
versions of GLIBCXX the available `libstdc++.so.6` supports:

```sh
libcxx_path=/path/to/libstdc++.so.6
objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
```

### Other Build Tools

Install the other necessary build tools, such as `ninja`, `cmake`, etc:

```sh
conda install -n "${env_name}" -y \
    click \
    cmake \
    hypothesis \
    jinja2 \
    ninja \
    numpy \
    scikit-build \
    wheel
```


## Set Up for CUDA Build

The CUDA build of FBGEMM_GPU requires a version of `nvcc` that supports compute
capability 3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be
done either through pre-built Docker images or through Conda installation on
bare metal. Note that neither a GPU nor the NVIDIA drivers need to be present
for builds, since they are only used at runtime.

### Docker Image

For setups through Docker, simply pull the pre-installed
[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired
Linux distribution and CUDA version.

```sh
# Run for Ubuntu 22.04, CUDA 11.8
docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04
```

From there, the rest of the build environment may be constructed through Conda.
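The `-devel` image variants, as in the example above, already ship with the
CUDA toolkit. As a quick sanity check (a sketch, assuming such a variant is
used), verify that the CUDA compiler is visible inside the container:

```sh
# Confirm that the container provides the CUDA toolkit and compiler
nvcc --version
```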
### Install CUDA

Install the full CUDA package through Conda, which includes
[NVML](https://developer.nvidia.com/nvidia-management-library-nvml):

```sh
cuda_version=11.7.1

# Install the full CUDA package
conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}"
```

Ensure that, at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are
found:

```sh
conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_runtime.h
find "${conda_prefix}" -name libnvidia-ml.so
```

### Install cuDNN

[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the
CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the
given CUDA version:

```sh
# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz

# Download and unpack cuDNN
wget -q "${cudnn_url}" -O cudnn.tar.xz
tar -xvf cudnn.tar.xz
```

### [OPTIONAL] Install CUB

[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency
for the CUDA variant of FBGEMM_GPU. It must be installed separately for
**CUDA versions prior to 11.1**, since those did not come packaged with CUB.

To install CUB through Conda:

```sh
conda install -c bottler nvidiacub
```

Alternatively, CUB may be installed manually by downloading it from the
[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking
the package:

```sh
# Download and unpack CUB
wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
tar -xzf 1.10.0.tar.gz
```
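After unpacking, the build can be pointed at the CUB tree through the
`CUB_DIR` environment variable, which the CUDA build step below consumes. A
minimal sketch, assuming the 1.10.0 release tarball (which unpacks to
`cub-1.10.0/`):

```sh
# Point the FBGEMM_GPU build at the unpacked CUB tree
export CUB_DIR=$(pwd)/cub-1.10.0
```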

## Set Up for ROCm Build

Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through
pre-built Docker images or through bare metal.

### Docker Image

For setups through Docker, simply pull the pre-installed
[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the
desired ROCm version.

```sh
# Run for ROCm 5.4.2
docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2
```

From there, the rest of the build environment may be constructed through Conda.

### Install ROCm

Install the full ROCm package through the operating system package manager. The
full instructions can be found in the
[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html):

```sh
# [OPTIONAL] Disable apt installation prompts
export DEBIAN_FRONTEND=noninteractive

# Update the repo DB
apt update

# Download the installer
wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb

# Run the installer
apt install ./amdgpu-install_5.4.50403-1_all.deb

# Install ROCm
amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
```

### Install MIOpen

[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the
ROCm variant of FBGEMM_GPU and needs to be installed:

```sh
apt install hipify-clang miopen-hip miopen-hip-dev
```


## Install PyTorch

The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains
the most authoritative instructions on how to install PyTorch, either through
Conda or through PIP.

### Installation Through Conda

```sh
# Install the latest nightly
conda install -n "${env_name}" -y pytorch -c pytorch-nightly
# Install the latest test (RC)
conda install -n "${env_name}" -y pytorch -c pytorch-test
# Install a specific version
conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch
```

Note that installing PyTorch through Conda without specifying a version (as in
the case of nightly builds) may not always be reliable. For example, it is known
that the GPU builds of the PyTorch nightlies arrive in Conda 2 hours later than
the CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that
time window will silently fall back to installing the CPU-only version.

Also note that, because both the GPU and CPU-only versions of PyTorch are placed
into the same artifact bucket, the PyTorch variant that is selected during
installation will depend on whether or not CUDA is installed on the system. Thus
for GPU builds, it is important to install CUDA prior to installing PyTorch.

### Installation Through PIP

Note that PIP is the only supported method of installing PyTorch for ROCm
builds.

```sh
# Install the latest nightly
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/
# Install the latest test (RC)
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/
# Install a specific version
conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/
# Install the latest nightly (ROCm 5.3)
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
```

### Post-Install Checks

Verify the PyTorch installation with an `import` test:

```sh
conda run -n "${env_name}" python -c "import torch.distributed"
```

For the GPU variant of PyTorch, ensure that, at the minimum,
**`cuda_cmake_macros.h`** is found:

```sh
conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_cmake_macros.h
```
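As an additional sanity check, the installed PyTorch version and variant can be
printed directly; note that `torch.version.cuda` reports `None` for CPU-only
builds, which helps catch the silent CPU-only fallback described above:

```sh
# Print the PyTorch version and the CUDA version it was built against
conda run -n "${env_name}" python -c "import torch; print(torch.__version__, torch.version.cuda)"
```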

## Build the FBGEMM_GPU Package

### Preparing the Build

Clone the repo along with its submodules, and install `requirements.txt`:

```sh
# !! Run inside the Conda environment !!

# Select a version tag
FBGEMM_VERSION=v0.4.0

# Clone the repo along with its submodules
git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}

# Install additional required packages for building and testing
cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
pip install -r requirements.txt
```

### The Build Process

The FBGEMM_GPU build process uses a scikit-build CMake-based build flow that
keeps state across install runs. As such, builds can become stale and cause
problems when re-runs are attempted after a build failure due to missing
dependencies, etc. To address this, simply clear the build cache:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

python setup.py clean
```

### CUDA Build

Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and
made available to the build through environment variables:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

# [OPTIONAL] Specify the CUDA installation paths
# This may be required if CMake is unable to find nvcc
export CUDACXX=/path/to/nvcc
export CUDA_BIN_PATH=/path/to/cuda/installation

# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1)
export CUB_DIR=/path/to/cub

# Specify cuDNN header and library paths
export CUDNN_INCLUDE_DIR=/path/to/cudnn/include
export CUDNN_LIBRARY=/path/to/cudnn/lib

# Specify NVML path
export NVML_LIB_PATH=/path/to/libnvidia-ml.so

# Update to reflect the version of Python in the Conda environment
python_tag=py310
package_name=fbgemm_gpu

# Build for SM70/80 (V100/A100 GPUs); update as needed
# If not specified, only the CUDA architecture supported by the current system will be targeted
# If no CUDA device is present either, all CUDA architectures will be targeted
cuda_arch_list="7.0;8.0"

# Build the wheel artifact only
python setup.py bdist_wheel \
    --package_name="${package_name}" \
    --python-tag="${python_tag}" \
    --plat-name=manylinux1_x86_64 \
    --nvml_lib_path=${NVML_LIB_PATH} \
    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"

# Build and install the library into the Conda environment
python setup.py install \
    --nvml_lib_path=${NVML_LIB_PATH} \
    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
```

### ROCm Build

For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a')
export ROCM_PATH=/path/to/rocm
export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*')

python_tag=py310
package_name=fbgemm_gpu_rocm

# Build the wheel artifact only
python setup.py bdist_wheel \
    --package_name="${package_name}" \
    --python-tag="${python_tag}" \
    --plat-name=manylinux1_x86_64

# Build and install the library into the Conda environment
python setup.py install
```

### CPU-Only Build

For CPU-only builds, the `--cpu_only` flag needs to be specified:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
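# [OPTIONAL] If a CUDA or ROCm build was previously attempted in this
# directory, clear the stale build cache first (see "The Build Process" above):
#   python setup.py clean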
+ +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` From 55e030af96e192012f273596017fef32ca156717 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 13 Mar 2023 12:21:47 -0700 Subject: [PATCH 3/3] [T145005253] Migrate CPU and ROCm jobs to Docker containers - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR --- .github/scripts/setup_env.bash | 200 +++++++++++------- .github/workflows/fbgemm_ci.yml | 5 + .github/workflows/fbgemm_gpu_ci.yml | 19 +- .github/workflows/fbgemm_gpu_lint.yml | 5 + .github/workflows/fbgemm_nightly_build.yml | 5 + .../workflows/fbgemm_nightly_build_cpu.yml | 18 ++ .github/workflows/fbgemm_release_build.yml | 5 + .../workflows/fbgemm_release_build_cpu.yml | 17 ++ 8 files changed, 194 insertions(+), 80 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 29f0aac1ed..ccdac79097 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -318,7 +318,7 @@ print_ec2_info () { ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ setup_miniconda () { @@ -403,6 +403,11 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" @@ -558,6 +563,28 @@ install_pytorch_pip () { echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" } + +################################################################################ +# CUDA Setup Functions 
+################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." + install_system_packages nvidia-driver-latest-dkms +} + install_cuda () { local env_name="$1" local cuda_version="$2" @@ -609,6 +636,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." 
+ rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." + print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -665,9 +772,17 @@ install_rocm_ubuntu () { echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -766,82 +881,6 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { - local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" - return 1 - else - echo "################################################################################" - echo "# Install cuDNN" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 
11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz - - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." - print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" - - echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" -} - ################################################################################ # Combination Functions @@ -883,7 +922,7 @@ create_conda_pytorch_environment () { ################################################################################ -# Build Functions +# FBGEMM_GPU Build Functions ################################################################################ prepare_fbgemm_gpu_build () { @@ -902,6 +941,11 @@ prepare_fbgemm_gpu_build () { echo "" fi + if [[ "${GITHUB_WORKSPACE}" ]]; then + # https://github.com/actions/checkout/issues/841 + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + fi + echo "[BUILD] Running git submodules update ..." 
git submodule sync git submodule update --init --recursive diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..977b443a2b 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,6 +13,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-posix: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..bd62f23761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -13,9 +13,17 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: runs-on: ${{ matrix.os }} + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +33,18 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] + os: [ linux.12xlarge ] + container-image: [ "ubuntu:20.04" ] python-version: [ "3.10" ] rocm-version: [ "5.3" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -74,7 +89,7 @@ jobs: print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a print_exec conda run -n $BUILD_ENV python setup.py build develop - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..1ff7203108 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -14,6 +14,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: run_pylint: runs-on: ubuntu-latest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 4cdb10aaa8..bc699ef62b 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -30,6 +30,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 72a0af01e7..1125b17a0d 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -30,10 +30,19 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -48,6 +57,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -93,6 +105,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -107,6 +122,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 5e3d369fe4..def6002a76 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -22,6 +22,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index a652c89854..c7fb53cabd 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -22,10 +22,18 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -40,6 +48,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -85,6 +96,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -99,6 +113,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: