From 79850c57777d661fd7b55115a00ece81c283bfb0 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Wed, 8 Mar 2023 14:45:10 -0800
Subject: [PATCH 1/3] [T145005253] Remove `.post0` from autogenerated version

- Remove `.post0` suffix from the autogenerated package version
---
 fbgemm_gpu/setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py
index 6b8ebbb570..2b34cb240a 100644
--- a/fbgemm_gpu/setup.py
+++ b/fbgemm_gpu/setup.py
@@ -7,6 +7,7 @@
 import argparse
 import os
 import random
+import re
 import subprocess
 import sys
@@ -38,8 +39,9 @@ def generate_package_version(package_name: str):
     print(
         f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}"
     )
-    # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0)
-    version = gitversion.version_from_git().split("+")[0]
+    # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0)
+    # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0)
+    version = re.sub(r"\.post0$", "", gitversion.version_from_git().split("+")[0])
     print(f"[SETUP.PY] Setting the package version: {version}")
     return version

From da2abb0834984748b7bdc83f7db3cbc243a378ef Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Thu, 9 Mar 2023 18:08:10 -0800
Subject: [PATCH 2/3] [T143174754] Document the FBGEMM_GPU build process

- Document the full FBGEMM_GPU OSS build process in a separate Markdown file
- Remove installation of packages not needed for ROCm builds
---
 .github/scripts/setup_env.bash       |  13 +-
 fbgemm_gpu/docs/BuildInstructions.md | 430 +++++++++++++++++++++++++++
 fbgemm_gpu/docs/README.md            |   2 +-
 3 files changed, 441 insertions(+), 4 deletions(-)
 create mode 100644 fbgemm_gpu/docs/BuildInstructions.md

diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash
index 4f1c808598..29f0aac1ed 100755
--- a/.github/scripts/setup_env.bash
+++ b/.github/scripts/setup_env.bash
@@ -13,8 +13,13 @@
 print_exec () {
   echo "+ $*"
   echo ""
-  "$@"
+  if "$@"; then
+    local retcode=0
+  else
+    local retcode=$?
+  fi
   echo ""
+  return $retcode
 }

 exec_with_retries () {
@@ -205,7 +210,7 @@ run_python_test () {
     echo "################################################################################"
   fi

-  if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+  if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
   else
     echo "[TEST] Python test suite FAILED: ${python_test_file}"
@@ -652,9 +657,11 @@ install_rocm_ubuntu () {
   (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1

   echo "[INSTALL] Installing HIP-relevant packages ..."
-  install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
   install_system_packages hipify-clang miopen-hip miopen-hip-dev

+  # There is no need to install these packages for ROCm
+  # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
+
   echo "[INSTALL] Cleaning up ..."
  print_exec rm -f "${package_name}"

diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md
new file mode 100644
index 0000000000..a90a059b40
--- /dev/null
+++ b/fbgemm_gpu/docs/BuildInstructions.md
@@ -0,0 +1,430 @@
# FBGEMM_GPU Build Instructions

The most up-to-date instructions are embedded in
[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for
building FBGEMM_GPU are as follows:

1. Set up an isolated environment for building (Miniconda)
1. Install the relevant build tools (C/C++ compiler)
1. Set up for either a CUDA, ROCm, or CPU build
1. Install PyTorch
1. Run the build


## Set Up an Isolated Build Environment

### Install Miniconda

Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
environment is recommended for reproducible builds:

```sh
# Set the Miniconda prefix directory
miniconda_prefix=$HOME/miniconda

# Download the Miniconda installer
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh

# Run the installer
bash miniconda.sh -b -p "$miniconda_prefix" -u

# Load the shortcuts
. ~/.bashrc

# Run updates
conda update -n base -c defaults -y conda
```

From here on out, all installation commands will be run against or inside the
Conda environment.


### Set Up the Conda Environment

Create a Conda environment with the specified Python version:

```sh
env_name=
python_version=3.10

# Create the environment
conda create -y --name "${env_name}" python="${python_version}"

# Upgrade PIP and the pyOpenSSL package
conda run -n "${env_name}" pip install --upgrade pip
conda run -n "${env_name}" python -m pip install "pyOpenSSL>22.1.0"
```

## Install the Build Tools

### C/C++ Compiler

Install the GCC toolchain. Note that GCC (as opposed to LLVM, for example) is
required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and
`g++` being in the path.

```sh
conda install -n "${env_name}" -y gxx_linux-64=9.3.0
```

While newer versions of GCC can be used, binaries compiled under newer versions
of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS
Stream 8, because the compiled library will reference symbols from versions of
`GLIBCXX` that the system's `libstdc++.so.6` does not support. To see which
versions of GLIBCXX the available `libstdc++.so.6` supports:

```sh
libcxx_path=/path/to/libstdc++.so.6
objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
```

### Other Build Tools

Install the other necessary build tools, such as `ninja`, `cmake`, etc:

```sh
conda install -n "${env_name}" -y \
    click \
    cmake \
    hypothesis \
    jinja2 \
    ninja \
    numpy \
    scikit-build \
    wheel
```


## Set Up for CUDA Build

The CUDA build of FBGEMM_GPU requires a version of `nvcc` that supports compute
capability 3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be
done either through pre-built Docker images or through Conda installation on
bare metal. Note that neither a GPU nor the NVIDIA drivers need to be present
for builds, since they are only used at runtime.

### Docker Image

For setups through Docker, simply pull the pre-installed
[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired
Linux distribution and CUDA version.

```sh
# Run for Ubuntu 22.04, CUDA 11.8
docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04
```

From there, the rest of the build environment may be constructed through Conda.
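The `-devel` image variants, as in the example above, already ship with the
CUDA toolkit. As a quick sanity check (a sketch, assuming such a variant is
used), verify that the CUDA compiler is visible inside the container:

```sh
# Confirm that the container provides the CUDA toolkit and compiler
nvcc --version
```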
### Install CUDA

Install the full CUDA package through Conda, which includes
[NVML](https://developer.nvidia.com/nvidia-management-library-nvml):

```sh
cuda_version=11.7.1

# Install the full CUDA package
conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}"
```

Ensure that, at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are
found:

```sh
conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_runtime.h
find "${conda_prefix}" -name libnvidia-ml.so
```

### Install cuDNN

[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the
CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the
given CUDA version:

```sh
# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz

# Download and unpack cuDNN
wget -q "${cudnn_url}" -O cudnn.tar.xz
tar -xvf cudnn.tar.xz
```

### [OPTIONAL] Install CUB

[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency
for the CUDA variant of FBGEMM_GPU. It must be installed separately for
**CUDA versions prior to 11.1**, since those did not come packaged with CUB.

To install CUB through Conda:

```sh
conda install -c bottler nvidiacub
```

Alternatively, CUB may be installed manually by downloading it from the
[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking
the package:

```sh
# Download and unpack CUB
wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
tar -xzf 1.10.0.tar.gz
```
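After unpacking, the build can be pointed at the CUB tree through the
`CUB_DIR` environment variable, which the CUDA build step below consumes. A
minimal sketch, assuming the 1.10.0 release tarball (which unpacks to
`cub-1.10.0/`):

```sh
# Point the FBGEMM_GPU build at the unpacked CUB tree
export CUB_DIR=$(pwd)/cub-1.10.0
```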

## Set Up for ROCm Build

Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through
pre-built Docker images or through bare metal.

### Docker Image

For setups through Docker, simply pull the pre-installed
[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the
desired ROCm version.

```sh
# Run for ROCm 5.4.2
docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2
```

From there, the rest of the build environment may be constructed through Conda.

### Install ROCm

Install the full ROCm package through the operating system package manager. The
full instructions can be found in the
[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html):

```sh
# [OPTIONAL] Disable apt installation prompts
export DEBIAN_FRONTEND=noninteractive

# Update the repo DB
apt update

# Download the installer
wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb

# Run the installer
apt install ./amdgpu-install_5.4.50403-1_all.deb

# Install ROCm
amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
```

### Install MIOpen

[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the
ROCm variant of FBGEMM_GPU and needs to be installed:

```sh
apt install hipify-clang miopen-hip miopen-hip-dev
```


## Install PyTorch

The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains
the most authoritative instructions on how to install PyTorch, either through
Conda or through PIP.

### Installation Through Conda

```sh
# Install the latest nightly
conda install -n "${env_name}" -y pytorch -c pytorch-nightly
# Install the latest test (RC)
conda install -n "${env_name}" -y pytorch -c pytorch-test
# Install a specific version
conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch
```

Note that installing PyTorch through Conda without specifying a version (as in
the case of nightly builds) may not always be reliable. For example, it is known
that the GPU builds of the PyTorch nightlies arrive in Conda 2 hours later than
the CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that
time window will silently fall back to installing the CPU-only version.

Also note that, because both the GPU and CPU-only versions of PyTorch are placed
into the same artifact bucket, the PyTorch variant that is selected during
installation will depend on whether or not CUDA is installed on the system. Thus
for GPU builds, it is important to install CUDA prior to installing PyTorch.

### Installation Through PIP

Note that PIP is the only supported method of installing PyTorch for ROCm
builds.

```sh
# Install the latest nightly
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/
# Install the latest test (RC)
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/
# Install a specific version
conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/
# Install the latest nightly (ROCm 5.3)
conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
```

### Post-Install Checks

Verify the PyTorch installation with an `import` test:

```sh
conda run -n "${env_name}" python -c "import torch.distributed"
```

For the GPU variant of PyTorch, ensure that, at the minimum,
**`cuda_cmake_macros.h`** is found:

```sh
conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_cmake_macros.h
```
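As an additional sanity check, the installed PyTorch version and variant can be
printed directly; note that `torch.version.cuda` reports `None` for CPU-only
builds, which helps catch the silent CPU-only fallback described above:

```sh
# Print the PyTorch version and the CUDA version it was built against
conda run -n "${env_name}" python -c "import torch; print(torch.__version__, torch.version.cuda)"
```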

## Build the FBGEMM_GPU Package

### Preparing the Build

Clone the repo along with its submodules, and install `requirements.txt`:

```sh
# !! Run inside the Conda environment !!

# Select a version tag
FBGEMM_VERSION=v0.4.0

# Clone the repo along with its submodules
git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}

# Install additional required packages for building and testing
cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
pip install -r requirements.txt
```

### The Build Process

The FBGEMM_GPU build process uses a scikit-build CMake-based build flow that
keeps state across install runs. As such, builds can become stale and cause
problems when re-runs are attempted after a build failure due to missing
dependencies, etc. To address this, simply clear the build cache:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

python setup.py clean
```

### CUDA Build

Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and
made available to the build through environment variables:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

# [OPTIONAL] Specify the CUDA installation paths
# This may be required if CMake is unable to find nvcc
export CUDACXX=/path/to/nvcc
export CUDA_BIN_PATH=/path/to/cuda/installation

# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1)
export CUB_DIR=/path/to/cub

# Specify cuDNN header and library paths
export CUDNN_INCLUDE_DIR=/path/to/cudnn/include
export CUDNN_LIBRARY=/path/to/cudnn/lib

# Specify NVML path
export NVML_LIB_PATH=/path/to/libnvidia-ml.so

# Update to reflect the version of Python in the Conda environment
python_tag=py310
package_name=fbgemm_gpu

# Build for SM70/80 (V100/A100 GPUs); update as needed
# If not specified, only the CUDA architecture supported by the current system will be targeted
# If no CUDA device is present either, all CUDA architectures will be targeted
cuda_arch_list="7.0;8.0"

# Build the wheel artifact only
python setup.py bdist_wheel \
    --package_name="${package_name}" \
    --python-tag="${python_tag}" \
    --plat-name=manylinux1_x86_64 \
    --nvml_lib_path=${NVML_LIB_PATH} \
    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"

# Build and install the library into the Conda environment
python setup.py install \
    --nvml_lib_path=${NVML_LIB_PATH} \
    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
```

### ROCm Build

For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!

# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a')
export ROCM_PATH=/path/to/rocm
export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*')

python_tag=py310
package_name=fbgemm_gpu_rocm

# Build the wheel artifact only
python setup.py bdist_wheel \
    --package_name="${package_name}" \
    --python-tag="${python_tag}" \
    --plat-name=manylinux1_x86_64

# Build and install the library into the Conda environment
python setup.py install
```

### CPU-Only Build

For CPU-only builds, the `--cpu_only` flag needs to be specified:

```sh
# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
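# [OPTIONAL] If a CUDA or ROCm build was previously attempted in this
# directory, clear the stale build cache first (see "The Build Process" above):
#   python setup.py clean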
+ +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` From 55e030af96e192012f273596017fef32ca156717 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 13 Mar 2023 12:21:47 -0700 Subject: [PATCH 3/3] [T145005253] Migrate CPU and ROCm jobs to Docker containers - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR --- .github/scripts/setup_env.bash | 200 +++++++++++------- .github/workflows/fbgemm_ci.yml | 5 + .github/workflows/fbgemm_gpu_ci.yml | 19 +- .github/workflows/fbgemm_gpu_lint.yml | 5 + .github/workflows/fbgemm_nightly_build.yml | 5 + .../workflows/fbgemm_nightly_build_cpu.yml | 18 ++ .github/workflows/fbgemm_release_build.yml | 5 + .../workflows/fbgemm_release_build_cpu.yml | 17 ++ 8 files changed, 194 insertions(+), 80 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 29f0aac1ed..ccdac79097 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -318,7 +318,7 @@ print_ec2_info () { ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ setup_miniconda () { @@ -403,6 +403,11 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" @@ -558,6 +563,28 @@ install_pytorch_pip () { echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" } + +################################################################################ +# CUDA Setup Functions 
+################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." + install_system_packages nvidia-driver-latest-dkms +} + install_cuda () { local env_name="$1" local cuda_version="$2" @@ -609,6 +636,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." 
+ rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." + print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -665,9 +772,17 @@ install_rocm_ubuntu () { echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -766,82 +881,6 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { - local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" - return 1 - else - echo "################################################################################" - echo "# Install cuDNN" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 
11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz - - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." - print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" - - echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" -} - ################################################################################ # Combination Functions @@ -883,7 +922,7 @@ create_conda_pytorch_environment () { ################################################################################ -# Build Functions +# FBGEMM_GPU Build Functions ################################################################################ prepare_fbgemm_gpu_build () { @@ -902,6 +941,11 @@ prepare_fbgemm_gpu_build () { echo "" fi + if [[ "${GITHUB_WORKSPACE}" ]]; then + # https://github.com/actions/checkout/issues/841 + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + fi + echo "[BUILD] Running git submodules update ..." 
git submodule sync git submodule update --init --recursive diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..977b443a2b 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,6 +13,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-posix: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..bd62f23761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -13,9 +13,17 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: runs-on: ${{ matrix.os }} + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +33,18 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] + os: [ linux.12xlarge ] + container-image: [ "ubuntu:20.04" ] python-version: [ "3.10" ] rocm-version: [ "5.3" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -74,7 +89,7 @@ jobs: print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a print_exec conda run -n $BUILD_ENV python setup.py build develop - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..1ff7203108 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -14,6 +14,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: run_pylint: runs-on: ubuntu-latest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 4cdb10aaa8..bc699ef62b 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -30,6 +30,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 72a0af01e7..1125b17a0d 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -30,10 +30,19 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -48,6 +57,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -93,6 +105,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -107,6 +122,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 5e3d369fe4..def6002a76 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -22,6 +22,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index a652c89854..c7fb53cabd 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -22,10 +22,18 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -40,6 +48,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -85,6 +96,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -99,6 +113,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: