diff --git a/.github/scripts/build_wheel.bash b/.github/scripts/build_wheel.bash
new file mode 100644
index 0000000000..dbf473da05
--- /dev/null
+++ b/.github/scripts/build_wheel.bash
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Exit on failure
+set -e
+
+# shellcheck source=/dev/null
+. "$(dirname "$(realpath -s "$0")")/setup_env.bash"
+
+verbose=0
+package_name=""
+python_version=""
+pytorch_channel_name=""
+pytorch_cuda_version="x"
+miniconda_prefix="${HOME}/miniconda"
+
+usage () {
+  echo "Usage: bash build_wheel.bash -o PACKAGE_NAME -p PYTHON_VERSION -P PYTORCH_CHANNEL_NAME -c PYTORCH_CUDA_VERSION [-m MINICONDA_PREFIX] [-v] [-h]"
+  echo "-v : verbose"
+  echo "-h : help"
+  echo "PACKAGE_NAME : output package name (e.g., fbgemm_gpu_nightly)"
+  echo "PYTHON_VERSION : Python version (e.g., 3.7, 3.8, 3.10)"
+  echo "PYTORCH_CHANNEL_NAME: PyTorch's channel name (e.g., pytorch-nightly, pytorch-test (=pre-release), pytorch (=stable release))"
+  echo "PYTORCH_CUDA_VERSION: PyTorch's CUDA version (e.g., 11.6, 11.7)"
+  echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)"
+  echo "Example 1: Python 3.10 + PyTorch nightly (CUDA 11.7), install miniconda at /home/user/tmp/miniconda"
+  echo " bash build_wheel.bash -v -P pytorch-nightly -p 3.10 -c 11.7 -m /home/user/tmp/miniconda"
+  echo "Example 2: Python 3.10 + PyTorch stable (CPU), install miniconda at \$HOME/miniconda"
+  echo " bash build_wheel.bash -v -P pytorch -p 3.10 -c \"\""
+}
+
+while getopts vho:p:P:c:m: flag
+do
+  case "$flag" in
+    v) verbose="1";;
+    o) package_name="${OPTARG}";;
+    p) python_version="${OPTARG}";;
+    P) pytorch_channel_name="${OPTARG}";;
+    c) pytorch_cuda_version="${OPTARG}";;
+    m) miniconda_prefix="${OPTARG}";;
+    h) usage
+       exit 0;;
+    *) usage
+       exit 1;;
+  esac
+done
+
+if [ "$python_version" == "" ] || [ "$pytorch_cuda_version" == "x" ] || [ "$miniconda_prefix" == "" ] || [ "$pytorch_channel_name" == "" ] || [ "$package_name" == "" ]; then
+  usage
+  exit 1
+fi
+python_tag="${python_version//\./}" + +if [ "$verbose" == "1" ]; then + # Print each line verbosely + set -x -e +fi + +################################################################################ +echo "## 0. Minimal check" +################################################################################ + +if [ ! -d "fbgemm_gpu" ]; then + echo "Error: this script must be executed in FBGEMM/" + exit 1 +elif [ "$(which gcc 2>/dev/null)" == "" ]; then + echo "Error: GCC is needed to compile FBGEMM" + exit 1 +fi + +################################################################################ +echo "## 1. Set up Miniconda" +################################################################################ + +setup_miniconda "$miniconda_prefix" + +################################################################################ +echo "## 2. Create build_binary environment" +################################################################################ + +create_conda_environment build_binary "$python_version" "$pytorch_channel_name" "$pytorch_cuda_version" + +cd fbgemm_gpu + +# cuDNN is needed to "build" FBGEMM +install_cudnn "$miniconda_prefix/build_only/cudnn" +export CUDNN_INCLUDE_DIR="$miniconda_prefix/build_only/cudnn/include" +export CUDNN_LIBRARY="$miniconda_prefix/build_only/cudnn/lib" + +conda run -n build_binary python -m pip install -r requirements.txt + +# TODO: Do we need these checks? +ldd --version +conda info +conda run -n build_binary python --version +gcc --version +conda run -n build_binary python -c "import torch.distributed" +conda run -n build_binary python -c "import skbuild" +conda run -n build_binary python -c "import numpy" +cd ../ + +################################################################################ +echo "## 3. 
Build FBGEMM_GPU" +################################################################################ + +cd fbgemm_gpu +rm -rf dist _skbuild +if [ "$pytorch_cuda_version" == "" ]; then + # CPU version + build_arg="--cpu_only" + package_name="${package_name}_cpu" +else + # GPU version + # We build only CUDA 7.0 and 8.0 (i.e., for v100 and a100) because of 100 MB binary size limit from PYPI website. + build_arg="-DTORCH_CUDA_ARCH_LIST=7.0;8.0" +fi + +# manylinux1_x86_64 is specified for pypi upload: distribute python extensions as wheels on Linux +conda run -n build_binary python setup.py bdist_wheel --package_name="${package_name}" --python-tag="py${python_tag}" "${build_arg}" --plat-name=manylinux1_x86_64 +cd ../ + +# Usage: +# pip install $(ls fbgemm_gpu/dist/${package_name}-*.whl) +# python -c "import fbgemm_gpu" + +wheel_name="$(ls fbgemm_gpu/dist/"${package_name}"-*.whl)" +echo "Successfully built $wheel_name" diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash new file mode 100644 index 0000000000..14f0804826 --- /dev/null +++ b/.github/scripts/setup_env.bash @@ -0,0 +1,70 @@ +#!/bin/bash + +setup_miniconda () { + miniconda_prefix="$1" + if [ "$miniconda_prefix" == "" ]; then + echo "Usage: setup_miniconda MINICONDA_PREFIX_PATH" + echo "Example:" + echo " setup_miniconda /home/user/tmp/miniconda" + exit 1 + fi + if [ -d "$miniconda_prefix" ]; then + rm -rf "$miniconda_prefix" + # echo "Error: '$miniconda_prefix' already exists." 
+ # exit 1 + fi + + mkdir -p "$miniconda_prefix" + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + bash miniconda.sh -b -p "$miniconda_prefix" -u + # these variables will be exported outside + export PATH="${miniconda_prefix}/bin:${PATH}" + export CONDA="${miniconda_prefix}" +} + +create_conda_environment () { + env_name="$1" + python_version="$2" + pytorch_channel_name="$3" + pytorch_cuda_version="$4" + if [ "$python_version" == "" ]; then + echo "Usage: create_conda_environment ENV_NAME PYTHON_VERSION PYTORCH_CHANNEL_NAME PYTORCH_CUDA_VERSION" + echo "Example:" + echo " create_conda_environment build_binary 3.10 pytorch-nightly 11.7" + exit 1 + fi + conda create -y --name "$env_name" python="$python_version" + if [ "$pytorch_cuda_version" == "" ]; then + # CPU version + conda install -n "$env_name" -y pytorch cpuonly -c "$pytorch_channel_name" + else + # GPU version + conda install -n "$env_name" -y pytorch pytorch-cuda="$pytorch_cuda_version" -c "$pytorch_channel_name" -c nvidia + fi +} + +install_cudnn () { + install_path="$1" + if [ "$install_path" == "" ]; then + echo "Usage: install_cudnn INSTALL_PATH" + echo "Example:" + echo " install_cudnn \$(pwd)/cudnn_install" + exit 1 + fi + + rm -rf "$install_path" + mkdir -p "$install_path" + + # Install cuDNN manually + # See https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + mkdir -p tmp_cudnn + cd tmp_cudnn || exit + wget -q https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz -O cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz + tar xf cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-x86_64-8.5.0.96_cuda11-archive/include "$install_path" + mv cudnn-linux-x86_64-8.5.0.96_cuda11-archive/lib "$install_path" + cd ../ + rm -rf tmp_cudnn +} diff --git a/.github/scripts/test_torchrec.bash b/.github/scripts/test_torchrec.bash new 
file mode 100644
index 0000000000..1d96e85d67
--- /dev/null
+++ b/.github/scripts/test_torchrec.bash
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# Exit on failure
+set -e
+
+# shellcheck source=/dev/null
+. "$(dirname "$(realpath -s "$0")")/setup_env.bash"
+
+verbose=0
+torchrec_package_name=""
+python_version=""
+pytorch_cuda_version="x"
+fbgemm_wheel_path=""
+miniconda_prefix="${HOME}/miniconda"
+
+usage () {
+  echo "Usage: bash test_torchrec.bash -o PACKAGE_NAME -p PYTHON_VERSION -P PYTORCH_CHANNEL_NAME -c PYTORCH_CUDA_VERSION -w FBGEMM_WHEEL_PATH [-m MINICONDA_PREFIX] [-v] [-h]"
+  echo "-v : verbose"
+  echo "-h : help"
+  echo "PACKAGE_NAME : output package name of TorchRec (e.g., torchrec_nightly)"
+  echo " Note: TorchRec is sensitive to its package name"
+  echo " e.g., torchrec needs fbgemm-gpu while torchrec_nightly needs fbgemm-gpu-nightly"
+  echo "PYTHON_VERSION : Python version (e.g., 3.7, 3.8, 3.10)"
+  echo "PYTORCH_CHANNEL_NAME: PyTorch's channel name (e.g., pytorch-nightly, pytorch-test (=pre-release), pytorch (=stable release))"
+  echo "PYTORCH_CUDA_VERSION: PyTorch's CUDA version (e.g., 11.6, 11.7)"
+  echo "FBGEMM_WHEEL_PATH : path to FBGEMM_GPU's wheel file"
+  echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)"
+  echo "Example: Python 3.10 + PyTorch nightly (CUDA 11.7), install miniconda at \$HOME/miniconda, using dist/fbgemm_gpu_nightly.whl"
+  echo " bash test_torchrec.bash -v -o torchrec_nightly -p 3.10 -P pytorch-nightly -c 11.7 -w dist/fbgemm_gpu_nightly.whl"
+}
+
+while getopts vho:p:P:c:m:w: flag
+do
+  case "$flag" in
+    v) verbose="1";;
+    o) torchrec_package_name="${OPTARG}";;
+    p) python_version="${OPTARG}";;
+    P) pytorch_channel_name="${OPTARG}";;
+    c) pytorch_cuda_version="${OPTARG}";;
+    m) miniconda_prefix="${OPTARG}";;
+    w) fbgemm_wheel_path="${OPTARG}";;
+    h) usage
+       exit 0;;
+    *) usage
+       exit 1;;
+  esac
+done
+
+if [ "$torchrec_package_name" == "" ] || [ "$python_version" == "" ] || [ "$pytorch_cuda_version" == "x" ]
|| [ "$miniconda_prefix" == "" ] || [ "$pytorch_channel_name" == "" ] || [ "$fbgemm_wheel_path" == "" ]; then + usage + exit 1 +fi +python_tag="${python_version//\./}" + +if [ "$verbose" == "1" ]; then + # Print each line verbosely + set -x -e +fi + +################################################################################ +echo "## 0. Minimal check" +################################################################################ + +if [ ! -d "torchrec" ]; then + echo "Error: this script must be executed in torchrec/" + exit 1 +fi + +################################################################################ +echo "## 1. Set up Miniconda" +################################################################################ + +setup_miniconda "$miniconda_prefix" + +################################################################################ +echo "## 2. Create test_binary environment" +################################################################################ + +create_conda_environment test_binary "$python_version" "$pytorch_channel_name" "$pytorch_cuda_version" + +# Comment out FBGEMM_GPU since we will install it from "$fbgemm_wheel_path" +sed -i 's/fbgemm-gpu/#fbgemm-gpu/g' requirements.txt +conda run -n test_binary python -m pip install -r requirements.txt +# Install FBGEMM_GPU from a local wheel file. +conda run -n test_binary python -m pip install "$fbgemm_wheel_path" +conda run -n test_binary python -c "import fbgemm_gpu" + +################################################################################ +echo "## 3. Build TorchRec" +################################################################################ + +rm -rf dist +conda run -n test_binary python setup.py bdist_wheel --package_name "${torchrec_package_name}" --python-tag="py${python_tag}" + +################################################################################ +echo "## 4. 
Import TorchRec"
+################################################################################
+
+conda run -n test_binary python -m pip install dist/"${torchrec_package_name}"*.whl
+conda run -n test_binary python -c "import torchrec"
+
+echo "Test succeeded"
diff --git a/.github/scripts/test_wheel.bash b/.github/scripts/test_wheel.bash
new file mode 100644
index 0000000000..d8b984407a
--- /dev/null
+++ b/.github/scripts/test_wheel.bash
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Exit on failure
+set -e
+
+# shellcheck source=/dev/null
+. "$(dirname "$(realpath -s "$0")")/setup_env.bash"
+
+
+verbose=0
+python_version=""
+pytorch_cuda_version="x"
+fbgemm_wheel_path=""
+miniconda_prefix="${HOME}/miniconda"
+
+usage () {
+  echo "Usage: bash test_wheel.bash -p PYTHON_VERSION -P PYTORCH_CHANNEL_NAME -c PYTORCH_CUDA_VERSION -w FBGEMM_WHEEL_PATH [-m MINICONDA_PREFIX] [-v] [-h]"
+  echo "-v : verbose"
+  echo "-h : help"
+  echo "PYTHON_VERSION : Python version (e.g., 3.7, 3.8, 3.10)"
+  echo "PYTORCH_CHANNEL_NAME: PyTorch's channel name (e.g., pytorch-nightly, pytorch-test (=pre-release), pytorch (=stable release))"
+  echo "PYTORCH_CUDA_VERSION: PyTorch's CUDA version (e.g., 11.6, 11.7)"
+  echo "FBGEMM_WHEEL_PATH : path to FBGEMM_GPU's wheel file"
+  echo "MINICONDA_PREFIX : path to install Miniconda (default: \$HOME/miniconda)"
+  echo "Example 1: Python 3.10 + PyTorch nightly (CUDA 11.7), install miniconda at /home/user/tmp/miniconda, using dist/fbgemm_gpu.whl"
+  echo " bash test_wheel.bash -v -p 3.10 -P pytorch-nightly -c 11.7 -m /home/user/tmp/miniconda -w dist/fbgemm_gpu.whl"
+  echo "Example 2: Python 3.10 + PyTorch stable (CPU), install miniconda at \$HOME/miniconda, using /tmp/fbgemm_gpu_cpu.whl"
+  echo " bash test_wheel.bash -v -p 3.10 -P pytorch -c \"\" -w /tmp/fbgemm_gpu_cpu.whl"
+}
+
+while getopts vhp:P:c:m:w: flag
+do
+  case "$flag" in
+    v) verbose="1";;
+    p) python_version="${OPTARG}";;
+    P) pytorch_channel_name="${OPTARG}";;
+    c) pytorch_cuda_version="${OPTARG}";;
+ m) miniconda_prefix="${OPTARG}";; + w) fbgemm_wheel_path="${OPTARG}";; + h) usage + exit 0;; + *) usage + exit 1;; + esac +done + +if [ "$python_version" == "" ] || [ "$pytorch_cuda_version" == "x" ] || [ "$miniconda_prefix" == "" ] || [ "$pytorch_channel_name" == "" ] || [ "$fbgemm_wheel_path" == "" ]; then + usage + exit 1 +fi + +if [ "$verbose" == "1" ]; then + # Print each line verbosely + set -x -e +fi + +################################################################################ +echo "## 0. Minimal check" +################################################################################ + +if [ ! -d "fbgemm_gpu" ]; then + echo "Error: this script must be executed in FBGEMM/" + exit 1 +fi + +################################################################################ +echo "## 1. Set up Miniconda" +################################################################################ + +setup_miniconda "$miniconda_prefix" + +################################################################################ +echo "## 2. Create test_binary environment" +################################################################################ + +create_conda_environment test_binary "$python_version" "$pytorch_channel_name" "$pytorch_cuda_version" +conda install -n test_binary -y pytest + +cd fbgemm_gpu +conda run -n test_binary python -m pip install -r requirements.txt +cd ../ + +################################################################################ +echo "## 3. Install and test FBGEMM_GPU" +################################################################################ + +conda run -n test_binary python -m pip install "$fbgemm_wheel_path" +conda run -n test_binary python -c "import fbgemm_gpu" + +if [ "$pytorch_cuda_version" == "" ]; then + # CPU version: unfortunately, not all tests are properly excluded for CPUs, + # so we cherry-pick what we can run. 
+ conda run -n test_binary python fbgemm_gpu/test/batched_unary_embeddings_test.py -v + conda run -n test_binary python fbgemm_gpu/test/input_combine_test.py -v + conda run -n test_binary python fbgemm_gpu/test/layout_transform_ops_test.py -v + conda run -n test_binary python fbgemm_gpu/test/merge_pooled_embeddings_test.py -v + conda run -n test_binary python fbgemm_gpu/test/permute_pooled_embedding_modules_test.py -v + conda run -n test_binary python fbgemm_gpu/test/quantize_ops_test.py -v + conda run -n test_binary python fbgemm_gpu/test/sparse_ops_test.py -v +else + # GPU version + # Don't run it in the fbgemm_gpu directory; fbgemm_gpu has fbgemm_gpu directory, which confuses import in Python. + # FIXME: now it fails. Needs to fix failures/warnings and revisit it. + # conda run -n test_binary python -m pytest fbgemm_gpu -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors + echo "Skip tests" +fi + +echo "Test succeeded" diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml new file mode 100644 index 0000000000..4188ec0b40 --- /dev/null +++ b/.github/workflows/build_wheel.yml @@ -0,0 +1,190 @@ +name: Build Wheel + +on: + workflow_call: + inputs: + release_version: + required: true + type: string + upload_pypi: + required: true + type: boolean + +jobs: + wheel_setup: + runs-on: [ubuntu-latest] + outputs: + pytorch_channel: ${{ steps.output_variables.outputs.pytorch_channel }} + fbgemm_package_name: ${{ steps.output_variables.outputs.fbgemm_package_name }} + torchrec_package_name: ${{ steps.output_variables.outputs.torchrec_package_name }} + steps: + - id: output_variables + run: | + if [ x"${{ inputs.release_version }}" == x"nightly" ]; then + echo "pytorch_channel=pytorch-nightly" >> $GITHUB_OUTPUT + echo "fbgemm_package_name=fbgemm_gpu_nightly" >> $GITHUB_OUTPUT + echo "torchrec_package_name=torchrec_nightly" >> $GITHUB_OUTPUT + elif [ x"${{ inputs.release_version }}" == x"prerelease" ]; then + echo 
"pytorch_channel=pytorch-test" >> $GITHUB_OUTPUT + echo "fbgemm_package_name=fbgemm_gpu_test" >> $GITHUB_OUTPUT + echo "torchrec_package_name=torchrec_test" >> $GITHUB_OUTPUT + elif [ x"${{ inputs.release_version }}" == x"release" ]; then + echo "pytorch_channel=pytorch" >> $GITHUB_OUTPUT + echo "fbgemm_package_name=fbgemm_gpu" >> $GITHUB_OUTPUT + echo "torchrec_package_name=torchrec" >> $GITHUB_OUTPUT + else + echo "Error: unknown release_version ${{ inputs.release_version }}" + exit 1 + fi + + # Build on CPU hosts and upload it as the GitHub Action artifact + build_wheel: + runs-on: ${{ matrix.os }} + needs: [wheel_setup] + strategy: + matrix: + os: [linux.2xlarge] + python-version: ["3.7", "3.8", "3.9", "3.10"] + cuda-tag: ["cu11", "cpu"] + steps: + - name: Checkout FBGEMM_GPU + uses: actions/checkout@v2 + with: + submodules: "recursive" + - name: Install prerequisite packages + run: | + sudo yum install -y gcc gcc-c++ + - name: Run build_wheel.bash + run: | + if [ x"${{ matrix.cuda-tag }}" == x"cpu" ]; then + # Empty string + PYTORCH_CUDA_VERSION="" + else + PYTORCH_CUDA_VERSION="11.7" + fi + bash .github/scripts/build_wheel.bash -v -p ${{ matrix.python-version }} -o ${{ needs.wheel_setup.outputs.fbgemm_package_name }} -P ${{ needs.wheel_setup.outputs.pytorch_channel }} -c "${PYTORCH_CUDA_VERSION}" -m "${HOME}/miniconda" + rm -rf "${HOME}/miniconda" + - name: Upload wheel as GHA artifact + uses: actions/upload-artifact@v3 + with: + if-no-files-found: error + name: ${{ needs.wheel_setup.outputs.fbgemm_package_name }}_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl + path: fbgemm_gpu/dist/${{ needs.wheel_setup.outputs.fbgemm_package_name }}*.whl + + # Download the GitHub Action artifact and test the artifact on a GPU machine + test_wheel_gpu: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [linux.g5.4xlarge.nvidia.gpu] + python-version: ["3.7", "3.8", "3.9", "3.10"] + cuda-tag: ["cu11"] + needs: [wheel_setup, build_wheel] + steps: + - name: Check 
system information + shell: bash + run: | + # TODO: Do we really want this information? + ldd --version + cat /proc/cpuinfo + cat /proc/version + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + sudo yum install lshw -y + sudo lshw -C display + # Checkout the repository to the GitHub Actions runner + - name: Checkout FBGEMM_GPU + uses: actions/checkout@v2 + with: + submodules: "recursive" + - name: Install CUDA drivers + shell: bash + continue-on-error: true + run: | + sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + sudo yum clean expire-cache + sudo yum install -y nvidia-driver-latest-dkms + sudo yum install -y cuda-drivers + # download wheel from GitHub Actions + - name: Download wheel + uses: actions/download-artifact@v3 + with: + name: ${{ needs.wheel_setup.outputs.fbgemm_package_name }}_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl + - name: Run test_wheel.bash + shell: bash + run: | + PYTORCH_CUDA_VERSION="11.7" + bash .github/scripts/test_wheel.bash -v -p ${{ matrix.python-version }} -P ${{ needs.wheel_setup.outputs.pytorch_channel }} -c "${PYTORCH_CUDA_VERSION}" -w "$(ls ${{ needs.wheel_setup.outputs.fbgemm_package_name }}*.whl)" -m "${HOME}/miniconda" + rm -rf "${HOME}/miniconda" + - name: Run test_torchrec.bash + shell: bash + run: | + FBGEMM_WHEEL_PATH="$(pwd)/$(ls ${{ needs.wheel_setup.outputs.fbgemm_package_name }}*.whl)" + TEST_TORCHREC_PATH="$(pwd)/.github/scripts/test_torchrec.bash" + git clone 
https://github.com/pytorch/torchrec.git + cd torchrec + git submodule update --init --recursive + PYTORCH_CUDA_VERSION="11.7" + bash "$TEST_TORCHREC_PATH" -v -o ${{ needs.wheel_setup.outputs.torchrec_package_name }} -p ${{ matrix.python-version }} -P ${{ needs.wheel_setup.outputs.pytorch_channel }} -c "${PYTORCH_CUDA_VERSION}" -w "${FBGEMM_WHEEL_PATH}" -m "${HOME}/miniconda" + rm -rf "${HOME}/miniconda" + cd ../ + + # Download the GitHub Action artifact and test the artifact on a CPU machine + test_wheel_cpu: + runs-on: ${{ matrix.os }} + needs: [wheel_setup, build_wheel] + strategy: + matrix: + os: [ubuntu-latest] + python-version: ["3.7", "3.8", "3.9", "3.10"] + cuda-tag: ["cpu"] + steps: + # Checkout the repository to the GitHub Actions runner + - name: Checkout FBGEMM_GPU + uses: actions/checkout@v2 + with: + submodules: "recursive" + # download wheel from GitHub Actions + - name: Download wheel + uses: actions/download-artifact@v3 + with: + name: ${{ needs.wheel_setup.outputs.fbgemm_package_name }}_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl + - name: Run test_wheel.bash + shell: bash + run: | + # Empty string for CPU + PYTORCH_CUDA_VERSION="" + bash .github/scripts/test_wheel.bash -v -p ${{ matrix.python-version }} -P ${{ needs.wheel_setup.outputs.pytorch_channel }} -c "${PYTORCH_CUDA_VERSION}" -w "$(ls ${{ needs.wheel_setup.outputs.fbgemm_package_name }}*.whl)" -m "${HOME}/miniconda" + rm -rf "${HOME}/miniconda" + + # Upload the created wheel to PYPI + upload_pypi: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [linux.2xlarge] + python-version: ["3.7", "3.8", "3.9", "3.10"] + cuda-tag: ["cu11", "cpu"] + needs: [wheel_setup, test_wheel_gpu, test_wheel_cpu] + if: ${{ inputs.upload_pypi }} + steps: + - name: Download wheel + uses: actions/download-artifact@v3 + with: + name: ${{ needs.wheel_setup.outputs.fbgemm_package_name }}_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl + - name: Push FBGEMM_GPU wheel to PYPI + env: + 
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + run: | + conda run -n build_binary python -m pip install twine + # Upload it to the official PYPI website + conda run -n build_binary python -m twine upload --username __token__ --password "$PYPI_TOKEN" --skip-existing --verbose ${{ needs.wheel_setup.outputs.fbgemm_package_name }}*.whl diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml deleted file mode 100644 index eb552abafc..0000000000 --- a/.github/workflows/fbgemm_nightly_build.yml +++ /dev/null @@ -1,262 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Push Binary Nightly - -on: - # For debugging, enable push/pull_request - # [push, pull_request] - # run every day at 10:45 AM - schedule: - - cron: '45 10 * * *' - # or manually trigger it - workflow_dispatch: - -jobs: - # build on cpu hosts and upload to GHA - build_on_cpu: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: linux.2xlarge - python-version: "3.7" - python-tag: "py37" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.8" - python-tag: "py38" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.9" - python-tag: "py39" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.10" - python-tag: "py310" - cuda-tag: "cu11" - steps: - # Checkout the repository to the GitHub Actions runner - - name: Check ldd --version - run: ldd --version - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - # Update references - - name: Git Submodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git submodule update --init --recursive - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: 
Setup PATH with conda - run: | - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH - - name: Create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install C/C++ compilers - run: | - sudo yum install -y gcc gcc-c++ - - name: Install PyTorch and CUDA - shell: bash - run: | - conda install -n build_binary -y pytorch pytorch-cuda=11.7 -c pytorch-nightly -c nvidia - conda install -n build_binary -y -c conda-forge cudnn - - name: Install Other Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of Dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - # for the conda run with quotes, we have to use "\" and double quotes - # here is the issue: https://github.com/conda/conda/issues/10972 - - name: Build FBGEMM_GPU Nightly - run: | - cd fbgemm_gpu/ - rm -rf dist - # build cuda7.0;8.0 for v100/a100 arch: - # Couldn't build more cuda arch due to 100 MB binary size limit from - # pypi website. 
- # manylinux1_x86_64 is specified for pypi upload: - # distribute python extensions as wheels on Linux - conda run -n build_binary \ - python setup.py bdist_wheel \ - --package_name=fbgemm_gpu_nightly \ - --python-tag=${{ matrix.python-tag }} \ - -DTORCH_CUDA_ARCH_LIST="'7.0;8.0'" \ - --plat-name=manylinux1_x86_64 - ls -lt dist/*.whl - - name: Upload wheel as GHA artifact - uses: actions/upload-artifact@v2 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - path: fbgemm_gpu/dist/fbgemm_gpu_nightly-*.whl - - # download from GHA, test on gpu and push to pypi - test_on_gpu: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [linux.g5.4xlarge.nvidia.gpu] - python-version: ["3.7", "3.8", "3.9", "3.10"] - cuda-tag: ["cu11"] - needs: build_on_cpu - steps: - - name: Check ldd --version - run: ldd --version - - name: check cpu info - shell: bash - run: | - cat /proc/cpuinfo - - name: check distribution info - shell: bash - run: | - cat /proc/version - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: check gpu info - shell: bash - run: | - sudo yum install lshw -y - sudo lshw -C display - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - # Update references - - name: Git Sumbodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git submodule update --init --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip 
- - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "$HOME/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=$HOME/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version without Conda - run: | - python --version - - name: check python version with Conda - run: | - conda run -n build_binary python --version - - name: Install CUDA 11.3 - shell: bash - run: | - sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm - sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo - sudo yum clean expire-cache - sudo yum install -y nvidia-driver-latest-dkms - sudo yum install -y cuda-11-3 - sudo yum install -y cuda-drivers - sudo yum install -y libcudnn8-devel - - name: setup Path - run: | - echo /usr/local/cuda-11.3/bin >> $GITHUB_PATH - echo /usr/local/bin >> $GITHUB_PATH - - name: nvcc check - run: | - nvcc --version - - name: Install PyTorch using Conda - shell: bash - run: | - conda install -n build_binary -y pytorch pytorch-cuda=11.7 -c pytorch-nightly -c nvidia - conda install -n build_binary -y -c conda-forge cudnn - # download wheel from GHA - - name: Download wheel - uses: actions/download-artifact@v2 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - - name: Display structure of downloaded files - run: ls -R - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c 
"import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Install FBGEMM_GPU Nightly - run: | - rm -rf dist - conda run -n build_binary \ - python -m pip install *.whl - - name: Test fbgemm_gpu installation - shell: bash - run: | - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Test with pytest - # remove this line when we fixed all the unit tests - continue-on-error: true - run: | - conda run -n build_binary \ - python -m pip install pytest - # The tests with single CPU core on a less powerful testing GPU in GHA - # can take 5 hours. - timeout 600s conda run -n build_binary \ - python -m pytest -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors - # Push to Pypi - - name: Push FBGEMM_GPU Binary to PYPI - env: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - run: | - conda run -n build_binary python -m pip install twine - # Official PYPI website - conda run -n build_binary \ - python -m twine upload \ - --username __token__ \ - --password "$PYPI_TOKEN" \ - --skip-existing \ - --verbose \ - fbgemm_gpu_nightly-*.whl diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml deleted file mode 100644 index b42d1089a4..0000000000 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ /dev/null @@ -1,165 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Push CPU Binary Nightly - -on: - # # For debugging, enable push/pull_request - # [push, pull_request] - # run every day at 10:45 AM - schedule: - - cron: '45 10 * * *' - # or manually trigger it - workflow_dispatch: - -jobs: - # build, test, and upload to GHA on cpu hosts - build_test_upload: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: 
linux.2xlarge - python-version: "3.7" - python-tag: "py37" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.8" - python-tag: "py38" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.9" - python-tag: "py39" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.10" - python-tag: "py310" - cuda-tag: "cpu" - steps: - # Checkout the repository to the GitHub Actions runner - - name: Check ldd --version - run: ldd --version - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - # Update references - - name: Git Sumbodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git submodule update --init --recursive - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: Setup PATH with conda - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: Create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary 
python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Build FBGEMM_GPU Nightly - run: | - cd fbgemm_gpu/ - rm -r dist || true - # buld cuda7.0;8.0 for v100/a100 arch: - # Couldn't build more cuda arch due to 100 MB binary size limit from - # pypi website. - # manylinux1_x86_64 is specified for pypi upload: - # distribute python extensions as wheels on Linux - conda run -n build_binary \ - python setup.py bdist_wheel \ - --package_name=fbgemm_gpu_nightly-cpu \ - --python-tag=${{ matrix.python-tag }} \ - --cpu_only \ - --plat-name=manylinux1_x86_64 - ls -lt dist/*.whl - - name: Upload wheel as GHA artifact - uses: actions/upload-artifact@v2 - with: - name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - path: fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl - - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Install FBGEMM_GPU Nightly (CPU version) - run: | - conda run -n build_binary \ - python -m pip install fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl - - name: Test fbgemm_gpu installation - shell: bash - run: | - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Test with pytest - # remove this line when we fixed all the unit tests - continue-on-error: true - run: | - conda run -n build_binary \ - python -m pip install pytest - # The tests with single CPU core on a less powerful testing GPU in GHA - # can take 5 hours. 
- timeout 600s conda run -n build_binary \ - python -m pytest -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors - # Push to Pypi - - name: Push FBGEMM_GPU Binary to PYPI - env: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - run: | - conda run -n build_binary python -m pip install twine - # Official PYPI website - conda run -n build_binary \ - python -m twine upload \ - --username __token__ \ - --password "$PYPI_TOKEN" \ - --skip-existing \ - --verbose \ - fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml deleted file mode 100644 index 7935004cb9..0000000000 --- a/.github/workflows/fbgemm_release_build.yml +++ /dev/null @@ -1,282 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Push Binary Release - -on: - # # For debugging, enable push/pull_request - # [push, pull_request] - # # run every day at 10:45 AM - # schedule: - # - cron: '45 10 * * *' - # # or manually trigger it - # workflow_dispatch: - -jobs: - # build on cpu hosts and upload to GHA - build_on_cpu: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: linux.2xlarge - python-version: "3.7" - python-tag: "py37" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.8" - python-tag: "py38" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.9" - python-tag: "py39" - cuda-tag: "cu11" - - os: linux.2xlarge - python-version: "3.10" - python-tag: "py310" - cuda-tag: "cu11" - steps: - # Checkout the repository to the GitHub Actions runner - - name: Check ldd --version - run: ldd --version - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - # Update references - - name: Git Sumbodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git 
submodule update --init --recursive - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: Setup PATH with conda - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: Create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install CUDA 11.3 - shell: bash - run: | - sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm - sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo - sudo yum clean expire-cache - sudo yum install -y nvidia-driver-latest-dkms - sudo yum install -y cuda-11-3 - sudo yum install -y cuda-drivers - sudo yum install -y libcudnn8-devel - - name: setup Path - run: | - echo /usr/local/cuda-11.3/bin >> $GITHUB_PATH - echo /usr/local/bin >> $GITHUB_PATH - - name: nvcc check - run: | - nvcc --version - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary \ - python -m pip install --pre torch -f https://download.pytorch.org/whl/test/cu113/torch_test.html - - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - # for the conda run with quotes, we have to 
use "\" and double quotes - # here is the issue: https://github.com/conda/conda/issues/10972 - - name: Build FBGEMM_GPU Release - run: | - cd fbgemm_gpu/ - rm -r dist || true - # buld cuda7.0;8.0 for v100/a100 arch: - # Couldn't build more cuda arch due to 100 MB binary size limit from - # pypi website. - # manylinux1_x86_64 is specified for pypi upload: - # distribute python extensions as wheels on Linux - conda run -n build_binary \ - python setup.py bdist_wheel \ - --package_name=fbgemm_gpu \ - --python-tag=${{ matrix.python-tag }} \ - -DTORCH_CUDA_ARCH_LIST="'7.0;8.0'" \ - --plat-name=manylinux1_x86_64 - ls -lt dist/*.whl - - name: Upload wheel as GHA artifact - uses: actions/upload-artifact@v2 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - path: fbgemm_gpu/dist/fbgemm_gpu-*.whl - - # download from GHA, test on gpu and push to pypi - test_on_gpu: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [linux.g5.4xlarge.nvidia.gpu] - python-version: ["3.7", "3.8", "3.9", "3.10"] - cuda-tag: ["cu11"] - needs: build_on_cpu - steps: - - name: Check ldd --version - run: ldd --version - - name: check cpu info - shell: bash - run: | - cat /proc/cpuinfo - - name: check distribution info - shell: bash - run: | - cat /proc/version - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: check gpu info - shell: bash - run: | - sudo yum install lshw -y - sudo lshw -C display - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v2 - with: - 
submodules: true - # Update references - - name: Git Sumbodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git submodule update --init --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version no Conda - run: | - python --version - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install CUDA 11.3 - shell: bash - run: | - sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm - sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo - sudo yum clean expire-cache - sudo yum install -y nvidia-driver-latest-dkms - sudo yum install -y cuda-11-3 - sudo yum install -y cuda-drivers - sudo yum install -y libcudnn8-devel - - name: setup Path - run: | - echo /usr/local/cuda-11.3/bin >> $GITHUB_PATH - echo /usr/local/bin >> $GITHUB_PATH - - name: nvcc check - run: | - nvcc --version - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary \ - python -m pip install --pre torch -f https://download.pytorch.org/whl/test/cu113/torch_test.html - # download wheel from GHA - - name: Download wheel - uses: actions/download-artifact@v2 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - - name: Display structure of downloaded files - run: ls -R - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n 
build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Install FBGEMM_GPU Release - run: | - rm -r dist || true - conda run -n build_binary \ - python -m pip install *.whl - - name: Test fbgemm_gpu installation - shell: bash - run: | - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Test with pytest - # remove this line when we fixed all the unit tests - continue-on-error: true - run: | - conda run -n build_binary \ - python -m pip install pytest - # The tests with single CPU core on a less powerful testing GPU in GHA - # can take 5 hours. - timeout 600s conda run -n build_binary \ - python -m pytest -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors - # Push to Pypi - - name: Push FBGEMM_GPU Binary to PYPI - env: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - run: | - conda run -n build_binary python -m pip install twine - # Official PYPI website - conda run -n build_binary \ - python -m twine upload \ - --username __token__ \ - --password "$PYPI_TOKEN" \ - --skip-existing \ - --verbose \ - fbgemm_gpu-*.whl diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml deleted file mode 100644 index 1a72143447..0000000000 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ /dev/null @@ -1,167 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Push CPU Binary Release - -on: - # # For debugging, enable push/pull_request - # [push, pull_request] 
- # # run every day at 10:45 AM - # schedule: - # - cron: '45 10 * * *' - # # or manually trigger it - # workflow_dispatch: - -jobs: - # build, test, and upload to GHA on cpu hosts - build_test_upload: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: linux.2xlarge - python-version: "3.7" - python-tag: "py37" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.8" - python-tag: "py38" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.9" - python-tag: "py39" - cuda-tag: "cpu" - - os: linux.2xlarge - python-version: "3.10" - python-tag: "py310" - cuda-tag: "cpu" - steps: - # Checkout the repository to the GitHub Actions runner - - name: Check ldd --version - run: ldd --version - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - # Update references - - name: Git Sumbodule Update - run: | - cd fbgemm_gpu/ - git submodule sync - git submodule update --init --recursive - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: Setup PATH with conda - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: Create conda env - run: | - conda create --name build_binary python=${{ matrix.python-version }} - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary \ - python -m pip install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html - - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ 
- conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Build FBGEMM_GPU Release - run: | - cd fbgemm_gpu/ - rm -r dist || true - # buld cuda7.0;8.0 for v100/a100 arch: - # Couldn't build more cuda arch due to 100 MB binary size limit from - # pypi website. - # manylinux1_x86_64 is specified for pypi upload: - # distribute python extensions as wheels on Linux - conda run -n build_binary \ - python setup.py bdist_wheel \ - --package_name=fbgemm_gpu-cpu \ - --python-tag=${{ matrix.python-tag }} \ - --cpu_only \ - --plat-name=manylinux1_x86_64 - ls -lt dist/*.whl - - name: Upload wheel as GHA artifact - uses: actions/upload-artifact@v2 - with: - name: fbgemm_gpu_cpu_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl - path: fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl - - - name: Install Dependencies - shell: bash - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -m pip install -r requirements.txt - - name: Test Installation of dependencies - run: | - cd fbgemm_gpu/ - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - conda run -n build_binary python -c "import skbuild" - echo "skbuild succeeded" - conda run -n build_binary python -c "import numpy" - echo "numpy succeeded" - - name: Install FBGEMM_GPU Release (CPU version) - run: | - conda run -n build_binary \ - python -m pip install fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl - - name: Test fbgemm_gpu installation - shell: bash - run: | - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Test with pytest - # remove this line when we fixed all the unit tests - continue-on-error: true - run: | - 
conda run -n build_binary \ - python -m pip install pytest - # The tests with single CPU core on a less powerful testing GPU in GHA - # can take 5 hours. - timeout 600s conda run -n build_binary \ - python -m pytest -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors - # Push to Pypi - - name: Push FBGEMM_GPU Binary to PYPI - env: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - run: | - conda run -n build_binary python -m pip install twine - # Official PYPI website - conda run -n build_binary \ - python -m twine upload \ - --username __token__ \ - --password "$PYPI_TOKEN" \ - --skip-existing \ - --verbose \ - fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl diff --git a/.github/workflows/push_wheel.yml b/.github/workflows/push_wheel.yml new file mode 100644 index 0000000000..9c8631e60f --- /dev/null +++ b/.github/workflows/push_wheel.yml @@ -0,0 +1,33 @@ +name: Push Wheel + +on: + # For debugging, please use test_wheel_*.yml + # run every day at 10:45 AM + schedule: + - cron: '45 10 * * *' + # or manually trigger it + workflow_dispatch: + inputs: + release_version: + type: choice + required: true + default: nightly + options: + - nightly + - prerelease + - release + upload_pypi: + type: choice + required: true + default: true + options: + - true + - false + +jobs: + push_wheel: + uses: ./.github/workflows/build_wheel.yml + with: + # if it's triggered by "schedule", nightly + true will be chosen + release_version: ${{ inputs.release_version || 'nightly' }} + upload_pypi: ${{ (inputs.upload_pypi || 'true') == 'true' }} diff --git a/.github/workflows/test_wheel.yml b/.github/workflows/test_wheel.yml new file mode 100644 index 0000000000..66ab537bbe --- /dev/null +++ b/.github/workflows/test_wheel.yml @@ -0,0 +1,33 @@ +name: Test Wheel + +on: + pull_request: + branches: + - main + types: + - opened + - synchronize + - reopened + - labeled + +jobs: + test_wheel_nightly: + if: contains(github.event.pull_request.labels.*.name, 'test_wheel_nightly') + uses: 
./.github/workflows/build_wheel.yml + with: + release_version: "nightly" + upload_pypi: false + + test_wheel_prerelease: + if: contains(github.event.pull_request.labels.*.name, 'test_wheel_prerelease') + uses: ./.github/workflows/build_wheel.yml + with: + release_version: "prerelease" + upload_pypi: false + + test_wheel_release: + if: contains(github.event.pull_request.labels.*.name, 'test_wheel_release') + uses: ./.github/workflows/build_wheel.yml + with: + release_version: "release" + upload_pypi: false