# .github/workflows/fbgemm_gpu_cuda_release.yml

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

name: FBGEMM_GPU-CUDA Release Build

on:
  # Manual trigger: run the workflow on demand.
  workflow_dispatch:

  # Push trigger: runs on every push to main, to catch errors that only
  # appear once several merges have landed together.
  push:
    branches: [main]

  # Pull-request trigger: enabled only for debugging.
  pull_request:
    branches: [main]

concurrency:
  # Cancel a superseded run when a new commit arrives for the same PR (or the
  # same branch, for push events).
  #
  # The event name is part of the group key so that runs triggered by
  # different events on the same ref land in different groups; in particular,
  # a push to a branch can no longer cancel a manually dispatched run on that
  # branch.  Manually dispatched runs are the only ones that reach the PyPI
  # publish step, so they are never cancelled in flight: a later dispatch on
  # the same ref waits for the running one to finish instead.
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: ${{ github.event_name != 'workflow_dispatch' }}

jobs:
  # Build on CPU hosts and upload to GHA
  #
  # One job is spawned per (host-machine, python-version, cuda-version)
  # combination; each uploads its wheel as a separately named artifact.
  build_artifact:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      # Script sourced at the start of every `run` step; it provides the
      # helper functions invoked below
      PRELUDE: .github/scripts/setup_env.bash
      # Name of the Conda environment passed to every helper
      BUILD_ENV: build_binary
    # A failing build job does not mark the whole workflow run as failed
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.24xlarge" },
        ]
        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
        cuda-version: [ "11.8.0", "12.1.1" ]

    steps:
    # The bare container image lacks git, tar, etc., which the later steps
    # (starting with the checkout) rely on
    - name: Setup Build Container
      run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

    - name: Checkout the Repository
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Display System Info
      run: . $PRELUDE; print_system_info

    - name: Display GPU Info
      run: . $PRELUDE; print_gpu_info

    - name: Setup Miniconda
      run: . $PRELUDE; setup_miniconda $HOME/miniconda

    - name: Create Conda Environment
      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

    - name: Install C/C++ Compilers
      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV

    - name: Install Build Tools
      run: . $PRELUDE; install_build_tools $BUILD_ENV

    - name: Install CUDA
      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

    - name: Install PyTorch Test
      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cuda ${{ matrix.cuda-version }}

    - name: Install cuDNN
      run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

    - name: Prepare FBGEMM_GPU Build
      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

    - name: Build FBGEMM_GPU
      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV release cuda

    # NOTE: upload-artifact and download-artifact (in the job below) must use
    # the same major version; artifacts written by one major version cannot be
    # fetched by another.  v3 of both actions has been retired on github.com.
    - name: Upload Built Wheel as GHA Artifact
      uses: actions/upload-artifact@v4
      with:
        # The name encodes the full matrix combination, so it is unique
        # within a workflow run
        name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
        path: fbgemm_gpu/dist/fbgemm_gpu-*.whl
        # Fail this step if the build produced no wheel, instead of only
        # warning and letting the download in the next job fail
        if-no-files-found: error


  # Download the built artifact from GHA, test on GPU, and push to PyPI
  test_and_publish_artifact:
    runs-on: ${{ matrix.host-machine.instance }}
    defaults:
      run:
        shell: bash
    env:
      # Script sourced at the start of every `run` step; it provides the
      # helper functions invoked below
      PRELUDE: .github/scripts/setup_env.bash
      # Name of the Conda environment passed to every helper
      BUILD_ENV: build_binary
      ENFORCE_NVIDIA_GPU: 1
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
        ]
        # python-version and cuda-version mirror the build_artifact matrix so
        # that every job here has a matching artifact to download
        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
        cuda-version: [ "11.8.0", "12.1.1" ]
        # Specify exactly ONE CUDA version for artifact publish
        cuda-version-publish: [ "11.8.0" ]
    needs: build_artifact

    steps:
    - name: Checkout the Repository
      uses: actions/checkout@v4
      with:
        submodules: true

    # No `path` is given, so the wheel lands in the workspace root, which is
    # where the `*.whl` globs in the install and publish steps look for it.
    # The major version must match the upload-artifact step in build_artifact.
    - name: Download Wheel Artifact from GHA
      uses: actions/download-artifact@v4
      with:
        name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

    - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main

    - name: Display System Info
      run: . $PRELUDE; print_system_info; print_ec2_info

    - name: Display GPU Info
      run: . $PRELUDE; print_gpu_info

    - name: Setup Miniconda
      run: . $PRELUDE; setup_miniconda $HOME/miniconda

    - name: Create Conda Environment
      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

    - name: Install CUDA
      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

    - name: Install PyTorch Test
      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cuda ${{ matrix.cuda-version }}

    - name: Prepare FBGEMM_GPU Build
      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

    - name: Install FBGEMM_GPU
      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl

    - name: Test with PyTest
      timeout-minutes: 10
      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV

    # Publish only when the run was triggered by neither a pull request nor a
    # push, and only from the job whose CUDA version is the one designated by
    # cuda-version-publish
    - name: Push FBGEMM_GPU Binary to PYPI
      if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' && matrix.cuda-version == matrix.cuda-version-publish }}
      env:
        PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
      run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN"