Skip to content

Support MTIA device type in FBGEMM TBE training #2806

Support MTIA device type in FBGEMM TBE training

Support MTIA device type in FBGEMM TBE training #2806

Workflow file for this run

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
name: FBGEMM_GPU CI

# Events that start this workflow.
on:
  # PR trigger: pull requests that target `main`.
  pull_request:
    branches: [main]

  # Push trigger: commits landing on `main` (enabled to catch errors coming
  # out of multiple merges).
  push:
    branches: [main]

  # Manual trigger (for testing only).
  workflow_dispatch:

# Runs are grouped by workflow name plus the PR number; events that carry no
# pull request (push, manual dispatch) fall back to the git ref. A newer run
# in the same group cancels the one still in progress.
concurrency:
  group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'
  cancel-in-progress: true
jobs:
  # Builds FBGEMM_GPU for ROCm (explicit gfx90a target) inside a stock Ubuntu
  # container, then runs the test suite.
  # Matrix: 1 host x 1 image x 3 Python versions x 2 ROCm versions = 6 runs.
  build_and_test_amd:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: ${{ matrix.container-image }}
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      # Sourced at the start of every step below; presumably defines the
      # helper functions those steps invoke (confirm in the script itself).
      PRELUDE: .github/scripts/setup_env.bash
      # Passed as the first argument to most helpers; presumably the Conda
      # environment name (confirm against setup_env.bash).
      BUILD_ENV: build_binary
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        host-machine:
          - arch: x86
            instance: linux.12xlarge
        container-image:
          - 'ubuntu:20.04'
        # Versions stay quoted so that e.g. 3.10 is not read as the float 3.1.
        python-version:
          - '3.8'
          - '3.9'
          - '3.10'
        rocm-version:
          - '5.5.1'
          - '5.6'
    steps:
      - name: Setup Build Container
        run: |
          apt update -y
          apt install -y binutils git pciutils sudo wget
          git config --global --add safe.directory '*'
      - name: Checkout the Repository
        uses: actions/checkout@v3
        with:
          submodules: true
      - name: Display System Info
        run: . $PRELUDE; print_system_info
      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info
      - name: Free Disk Space
        run: . $PRELUDE; free_disk_space
      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda
      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
      # The plain Ubuntu image ships without ROCm, so it is installed here.
      - name: Install ROCm
        run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}
      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV
      - name: Install PyTorch-ROCm Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
      - name: Build FBGEMM_GPU-ROCM Nightly
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a
      - name: Test FBGEMM_GPU-ROCM Nightly Installation
        timeout-minutes: 10
        run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

  # Builds and tests FBGEMM_GPU for ROCm on the runner labelled `rocm`, inside
  # the `rocm/dev-ubuntu-20.04` "complete" image, with /dev/kfd and /dev/dri
  # passed through to the container.
  # Matrix: 1 host x 1 Python version x 2 ROCm versions = 2 runs.
  test_amd_gpu:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: 'rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete'
      # Folded scalar: the lines below are joined with single spaces into one
      # option string.
      options: >-
        --user root
        --device=/dev/kfd
        --device=/dev/dri
        --ipc=host
        --shm-size 16G
        --group-add video
        --cap-add=SYS_PTRACE
        --security-opt seccomp=unconfined
    defaults:
      run:
        shell: bash
    env:
      # Sourced at the start of every step below; presumably defines the
      # helper functions those steps invoke (confirm in the script itself).
      PRELUDE: .github/scripts/setup_env.bash
      # Passed as the first argument to most helpers; presumably the Conda
      # environment name (confirm against setup_env.bash).
      BUILD_ENV: build_binary
      # NOTE(review): its consumer is not visible in this file; presumably the
      # setup/test scripts use it to require a real AMD GPU. Confirm there.
      ENFORCE_AMD_GPU: 1
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        host-machine:
          - arch: x86
            instance: rocm
        # ROCm machines are limited, so we only test against Python 3.10
        python-version:
          - '3.10'
        rocm-version:
          - '5.5.1'
          - '5.6'
    steps:
      - name: Setup Build Container
        run: |
          apt update -y
          apt install -y git wget
          git config --global --add safe.directory '*'
      - name: Checkout the Repository
        uses: actions/checkout@v3
        with:
          submodules: true
      - name: Display System Info
        run: . $PRELUDE; print_system_info
      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info
      - name: Free Disk Space
        run: . $PRELUDE; free_disk_space
      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda
      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV
      - name: Install PyTorch-ROCm Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
      # Unlike build_and_test_amd, no explicit GPU target is passed here.
      - name: Build FBGEMM_GPU-ROCM Nightly
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm
      - name: Test FBGEMM_GPU-ROCM Nightly Installation
        timeout-minutes: 15
        run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

  # Builds, installs and tests the CPU-only variant of FBGEMM_GPU.
  # Matrix: 2 hosts (x86, arm) x 2 Ubuntu images x 4 Python versions = 16 runs.
  build_and_test_cpu:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: ${{ matrix.container-image }}
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      # Sourced at the start of every step below; presumably defines the
      # helper functions those steps invoke (confirm in the script itself).
      PRELUDE: .github/scripts/setup_env.bash
      # Passed as the first argument to most helpers; presumably the Conda
      # environment name (confirm against setup_env.bash).
      BUILD_ENV: build_binary
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        host-machine:
          - arch: x86
            instance: linux.4xlarge
          - arch: arm
            instance: linux.arm64.2xlarge
        container-image:
          - 'ubuntu:20.04'
          - 'ubuntu:22.04'
        # Versions stay quoted so that e.g. 3.10 is not read as the float 3.1.
        python-version:
          - '3.8'
          - '3.9'
          - '3.10'
          - '3.11'
    steps:
      - name: Setup Build Container
        run: |
          apt update -y
          apt install -y binutils build-essential git pciutils sudo wget
          git config --global --add safe.directory '*'
      - name: Checkout the Repository
        uses: actions/checkout@v3
        with:
          submodules: true
      - name: Display System Info
        run: . $PRELUDE; print_system_info
      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info
      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda
      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV
      - name: Install PyTorch
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
      - name: Build + Install FBGEMM_GPU (CPU version)
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu
      - name: Test FBGEMM_GPU-CPU Nightly Installation
        timeout-minutes: 15
        run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu