Skip to content

Commit

Permalink
[T145005253] Check for presence of NVIDIA drivers in OSS builds
Browse files Browse the repository at this point in the history
- Check for the actual presence of NVIDIA drivers in OSS builds,
and error out with a friendly message instead of cryptic
`RuntimeError: No such operator fbgemm::jagged_2d_to_dense`
errors when `fbgemm_gpu` is installed and loaded on a system with
a GPU but without GPU drivers installed

- Allow for testing of multiple CUDA versions of the build, but
restrict artifact publishing to just one CUDA version
  • Loading branch information
q10 committed Mar 7, 2023
1 parent f4e0b70 commit e0d0be8
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 37 deletions.
39 changes: 26 additions & 13 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ print_exec () {
echo "+ $*"
echo ""
"$@"
echo ""
}

exec_with_retries () {
Expand Down Expand Up @@ -238,6 +239,30 @@ free_disk_space () {
# Info Functions
################################################################################

# Print display-hardware information and check for the NVIDIA driver.
#
# Environment:
#   ENFORCE_NVIDIA_GPU  When set to a non-empty value, a working `nvidia-smi`
#                       is mandatory: if it fails, a [CHECK] message is printed
#                       and the function returns 1.  When unset or empty,
#                       `nvidia-smi` output is printed on a best-effort basis
#                       and its failure is ignored.
#
# Returns:
#   1 if enforcement is enabled and `nvidia-smi` fails.
#
# NOTE(review): `install_system_packages` is defined elsewhere in this script;
# its exit status is not checked here, so a failed lshw install only surfaces
# through the subsequent `lshw` invocation.
print_gpu_info () {
  echo "################################################################################"
  echo "[INFO] Check GPU info ..."
  install_system_packages lshw
  print_exec sudo lshw -C display

  echo "################################################################################"
  echo "[INFO] Check NVIDIA GPU info ..."

  # The `:-` default keeps this test from aborting under `set -u` when the
  # variable has not been exported by the calling workflow.
  if [[ -n "${ENFORCE_NVIDIA_GPU:-}" ]]; then
    # Ensure that nvidia-smi is available and returns GPU entries
    if ! nvidia-smi; then
      echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!"
      return 1
    fi

  else
    # `command -v` is a shell builtin, so the lookup does not depend on the
    # external `which` binary being installed on the runner image.
    if command -v nvidia-smi; then
      # If nvidia-smi is installed on a machine without GPUs, this will return error
      (print_exec nvidia-smi) || true
    fi
  fi
}

print_system_info () {
echo "################################################################################"
echo "# Print System Info"
Expand All @@ -264,18 +289,6 @@ print_system_info () {
print_exec uname -a
print_exec cat /proc/version
print_exec cat /etc/os-release

echo "################################################################################"
echo "[INFO] Check GPU info ..."
install_system_packages lshw
print_exec sudo lshw -C display

if which nvidia-smi; then
echo "################################################################################"
echo "[INFO] Check NVIDIA GPU info ..."
# If nvidia-smi is installed on a machine without GPUs, this will return error
(print_exec nvidia-smi) || true
fi
}

print_ec2_info () {
Expand Down Expand Up @@ -336,7 +349,7 @@ setup_miniconda () {
print_exec . ~/.bashrc

echo "[SETUP] Updating Miniconda base packages ..."
print_exec conda update -n base -c defaults -y conda
(exec_with_retries conda update -n base -c defaults -y conda) || return 1

# Print Conda info
print_exec conda info
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Free Disk Space
run: . $PRELUDE; free_disk_space

Expand Down Expand Up @@ -150,6 +153,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down
19 changes: 15 additions & 4 deletions .github/workflows/fbgemm_nightly_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -103,12 +106,15 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_NVIDIA_GPU: 1
strategy:
fail-fast: false
matrix:
os: [ linux.g5.4xlarge.nvidia.gpu ]
python-version: [ "3.8", "3.9", "3.10" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.7.1" ]
needs: build_artifact

steps:
Expand All @@ -118,10 +124,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down Expand Up @@ -157,7 +163,12 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV

- name: Push FBGEMM_GPU Nightly Binary to PYPI
if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}
# NOTE: The expression is deliberately written WITHOUT the ${{ }} delimiters.
# Inside a `|` block scalar, the newline that follows `}}` becomes part of the
# value, so GitHub Actions evaluates the result as a non-empty string, which is
# always truthy, and the step would run unconditionally.  A bare expression is
# evaluated as a whole, so the condition is honored.
if: |
  github.event_name != 'pull_request' &&
  github.event_name != 'push' &&
  matrix.cuda-version == matrix.cuda-version-publish
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly-*.whl "$PYPI_TOKEN"
9 changes: 6 additions & 3 deletions .github/workflows/fbgemm_nightly_build_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -110,10 +113,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
23 changes: 17 additions & 6 deletions .github/workflows/fbgemm_release_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
matrix:
os: [ linux.12xlarge ]
python-version: [ "3.8", "3.9", "3.10" ]
cuda-version: [ "11.7.1" ]
cuda-version: [ "11.7.1", "11.8.0" ]

steps:
- name: Checkout the Repository
Expand All @@ -49,6 +49,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -95,12 +98,15 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_NVIDIA_GPU: 1
strategy:
fail-fast: false
matrix:
os: [ linux.g5.4xlarge.nvidia.gpu ]
python-version: [ "3.8", "3.9", "3.10" ]
cuda-version: [ "11.7.1" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.7.1" ]
needs: build_artifact
steps:
- name: Checkout the Repository
Expand All @@ -109,10 +115,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down Expand Up @@ -148,7 +154,12 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}
# NOTE: The expression is deliberately written WITHOUT the ${{ }} delimiters.
# Inside a `|` block scalar, the newline that follows `}}` becomes part of the
# value, so GitHub Actions evaluates the result as a non-empty string, which is
# always truthy, and the step would run unconditionally.  A bare expression is
# evaluated as a whole, so the condition is honored.
if: |
  github.event_name != 'pull_request' &&
  github.event_name != 'push' &&
  matrix.cuda-version == matrix.cuda-version-publish
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN"
9 changes: 6 additions & 3 deletions .github/workflows/fbgemm_release_build_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -102,10 +105,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
18 changes: 10 additions & 8 deletions fbgemm_gpu/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,36 @@
import argparse
import os
import random
import re
import subprocess
import sys
import setuptools_git_versioning as gitversion
import torch

from datetime import date
from skbuild import setup
from typing import List, Optional


def generate_package_version(package_name: str):
print(f"[SETUP.PY] Generating the package version ...")
print("[SETUP.PY] Generating the package version ...")

if 'nightly' in package_name:
if "nightly" in package_name:
# Use date stamp for nightly versions
print(f"[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning")
print("[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning")
today = date.today()
version = f"{today.year}.{today.month}.{today.day}"

elif 'test' in package_name:
elif "test" in package_name:
# Use a random number for test versions
print(f"[SETUP.PY] Package is for TEST: using random number for the versioning")
print("[SETUP.PY] Package is for TEST: using random number for the versioning")
version = (f"0.0.{random.randint(0, 1000)}",)

else:
# Use git tag / branch / commit info to generate a PEP-440-compliant version string
print(f"[SETUP.PY] Package is for RELEASE: using git info for the versioning")
print(f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}")
print("[SETUP.PY] Package is for RELEASE: using git info for the versioning")
print(
f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}"
)
version = gitversion.version_from_git()

print(f"[SETUP.PY] Setting the package version: {version}")
Expand Down

0 comments on commit e0d0be8

Please sign in to comment.