Skip to content

Commit

Permalink
[T145005253] Check for presence of NVIDIA drivers in OSS builds
Browse files Browse the repository at this point in the history
- Check for the actual presence of NVIDIA drivers in OSS builds,
and error out with a friendly message instead of the cryptic
`RuntimeError: No such operator fbgemm::jagged_2d_to_dense`
error that occurs when `fbgemm_gpu` is installed and loaded on a system
that has a GPU but no GPU drivers installed.
  • Loading branch information
q10 committed Mar 7, 2023
1 parent f4e0b70 commit 05f29f8
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 30 deletions.
39 changes: 26 additions & 13 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ print_exec () {
echo "+ $*"
echo ""
"$@"
echo ""
}

exec_with_retries () {
Expand Down Expand Up @@ -238,6 +239,30 @@ free_disk_space () {
# Info Functions
################################################################################

# Print display-adapter and NVIDIA GPU information for the current machine,
# optionally enforcing that a working NVIDIA driver is present.
#
# Environment:
#   ENFORCE_NVIDIA_GPU  When set to an enabling value (e.g. 1), `nvidia-smi`
#                       must run successfully or the function returns 1.
#                       Unset, empty, or an explicit "off" value
#                       (0 / false / no / off) disables enforcement, in which
#                       case `nvidia-smi` output is printed on a best-effort
#                       basis only.
#
# Returns:
#   1 if enforcement is enabled and `nvidia-smi` fails; otherwise the status
#   of the last command executed (0 on the best-effort path).
print_gpu_info () {
  echo "################################################################################"
  echo "[INFO] Check GPU info ..."
  # lshw is requested through the script's package-install helper before use
  install_system_packages lshw
  print_exec sudo lshw -C display

  echo "################################################################################"
  echo "[INFO] Check NVIDIA GPU info ..."

  # Interpret the flag explicitly: a plain non-empty test would treat values
  # such as "0" or "false" as enabled, and a bare expansion would abort under
  # `set -u` when the variable is unset.
  local enforce_nvidia_gpu=1
  case "${ENFORCE_NVIDIA_GPU:-}" in
    ""|0|false|False|FALSE|no|No|NO|off|Off|OFF)
      enforce_nvidia_gpu=0
      ;;
  esac

  if [[ "${enforce_nvidia_gpu}" -eq 1 ]]; then
    # Ensure that nvidia-smi is available and returns GPU entries
    if ! nvidia-smi; then
      echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!"
      return 1
    fi

  elif command -v nvidia-smi; then
    # `command -v` is the shell builtin replacement for the external `which`.
    # If nvidia-smi is installed on a machine without GPUs, it exits with an
    # error; run it in a subshell and ignore the failure so that the
    # informational path never fails the caller.
    (print_exec nvidia-smi) || true
  fi
}

print_system_info () {
echo "################################################################################"
echo "# Print System Info"
Expand All @@ -264,18 +289,6 @@ print_system_info () {
print_exec uname -a
print_exec cat /proc/version
print_exec cat /etc/os-release

echo "################################################################################"
echo "[INFO] Check GPU info ..."
install_system_packages lshw
print_exec sudo lshw -C display

if which nvidia-smi; then
echo "################################################################################"
echo "[INFO] Check NVIDIA GPU info ..."
# If nvidia-smi is installed on a machine without GPUs, this will return error
(print_exec nvidia-smi) || true
fi
}

print_ec2_info () {
Expand Down Expand Up @@ -336,7 +349,7 @@ setup_miniconda () {
print_exec . ~/.bashrc

echo "[SETUP] Updating Miniconda base packages ..."
print_exec conda update -n base -c defaults -y conda
(exec_with_retries conda update -n base -c defaults -y conda) || return 1

# Print Conda info
print_exec conda info
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Free Disk Space
run: . $PRELUDE; free_disk_space

Expand Down Expand Up @@ -150,6 +153,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/fbgemm_nightly_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -103,6 +106,7 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_NVIDIA_GPU: 1
strategy:
fail-fast: false
matrix:
Expand All @@ -118,10 +122,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/fbgemm_nightly_build_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -110,10 +113,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/fbgemm_release_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -95,6 +98,7 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_NVIDIA_GPU: 1
strategy:
fail-fast: false
matrix:
Expand All @@ -109,10 +113,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/fbgemm_release_build_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ jobs:
- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
. $PRELUDE; setup_miniconda $HOME/miniconda
Expand Down Expand Up @@ -102,10 +105,10 @@ jobs:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display EC2 Info
run: . $PRELUDE; print_ec2_info
- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: |
Expand Down
9 changes: 4 additions & 5 deletions fbgemm_gpu/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import argparse
import os
import random
import re
import subprocess
import sys
import setuptools_git_versioning as gitversion
Expand All @@ -18,22 +17,22 @@


def generate_package_version(package_name: str):
print(f"[SETUP.PY] Generating the package version ...")
print("[SETUP.PY] Generating the package version ...")

if 'nightly' in package_name:
# Use date stamp for nightly versions
print(f"[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning")
print("[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning")
today = date.today()
version = f"{today.year}.{today.month}.{today.day}"

elif 'test' in package_name:
# Use a random number for test versions
print(f"[SETUP.PY] Package is for TEST: using random number for the versioning")
print("[SETUP.PY] Package is for TEST: using random number for the versioning")
version = (f"0.0.{random.randint(0, 1000)}",)

else:
# Use git tag / branch / commit info to generate a PEP-440-compliant version string
print(f"[SETUP.PY] Package is for RELEASE: using git info for the versioning")
print("[SETUP.PY] Package is for RELEASE: using git info for the versioning")
print(f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}")
version = gitversion.version_from_git()

Expand Down

0 comments on commit 05f29f8

Please sign in to comment.