From 05f29f85d7e95badd4a3ff4287e286ba699ec7a1 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 6 Mar 2023 17:25:06 -0800 Subject: [PATCH] [T145005253] Check for presence of NVIDIA drivers in OSS builds - Check for the actual presence of NVIDIA drivers in OSS builds, and error out with friendly message instead of cryptic `RuntimeError: No such operator fbgemm::jagged_2d_to_dense` errors when `fbgemm_gpu` is installed and loaded on a system with a GPU but without GPU drivers installed --- .github/scripts/setup_env.bash | 39 ++++++++++++------- .github/workflows/fbgemm_gpu_ci.yml | 6 +++ .github/workflows/fbgemm_nightly_build.yml | 10 +++-- .../workflows/fbgemm_nightly_build_cpu.yml | 9 +++-- .github/workflows/fbgemm_release_build.yml | 10 +++-- .../workflows/fbgemm_release_build_cpu.yml | 9 +++-- fbgemm_gpu/setup.py | 9 ++--- 7 files changed, 62 insertions(+), 30 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 0d0aa0534b..2a0e072293 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -14,6 +14,7 @@ print_exec () { echo "+ $*" echo "" "$@" + echo "" } exec_with_retries () { @@ -238,6 +239,30 @@ free_disk_space () { # Info Functions ################################################################################ +print_gpu_info () { + echo "################################################################################" + echo "[INFO] Check GPU info ..." + install_system_packages lshw + print_exec sudo lshw -C display + + echo "################################################################################" + echo "[INFO] Check NVIDIA GPU info ..." + + if [[ "${ENFORCE_NVIDIA_GPU}" ]]; then + # Ensure that nvidia-smi is available and returns GPU entries + if ! nvidia-smi; then + echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" + return 1 + fi + + else + if which nvidia-smi; then + # If nvidia-smi is installed on a machine without GPUs, this will return error + (print_exec nvidia-smi) || true + fi + fi +} + print_system_info () { echo "################################################################################" echo "# Print System Info" @@ -264,18 +289,6 @@ print_system_info () { print_exec uname -a print_exec cat /proc/version print_exec cat /etc/os-release - - echo "################################################################################" - echo "[INFO] Check GPU info ..." - install_system_packages lshw - print_exec sudo lshw -C display - - if which nvidia-smi; then - echo "################################################################################" - echo "[INFO] Check NVIDIA GPU info ..." - # If nvidia-smi is installed on a machine without GPUs, this will return error - (print_exec nvidia-smi) || true - fi } print_ec2_info () { @@ -336,7 +349,7 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." - print_exec conda update -n base -c defaults -y conda + (exec_with_retries conda update -n base -c defaults -y conda) || return 1 # Print Conda info print_exec conda info diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 793273f036..8e021c4451 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -38,6 +38,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Free Disk Space run: . $PRELUDE; free_disk_space @@ -150,6 +153,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Setup Miniconda run: | . $PRELUDE; setup_miniconda $HOME/miniconda diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 84e3f8023f..45458508e3 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -57,6 +57,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Setup Miniconda run: | . $PRELUDE; setup_miniconda $HOME/miniconda @@ -103,6 +106,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + ENFORCE_NVIDIA_GPU: 1 strategy: fail-fast: false matrix: @@ -118,10 +122,10 @@ jobs: submodules: true - name: Display System Info - run: . $PRELUDE; print_system_info + run: . $PRELUDE; print_system_info; print_ec2_info - - name: Display EC2 Info - run: . $PRELUDE; print_ec2_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info - name: Setup Miniconda run: | diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 46ee25a869..268877692f 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -56,6 +56,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Setup Miniconda run: | . $PRELUDE; setup_miniconda $HOME/miniconda @@ -110,10 +113,10 @@ jobs: submodules: true - name: Display System Info - run: . $PRELUDE; print_system_info + run: . $PRELUDE; print_system_info; print_ec2_info - - name: Display EC2 Info - run: . $PRELUDE; print_ec2_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info - name: Setup Miniconda run: | diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 9b8fc3bec6..f174928efb 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -49,6 +49,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Setup Miniconda run: | . $PRELUDE; setup_miniconda $HOME/miniconda @@ -95,6 +98,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + ENFORCE_NVIDIA_GPU: 1 strategy: fail-fast: false matrix: @@ -109,10 +113,10 @@ jobs: submodules: true - name: Display System Info - run: . $PRELUDE; print_system_info + run: . $PRELUDE; print_system_info; print_ec2_info - - name: Display EC2 Info - run: . $PRELUDE; print_ec2_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info - name: Setup Miniconda run: | diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index 611e8cce1e..3200a702aa 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -48,6 +48,9 @@ jobs: - name: Display System Info run: . $PRELUDE; print_system_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + - name: Setup Miniconda run: | . $PRELUDE; setup_miniconda $HOME/miniconda @@ -102,10 +105,10 @@ jobs: submodules: true - name: Display System Info - run: . $PRELUDE; print_system_info + run: . $PRELUDE; print_system_info; print_ec2_info - - name: Display EC2 Info - run: . $PRELUDE; print_ec2_info + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info - name: Setup Miniconda run: | diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 41ba7758b6..daefdb9322 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,7 +7,6 @@ import argparse import os import random -import re import subprocess import sys import setuptools_git_versioning as gitversion @@ -18,22 +17,22 @@ def generate_package_version(package_name: str): - print(f"[SETUP.PY] Generating the package version ...") + print("[SETUP.PY] Generating the package version ...") if 'nightly' in package_name: # Use date stamp for nightly versions - print(f"[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning") + print("[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning") today = date.today() version = f"{today.year}.{today.month}.{today.day}" elif 'test' in package_name: # Use date stamp for nightly versions - print(f"[SETUP.PY] Package is for TEST: using random number for the versioning") + print("[SETUP.PY] Package is for TEST: using random number for the versioning") version = (f"0.0.{random.randint(0, 1000)}",) else: # Use git tag / branch / commit info to generate a PEP-440-compliant version string - print(f"[SETUP.PY] Package is for RELEASE: using git info for the versioning") + print("[SETUP.PY] Package is for RELEASE: using git info for the versioning") print(f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}") version = gitversion.version_from_git()