[fbgemm_gpu] Add ROCm debugging

- Add compile flags to enable device-side assertions
pytorch · Dec 14, 2023 · 43d5092 · 43d5092
1 parent ee89f56
commit 43d5092
Show file tree

Hide file tree

Showing 7 changed files with 30 additions and 14 deletions.
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
       echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
 
       if [ "$arch_list" == "" ]; then
-        # By default, build for MI250 only to save time
-        local arch_list=gfx90a
+        echo "[BUILD] rocminfo did not return anything valid!"
+
+        # By default, we build just for MI100 and MI250 to save time.  This list
+        # needs to be updated if the CI ROCm machines have different hardware.
+        # Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
+        local arch_list="gfx908,gfx90a"
       fi
     else
       echo "[BUILD] rocminfo not found in PATH!"
@@ -92,9 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
   echo "[BUILD] Setting ROCm build args ..."
   build_args=(
     --package_variant=rocm
-    -DTORCH_USE_HIP_DSA=1
     # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
     -DHIP_ROOT_DIR=/opt/rocm
+    # Enable device-side assertions in HIP
+    # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
+    -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
+    -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
   )
 }
 
@@ -142,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
   build_args=(
     --package_variant=cuda
     --nvml_lib_path="${nvml_lib_path}"
+    # Pass to PyTorch CMake
     -DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
   )
 }

diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
@@ -50,11 +50,11 @@ run_python_test () {
 run_fbgemm_gpu_tests () {
   local env_name="$1"
   local fbgemm_variant="$2"
-  if [ "$env_name" == "" ]; then
+  if [ "$fbgemm_variant" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
     echo "Example(s):"
-    echo "    ${FUNCNAME[0]} build_env        # Run all tests applicable to CUDA"
     echo "    ${FUNCNAME[0]} build_env cpu    # Run all tests applicable to CPU"
+    echo "    ${FUNCNAME[0]} build_env cuda   # Run all tests applicable to CUDA"
     echo "    ${FUNCNAME[0]} build_env rocm   # Run all tests applicable to ROCm"
     return 1
   else
@@ -212,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
   cd -
   install_fbgemm_gpu_wheel    "${env_name}" fbgemm_gpu/dist/*.whl             || return 1
 
-  cd fbgemm_gpu/test                        || return 1
-  run_fbgemm_gpu_tests        "${env_name}" || return 1
+  cd fbgemm_gpu/test                                                          || return 1
+  run_fbgemm_gpu_tests        "${env_name}" "${pytorch_variant_type}"         || return 1
   # shellcheck disable=SC2164
   cd -
 }
diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
@@ -77,6 +77,14 @@ install_cuda () {
   # Print nvcc version
   # shellcheck disable=SC2086
   print_exec conda run ${env_prefix} nvcc --version
+
+  if which nvidia-smi; then
+    # If nvidia-smi is installed on a machine without GPUs, this will return an error
+    (print_exec nvidia-smi) || true
+  else
+    echo "[CHECK] nvidia-smi not found"
+  fi
+
   echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
 }
 

diff --git a/.github/workflows/fbgemm_gpu_ci_cuda.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -192,7 +192,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
     - name: Push Wheel to PyPI
       if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}

diff --git a/.github/workflows/fbgemm_gpu_ci_rocm.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -141,8 +141,8 @@ jobs:
         host-machine: [
           { arch: x86, instance: "rocm" },
         ]
-        # ROCm machines are limited, so we only test against Python 3.11
-        python-version: [ "3.11" ]
+        # ROCm machines are limited, so we only test a subset of Python versions
+        python-version: [ "3.11", "3.12" ]
         rocm-version: [ "5.7" ]
     needs: build_artifact
 
@@ -194,4 +194,4 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml
@@ -159,7 +159,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
 
   test_pypi_install_rocm:
@@ -179,7 +179,7 @@ jobs:
       fail-fast: false
       matrix:
         host-machine: [
-          { instance: "rocm" },
+          { arch: x86, instance: "rocm" },
         ]
         # ROCm machines are limited, so we only test a subset of Python versions
         python-version: [ "3.11", "3.12" ]

diff --git a/.github/workflows/fbgemm_gpu_release_cuda.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -188,7 +188,7 @@ jobs:
 
     - name: Test with PyTest
       timeout-minutes: 15
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
 
     - name: Push FBGEMM_GPU Binary to PYPI
       if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}