Skip to content

Commit

Permalink
[fbgemm_gpu] Add ROCm debugging
Browse files Browse the repository at this point in the history
- Add compile flags to enable device-side assertions
  • Loading branch information
q10 committed Dec 14, 2023
1 parent ee89f56 commit 43d5092
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 14 deletions.
14 changes: 11 additions & 3 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"

if [ "$arch_list" == "" ]; then
# By default, build for MI250 only to save time
local arch_list=gfx90a
echo "[BUILD] rocminfo did not return anything valid!"

# By default, we build just for MI100 and MI250 to save time. This list
# needs to be updated if the CI ROCm machines have different hardware.
# Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
local arch_list="gfx908,gfx90a"
fi
else
echo "[BUILD] rocminfo not found in PATH!"
Expand All @@ -92,9 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Setting ROCm build args ..."
build_args=(
--package_variant=rocm
-DTORCH_USE_HIP_DSA=1
# HIP_ROOT_DIR is now required for HIP to be correctly detected by CMake
-DHIP_ROOT_DIR=/opt/rocm
# Enable device-side assertions in HIP
# https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
)
}

Expand Down Expand Up @@ -142,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
}
Expand Down
8 changes: 4 additions & 4 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ run_python_test () {
run_fbgemm_gpu_tests () {
local env_name="$1"
local fbgemm_variant="$2"
if [ "$env_name" == "" ]; then
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU"
echo " ${FUNCNAME[0]} build_env cuda # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm"
return 1
else
Expand Down Expand Up @@ -212,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
cd -
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" || return 1
cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
# shellcheck disable=SC2164
cd -
}
8 changes: 8 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ install_cuda () {
# Print nvcc version
# shellcheck disable=SC2086
print_exec conda run ${env_prefix} nvcc --version

if which nvidia-smi; then
# If nvidia-smi is installed on a machine without GPUs, this will return an error
(print_exec nvidia-smi) || true
else
echo "[CHECK] nvidia-smi not found"
fi

echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ jobs:
host-machine: [
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test against Python 3.11
python-version: [ "3.11" ]
# ROCm machines are limited, so we only test a subset of Python versions
python-version: [ "3.11", "3.12" ]
rocm-version: [ "5.7" ]
needs: build_artifact

Expand Down Expand Up @@ -194,4 +194,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda


test_pypi_install_rocm:
Expand All @@ -179,7 +179,7 @@ jobs:
fail-fast: false
matrix:
host-machine: [
{ instance: "rocm" },
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test a subset of Python versions
python-version: [ "3.11", "3.12" ]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
Expand Down

0 comments on commit 43d5092

Please sign in to comment.