Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fbgemm_gpu] Add ROCm debugging #2211

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"

if [ "$arch_list" == "" ]; then
# By default, build for MI250 only to save time
local arch_list=gfx90a
echo "[BUILD] rocminfo did not return anything valid!"

# By default, we build just for MI100 and MI250 to save time. This list
# needs to be updated if the CI ROCm machines have different hardware.
# Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
local arch_list="gfx908,gfx90a"
fi
else
echo "[BUILD] rocminfo not found in PATH!"
Expand All @@ -92,9 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Setting ROCm build args ..."
build_args=(
--package_variant=rocm
-DTORCH_USE_HIP_DSA=1
# HIP_ROOT_DIR is now required for HIP to be correctly detected by CMake
-DHIP_ROOT_DIR=/opt/rocm
# Enable device-side assertions in HIP
# https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
)
}

Expand Down Expand Up @@ -142,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
}
Expand Down
8 changes: 4 additions & 4 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ run_python_test () {
run_fbgemm_gpu_tests () {
local env_name="$1"
local fbgemm_variant="$2"
if [ "$env_name" == "" ]; then
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU"
echo " ${FUNCNAME[0]} build_env cuda # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm"
return 1
else
Expand Down Expand Up @@ -212,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
cd -
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" || return 1
cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
# shellcheck disable=SC2164
cd -
}
8 changes: 8 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ install_cuda () {
# Print nvcc version
# shellcheck disable=SC2086
print_exec conda run ${env_prefix} nvcc --version

if which nvidia-smi; then
# If nvidia-smi is installed on a machine without GPUs, this will return an error
(print_exec nvidia-smi) || true
else
echo "[CHECK] nvidia-smi not found"
fi

echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ jobs:
host-machine: [
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test against Python 3.11
python-version: [ "3.11" ]
# ROCm machines are limited, so we only test a subset of Python versions
python-version: [ "3.11", "3.12" ]
rocm-version: [ "5.7" ]
needs: build_artifact

Expand Down Expand Up @@ -194,4 +194,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda


test_pypi_install_rocm:
Expand All @@ -179,7 +179,7 @@ jobs:
fail-fast: false
matrix:
host-machine: [
{ instance: "rocm" },
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test a subset of Python versions
python-version: [ "3.11", "3.12" ]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
Expand Down
Loading