[fbgemm_gpu] More flexible test retries
- Allow re-running only the failed test cases, to avoid OOM errors with
  large test suites

- Re-enable the `test_backward_dense` test
q10 committed Feb 13, 2024
1 parent eb3c304 commit 00018b1
Showing 7 changed files with 124 additions and 66 deletions.
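
The heart of the change is a two-phase pytest invocation: a full run that starts from a clean cache, followed by a retry restricted to the failures that run recorded. A minimal standalone sketch of the pattern (the suite name my_test.py is a placeholder; the actual script below wraps these calls in conda run and its own print_exec/exec_with_retries helpers):

#!/usr/bin/env bash
# Pass 1: run the full suite, starting from a clean pytest cache.
if python -m pytest -v --cache-clear my_test.py; then
  exit 0
fi

# Pass 2: --lf re-runs only the tests that pass 1 recorded as failed;
# --last-failed-no-failures none keeps pytest from falling back to the
# full suite when the cache contains no recorded failures.
python -m pytest -v --lf --last-failed-no-failures none my_test.py
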
108 changes: 74 additions & 34 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -32,20 +32,77 @@ run_python_test () {
local env_prefix=$(env_name_or_prefix "${env_name}")

# shellcheck disable=SC2086
-   if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+   if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --cache-clear "${python_test_file}"; then
echo "[TEST] Python test suite PASSED: ${python_test_file}"
echo ""
echo ""
echo ""
+     return 0
+   fi
+
+   echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}"
+   echo ""
+   echo ""
+
+   # NOTE: Running large test suites may result in an OOM error that will cause
+   # the process to be prematurely killed. To work around this, when we re-run
+   # test suites, we only run tests that have failed in the previous round.
+   # This is enabled by using the pytest cache and the --lf flag.
+
+   # shellcheck disable=SC2086
+   if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --lf --last-failed-no-failures none "${python_test_file}"; then
+     echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
+     echo ""
+     echo ""
+     echo ""
else
echo "[TEST] Python test suite FAILED: ${python_test_file}"
echo "[TEST] Python test suite FAILED for some or all tests despite retries: ${python_test_file}"
echo ""
echo ""
echo ""
return 1
fi
}

+ __configure_fbgemm_gpu_test_cpu () {
+   ignored_tests=(
+     ./ssd_split_table_batched_embeddings_test.py
+     # These tests have non-CPU operators referenced in @given
+     ./uvm/copy_test.py
+     ./uvm/uvm_test.py
+   )
+ }
+
+ __configure_fbgemm_gpu_test_cuda () {
+   ignored_tests=(
+     ./ssd_split_table_batched_embeddings_test.py
+   )
+ }
+
+ __configure_fbgemm_gpu_test_rocm () {
+   # shellcheck disable=SC2155
+   local env_prefix=$(env_name_or_prefix "${env_name}")
+
+   echo "[TEST] Set environment variables for ROCm testing ..."
+   # shellcheck disable=SC2086
+   print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
+   # shellcheck disable=SC2086
+   print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
+
+   # Starting with MI250, AMD GPUs support per-process XNACK mode changes
+   # shellcheck disable=SC2155
+   local rocm_version=$(awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}' /opt/rocm/.info/version-dev)
+   if [ "$rocm_version" -ge 50700 ]; then
+     # shellcheck disable=SC2086
+     print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
+   fi
+
+   ignored_tests=(
+     ./ssd_split_table_batched_embeddings_test.py
+     # https://github.com/pytorch/FBGEMM/issues/1559
+     ./batched_unary_embeddings_test.py
+   )
+ }

################################################################################
# FBGEMM_GPU Test Functions
@@ -73,37 +130,17 @@ run_fbgemm_gpu_tests () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

-   # Enable ROCM testing if specified
-   if [ "$fbgemm_variant" == "rocm" ]; then
-     echo "[TEST] Set environment variables for ROCm testing ..."
-     # shellcheck disable=SC2086
-     print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
-     # shellcheck disable=SC2086
-     print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
-   fi
-
-   # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
-   local files_to_skip=(
-     ./ssd_split_table_batched_embeddings_test.py
-   )
-
if [ "$fbgemm_variant" == "cpu" ]; then
-     # These tests have non-CPU operators referenced in @given
-     local ignored_tests=(
-       ./uvm/copy_test.py
-       ./uvm/uvm_test.py
-     )
+     echo "Configuring for CPU-based testing ..."
+     __configure_fbgemm_gpu_test_cpu
+
elif [ "$fbgemm_variant" == "rocm" ]; then
-     local ignored_tests=(
-       # https://github.com/pytorch/FBGEMM/issues/1559
-       ./batched_unary_embeddings_test.py
-       ./tbe/backward_adagrad_test.py
-       ./tbe/backward_dense_test.py
-       ./tbe/backward_none_test.py
-       ./tbe/backward_sgd_test.py
-     )
+     echo "Configuring for ROCm-based testing ..."
+     __configure_fbgemm_gpu_test_rocm
+
else
-     local ignored_tests=()
+     echo "Configuring for CUDA-based testing ..."
+     __configure_fbgemm_gpu_test_cuda
fi

echo "[TEST] Installing pytest ..."
@@ -114,19 +151,22 @@ run_fbgemm_gpu_tests () {
(test_python_import_package "${env_name}" fbgemm_gpu) || return 1
(test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1

echo "[TEST] Enumerating test files ..."
echo "[TEST] Enumerating ALL test files ..."
# shellcheck disable=SC2155
local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
for f in $all_test_files; do echo "$f"; done
echo ""

echo "[TEST] Enumerating IGNORED test files ..."
for f in $ignored_tests; do echo "$f"; done
echo ""

# NOTE: Tests running on single CPU core with a less powerful testing GPU in
# GHA can take up to 5 hours.
for test_file in $all_test_files; do
if echo "${files_to_skip[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file known to be broken: ${test_file}"
elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
if echo "${ignored_tests[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file: ${test_file}"
echo ""
elif run_python_test "${env_name}" "${test_file}"; then
echo ""
else
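
As context for the ROCm gate above: the awk one-liner packs a dotted ROCm version into a single integer (major * 10000 + minor * 100 + patch), so 5.7.0 becomes 50700. A worked example, assuming a hypothetical version-dev string of "5.7.0-63" (the -F'[.-]' field separator also strips a trailing build suffix):

echo "5.7.0-63" | awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}'
# 5 * 10000 + 7 * 100 + 0 = 50700, which satisfies the -ge 50700 check
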
6 changes: 0 additions & 6 deletions fbgemm_gpu/test/jagged_tensor_ops_test.py
@@ -29,7 +29,6 @@
gpu_unavailable,
gradcheck,
optests,
-         skipIfRocm,
symint_vector_unsupported,
use_cpu_strategy,
)
@@ -46,7 +45,6 @@
gpu_unavailable,
gradcheck,
optests,
-         skipIfRocm,
symint_vector_unsupported,
use_cpu_strategy,
)
@@ -1630,7 +1628,6 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape(

assert output.size() == output_ref.size()

-     @skipIfRocm()
@settings(
verbosity=Verbosity.verbose,
max_examples=20,
@@ -2370,7 +2367,6 @@ def test_jagged_softmax(

torch.testing.assert_close(values.grad, values_ref.grad)

-     @skipIfRocm()
@given(
B=st.integers(10, 512),
M=st.integers(1, 32),
@@ -2669,7 +2665,6 @@ def test_jagged_slice_errors(
)

@unittest.skipIf(*gpu_unavailable)
-     @skipIfRocm()
@given(
B=st.integers(min_value=100, max_value=200),
F=st.integers(min_value=50, max_value=100),
@@ -2774,7 +2769,6 @@ def test_jagged_unique_indices(
self.assertTrue((output_start <= pos) and (pos < output_end))

@unittest.skipIf(*gpu_unavailable)
-     @skipIfRocm()
@given(
B=st.integers(min_value=100, max_value=200),
F=st.integers(min_value=50, max_value=100),
10 changes: 1 addition & 9 deletions fbgemm_gpu/test/permute_pooled_embedding_test.py
@@ -23,20 +23,13 @@
# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
if getattr(fbgemm_gpu, "open_source", False):
# pyre-ignore[21]
-     from test_utils import (
-         cpu_and_maybe_gpu,
-         gpu_unavailable,
-         on_arm_platform,
-         optests,
-         skipIfRocm,
-     )
+     from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests
else:
from fbgemm_gpu.test.test_utils import (
cpu_and_maybe_gpu,
gpu_unavailable,
on_arm_platform,
optests,
-         skipIfRocm,
)

typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable
@@ -145,7 +138,6 @@ def test_permutation(self, fwd_only: bool) -> None:
[6, 7, 8, 9, 0, 1, 5, 2, 3, 4],
)

-     @skipIfRocm()
@unittest.skipIf(*typed_on_arm_platform)
def test_permutation_autograd(self) -> None:
net = Net().to(self.device)
5 changes: 2 additions & 3 deletions fbgemm_gpu/test/sparse/index_select_test.py
@@ -23,14 +23,13 @@

if open_source:
# pyre-ignore[21]
-     from test_utils import gpu_available, skipIfRocm
+     from test_utils import gpu_available
else:
import fbgemm_gpu.sparse_ops # noqa: F401, E402
-     from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
+     from fbgemm_gpu.test.test_utils import gpu_available


class IndexSelectTest(unittest.TestCase):
-     @skipIfRocm()
@given(
N=st.integers(1, 32),
shape=st.one_of(
5 changes: 2 additions & 3 deletions fbgemm_gpu/test/sparse/pack_segments_test.py
@@ -20,9 +20,9 @@

if open_source:
# pyre-ignore[21]
-     from test_utils import gpu_available, skipIfRocm
+     from test_utils import gpu_available
else:
-     from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
+     from fbgemm_gpu.test.test_utils import gpu_available


def get_n_rand_num_summing_to_k(n: int, k: int) -> np.ndarray:
Expand Down Expand Up @@ -236,7 +236,6 @@ def test_pack_segments_smaller_max_len(
)
self.assertTrue(torch.equal(packed_tensor, packed_cuda.cpu()))

-     @skipIfRocm()
@given(
n=st.integers(2, 10),
k=st.integers(2, 10),
10 changes: 2 additions & 8 deletions fbgemm_gpu/test/tbe/training/backward_dense_test.py
@@ -31,14 +31,9 @@

if open_source:
# pyre-ignore[21]
-     from test_utils import gradcheck, optests, running_on_github, use_cpu_strategy
+     from test_utils import gradcheck, optests, use_cpu_strategy
else:
-     from fbgemm_gpu.test.test_utils import (
-         gradcheck,
-         optests,
-         running_on_github,
-         use_cpu_strategy,
-     )
+     from fbgemm_gpu.test.test_utils import gradcheck, optests, use_cpu_strategy


VERBOSITY: Verbosity = Verbosity.verbose
Expand Down Expand Up @@ -72,7 +67,6 @@ class BackwardDenseTest(unittest.TestCase):
deadline=None,
suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large],
)
-     @unittest.skipIf(*running_on_github)
def test_backward_dense( # noqa C901
self,
T: int,
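
Context for the removed guard: the test_utils flags used with unittest.skipIf are (condition, reason) pairs that get star-unpacked into the decorator, as the typed_gpu_unavailable: Tuple[bool, str] alias above shows. A minimal sketch of the idiom, with an invented flag value:

import unittest
from typing import Tuple

# Invented stand-in; test_utils defines running_on_github, gpu_unavailable,
# etc. as (condition, reason) pairs of this shape.
running_on_github: Tuple[bool, str] = (False, "Test is skipped on GitHub runners")


class DemoTest(unittest.TestCase):
    @unittest.skipIf(*running_on_github)  # unpacks to skipIf(False, "Test is ...")
    def test_runs_when_condition_is_false(self) -> None:
        self.assertTrue(True)
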
46 changes: 43 additions & 3 deletions fbgemm_gpu/test/test_utils.py
@@ -185,7 +185,7 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]:
def skipIfRocm(reason: str = "Test currently doesn't work on the ROCm stack") -> Any:
# pyre-fixme[3]: Return annotation cannot be `Any`.
# pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-     def skipIfRocmDecorator(fn: Callable) -> Any:
+     def decorator(fn: Callable) -> Any:
@wraps(fn)
# pyre-fixme[3]: Return annotation cannot be `Any`.
def wrapper(*args: Any, **kwargs: Any) -> Any:
@@ -196,15 +196,55 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:

return wrapper

-     return skipIfRocmDecorator
+     return decorator


+ # pyre-fixme[3]: Return annotation cannot be `Any`.
+ def skipIfRocmLessThan(min_version: int) -> Any:
+     # pyre-fixme[3]: Return annotation cannot be `Any`.
+     # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
+     def decorator(testfn: Callable) -> Any:
+         @wraps(testfn)
+         # pyre-fixme[3]: Return annotation cannot be `Any`.
+         def wrapper(*args: Any, **kwargs: Any) -> Any:
+             ROCM_VERSION_FILEPATH = "/opt/rocm/.info/version-dev"
+             if TEST_WITH_ROCM:
+                 # Fail if the ROCm version file is missing.
+                 if not os.path.isfile(ROCM_VERSION_FILEPATH):
+                     raise AssertionError(
+                         f"ROCm version file {ROCM_VERSION_FILEPATH} is missing!"
+                     )
+
+                 # Parse the version number from the file.
+                 with open(ROCM_VERSION_FILEPATH, "r") as file:
+                     version = file.read().strip()
+                     version = version.replace("-", "").split(".")
+                     version = (
+                         int(version[0]) * 10000 + int(version[1]) * 100 + int(version[2])
+                     )
+
+                 # Skip the test if the ROCm version is less than the minimum version.
+                 if version < min_version:
+                     raise unittest.SkipTest(
+                         f"Skip the test since the ROCm version is less than {min_version}"
+                     )
+                 else:
+                     testfn(*args, **kwargs)
+
+             else:
+                 testfn(*args, **kwargs)
+
+         return wrapper
+
+     return decorator


def symint_vector_unsupported() -> Tuple[bool, str]:
major, minor = torch.__version__.split(".")[0:2]
return (
int(major) < 2 or (int(major) == 2 and int(minor) < 1),
"""
-         dynamic shape support for this op needs to be on PyTorch 2.1 or
+         Dynamic shape support for this operator needs to be on PyTorch 2.1 or
newer with https://github.com/pytorch/pytorch/pull/101056
""",
)
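
A usage sketch for the new decorator (the test class and method names are invented; assumes fbgemm_gpu is installed). The threshold uses the same packed encoding as the bash-side check, so 50700 means ROCm 5.7.0; on non-ROCm builds the wrapped test runs normally:

import unittest

from fbgemm_gpu.test.test_utils import skipIfRocmLessThan


class XnackModeTest(unittest.TestCase):
    # Skipped when running under ROCm older than 5.7.0
    # (encoded as 5 * 10000 + 7 * 100 + 0 = 50700).
    @skipIfRocmLessThan(50700)
    def test_per_process_xnack(self) -> None:
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()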
