Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fbgemm_gpu] More flexible test retries #2328

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 74 additions & 34 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,77 @@ run_python_test () {
local env_prefix=$(env_name_or_prefix "${env_name}")

# shellcheck disable=SC2086
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --cache-clear "${python_test_file}"; then
echo "[TEST] Python test suite PASSED: ${python_test_file}"
echo ""
echo ""
echo ""
return 0
fi

echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}"
echo ""
echo ""

# NOTE: Running large test suites may result in OOM error that will cause the
# process to be prematurely killed. To work around this, when we re-run test
# suites, we only run tests that have failed in the previous round. This is
# enabled by using the pytest cache and the --lf flag.

# shellcheck disable=SC2086
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --lf --last-failed-no-failures none "${python_test_file}"; then
echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
echo ""
echo ""
echo ""
else
echo "[TEST] Python test suite FAILED: ${python_test_file}"
echo "[TEST] Python test suite FAILED for some or all tests despite retries: ${python_test_file}"
echo ""
echo ""
echo ""
return 1
fi
}

# Configure test exclusions for the CPU-only variant of FBGEMM_GPU.
# Side effect: sets the global `ignored_tests` array (bash functions share the
# caller's namespace; intentionally not `local` so run_fbgemm_gpu_tests can
# consume it after this function returns).
__configure_fbgemm_gpu_test_cpu () {
  ignored_tests=(
    # Non-test / currently-broken file, skipped in every variant
    ./ssd_split_table_batched_embeddings_test.py
    # These tests have non-CPU operators referenced in @given
    ./uvm/copy_test.py
    ./uvm/uvm_test.py
  )
}

# Configure test exclusions for the CUDA (default) variant of FBGEMM_GPU.
# Side effect: sets the global `ignored_tests` array consumed by the caller.
__configure_fbgemm_gpu_test_cuda () {
  ignored_tests=(
    # Non-test / currently-broken file, skipped in every variant
    ./ssd_split_table_batched_embeddings_test.py
  )
}

# Configure the environment and test exclusions for the ROCm variant.
# Side effects:
#   - Sets FBGEMM_TEST_WITH_ROCM / HIP_LAUNCH_BLOCKING (and HSA_XNACK on
#     ROCm >= 5.7.0) on the conda environment.
#   - Sets the global `ignored_tests` array consumed by the caller.
__configure_fbgemm_gpu_test_rocm () {
  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  echo "[TEST] Set environment variables for ROCm testing ..."
  # shellcheck disable=SC2086
  print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
  # shellcheck disable=SC2086
  print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1

  # Starting from MI250, AMD GPUs support per-process XNACK mode change.
  # Guard against a missing version file: without this check, awk fails and
  # the numeric [ -ge ] comparison below aborts with "integer expression
  # expected" under set -e.
  local rocm_version_file=/opt/rocm/.info/version-dev
  if [ -f "$rocm_version_file" ]; then
    # Pack MAJOR.MINOR.PATCH into one comparable integer, e.g. 5.7.0 -> 50700
    # shellcheck disable=SC2155
    local rocm_version=$(awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}' "$rocm_version_file")
    if [ "${rocm_version:-0}" -ge 50700 ]; then
      # shellcheck disable=SC2086
      print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
    fi
  else
    echo "[TEST] ROCm version file ${rocm_version_file} not found; skipping HSA_XNACK setup"
  fi

  ignored_tests=(
    # Non-test / currently-broken file, skipped in every variant
    ./ssd_split_table_batched_embeddings_test.py
    # https://github.com/pytorch/FBGEMM/issues/1559
    ./batched_unary_embeddings_test.py
  )
}

################################################################################
# FBGEMM_GPU Test Functions
Expand Down Expand Up @@ -73,37 +130,17 @@ run_fbgemm_gpu_tests () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Enable ROCM testing if specified
if [ "$fbgemm_variant" == "rocm" ]; then
echo "[TEST] Set environment variables for ROCm testing ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
fi

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
local files_to_skip=(
./ssd_split_table_batched_embeddings_test.py
)

if [ "$fbgemm_variant" == "cpu" ]; then
# These tests have non-CPU operators referenced in @given
local ignored_tests=(
./uvm/copy_test.py
./uvm/uvm_test.py
)
echo "Configuring for CPU-based testing ..."
__configure_fbgemm_gpu_test_cpu

elif [ "$fbgemm_variant" == "rocm" ]; then
local ignored_tests=(
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
./tbe/backward_adagrad_test.py
./tbe/backward_dense_test.py
./tbe/backward_none_test.py
./tbe/backward_sgd_test.py
)
echo "Configuring for ROCm-based testing ..."
__configure_fbgemm_gpu_test_rocm

else
local ignored_tests=()
echo "Configuring for CUDA-based testing ..."
__configure_fbgemm_gpu_test_cuda
fi

echo "[TEST] Installing pytest ..."
Expand All @@ -114,19 +151,22 @@ run_fbgemm_gpu_tests () {
(test_python_import_package "${env_name}" fbgemm_gpu) || return 1
(test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1

echo "[TEST] Enumerating test files ..."
echo "[TEST] Enumerating ALL test files ..."
# shellcheck disable=SC2155
local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
for f in $all_test_files; do echo "$f"; done
echo ""

echo "[TEST] Enumerating IGNORED test files ..."
for f in $ignored_tests; do echo "$f"; done
echo ""

# NOTE: Tests running on single CPU core with a less powerful testing GPU in
# GHA can take up to 5 hours.
for test_file in $all_test_files; do
if echo "${files_to_skip[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file known to be broken: ${test_file}"
elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
if echo "${ignored_tests[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file: ${test_file}"
echo ""
elif run_python_test "${env_name}" "${test_file}"; then
echo ""
else
Expand Down
6 changes: 0 additions & 6 deletions fbgemm_gpu/test/jagged_tensor_ops_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
gpu_unavailable,
gradcheck,
optests,
skipIfRocm,
symint_vector_unsupported,
use_cpu_strategy,
)
Expand All @@ -46,7 +45,6 @@
gpu_unavailable,
gradcheck,
optests,
skipIfRocm,
symint_vector_unsupported,
use_cpu_strategy,
)
Expand Down Expand Up @@ -1630,7 +1628,6 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape(

assert output.size() == output_ref.size()

@skipIfRocm()
@settings(
verbosity=Verbosity.verbose,
max_examples=20,
Expand Down Expand Up @@ -2370,7 +2367,6 @@ def test_jagged_softmax(

torch.testing.assert_close(values.grad, values_ref.grad)

@skipIfRocm()
@given(
B=st.integers(10, 512),
M=st.integers(1, 32),
Expand Down Expand Up @@ -2669,7 +2665,6 @@ def test_jagged_slice_errors(
)

@unittest.skipIf(*gpu_unavailable)
@skipIfRocm()
@given(
B=st.integers(min_value=100, max_value=200),
F=st.integers(min_value=50, max_value=100),
Expand Down Expand Up @@ -2774,7 +2769,6 @@ def test_jagged_unique_indices(
self.assertTrue((output_start <= pos) and (pos < output_end))

@unittest.skipIf(*gpu_unavailable)
@skipIfRocm()
@given(
B=st.integers(min_value=100, max_value=200),
F=st.integers(min_value=50, max_value=100),
Expand Down
10 changes: 1 addition & 9 deletions fbgemm_gpu/test/permute_pooled_embedding_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,13 @@
# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
if getattr(fbgemm_gpu, "open_source", False):
# pyre-ignore[21]
from test_utils import (
cpu_and_maybe_gpu,
gpu_unavailable,
on_arm_platform,
optests,
skipIfRocm,
)
from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests
else:
from fbgemm_gpu.test.test_utils import (
cpu_and_maybe_gpu,
gpu_unavailable,
on_arm_platform,
optests,
skipIfRocm,
)

typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable
Expand Down Expand Up @@ -145,7 +138,6 @@ def test_permutation(self, fwd_only: bool) -> None:
[6, 7, 8, 9, 0, 1, 5, 2, 3, 4],
)

@skipIfRocm()
@unittest.skipIf(*typed_on_arm_platform)
def test_permutation_autograd(self) -> None:
net = Net().to(self.device)
Expand Down
5 changes: 2 additions & 3 deletions fbgemm_gpu/test/sparse/index_select_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,13 @@

if open_source:
# pyre-ignore[21]
from test_utils import gpu_available, skipIfRocm
from test_utils import gpu_available
else:
import fbgemm_gpu.sparse_ops # noqa: F401, E402
from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
from fbgemm_gpu.test.test_utils import gpu_available


class IndexSelectTest(unittest.TestCase):
@skipIfRocm()
@given(
N=st.integers(1, 32),
shape=st.one_of(
Expand Down
5 changes: 2 additions & 3 deletions fbgemm_gpu/test/sparse/pack_segments_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@

if open_source:
# pyre-ignore[21]
from test_utils import gpu_available, skipIfRocm
from test_utils import gpu_available
else:
from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
from fbgemm_gpu.test.test_utils import gpu_available


def get_n_rand_num_summing_to_k(n: int, k: int) -> np.ndarray:
Expand Down Expand Up @@ -236,7 +236,6 @@ def test_pack_segments_smaller_max_len(
)
self.assertTrue(torch.equal(packed_tensor, packed_cuda.cpu()))

@skipIfRocm()
@given(
n=st.integers(2, 10),
k=st.integers(2, 10),
Expand Down
10 changes: 2 additions & 8 deletions fbgemm_gpu/test/tbe/training/backward_dense_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,9 @@

if open_source:
# pyre-ignore[21]
from test_utils import gradcheck, optests, running_on_github, use_cpu_strategy
from test_utils import gradcheck, optests, use_cpu_strategy
else:
from fbgemm_gpu.test.test_utils import (
gradcheck,
optests,
running_on_github,
use_cpu_strategy,
)
from fbgemm_gpu.test.test_utils import gradcheck, optests, use_cpu_strategy


VERBOSITY: Verbosity = Verbosity.verbose
Expand Down Expand Up @@ -72,7 +67,6 @@ class BackwardDenseTest(unittest.TestCase):
deadline=None,
suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large],
)
@unittest.skipIf(*running_on_github)
def test_backward_dense( # noqa C901
self,
T: int,
Expand Down
46 changes: 43 additions & 3 deletions fbgemm_gpu/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]:
def skipIfRocm(reason: str = "Test currently doesn't work on the ROCm stack") -> Any:
# pyre-fixme[3]: Return annotation cannot be `Any`.
# pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
def skipIfRocmDecorator(fn: Callable) -> Any:
def decorator(fn: Callable) -> Any:
@wraps(fn)
# pyre-fixme[3]: Return annotation cannot be `Any`.
def wrapper(*args: Any, **kwargs: Any) -> Any:
Expand All @@ -196,15 +196,55 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:

return wrapper

return skipIfRocmDecorator
return decorator


# pyre-fixme[3]: Return annotation cannot be `Any`.
def skipIfRocmLessThan(min_version: int) -> Any:
    """Decorator that skips a test when the ROCm stack is older than min_version.

    Args:
        min_version: ROCm version encoded as MAJOR * 10000 + MINOR * 100 + PATCH
            (e.g. 5.7.0 -> 50700), matching the encoding used by the CI bash
            scripts (awk -F'[.-]').

    Raises:
        AssertionError: when running under ROCm but the version file is missing.
        unittest.SkipTest: when the detected ROCm version is below min_version.

    Outside of ROCm (TEST_WITH_ROCM falsy) the wrapped test runs unconditionally.
    """
    # pyre-fixme[3]: Return annotation cannot be `Any`.
    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
    def decorator(testfn: Callable) -> Any:
        @wraps(testfn)
        # pyre-fixme[3]: Return annotation cannot be `Any`.
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            ROCM_VERSION_FILEPATH = "/opt/rocm/.info/version-dev"
            if TEST_WITH_ROCM:
                # Fail if ROCm version file is missing.
                if not os.path.isfile(ROCM_VERSION_FILEPATH):
                    raise AssertionError(
                        f"ROCm version file {ROCM_VERSION_FILEPATH} is missing!"
                    )

                # Parse MAJOR.MINOR.PATCH from the file.  Strip the "-<build>"
                # suffix BEFORE splitting on dots: for "5.7.0-123" the patch
                # must be 0, not 123.  (The previous replace("-", "") folded
                # the build number into the patch component, inflating the
                # computed version and disagreeing with the awk -F'[.-]'
                # parsing used by the CI scripts.)
                with open(ROCM_VERSION_FILEPATH, "r") as file:
                    raw_version = file.read().strip()
                parts = raw_version.split("-", 1)[0].split(".")
                version = (
                    int(parts[0]) * 10000 + int(parts[1]) * 100 + int(parts[2])
                )

                # Skip the test if the ROCm version is below the minimum.
                if version < min_version:
                    raise unittest.SkipTest(
                        f"Skip the test since the ROCm version is less than {min_version}"
                    )

            # Version is sufficient, or not running under ROCm at all.
            # Propagate the test function's return value.
            return testfn(*args, **kwargs)

        return wrapper

    return decorator


def symint_vector_unsupported() -> Tuple[bool, str]:
    """Return a (should_skip, reason) pair for tests requiring dynamic-shape
    (SymInt vector) support, which needs PyTorch >= 2.1.

    Intended for use with ``unittest.skipIf(*symint_vector_unsupported())``.
    The scraped diff left both the old and the new reason-message lines in
    place; this emits the coherent post-change version.
    """
    # NOTE: assumes the first two dotted components of torch.__version__ are
    # numeric (true for release and nightly builds) — TODO confirm for exotic
    # local version strings.
    major, minor = torch.__version__.split(".")[0:2]
    return (
        int(major) < 2 or (int(major) == 2 and int(minor) < 1),
        """
        Dynamic shape support for this operator needs to be on PyTorch 2.1 or
        newer with https://github.com/pytorch/pytorch/pull/101056
        """,
    )
Loading