[ci] CUDA tests speed up (taichi-dev#6516)
Issue: taichi-dev#6445 

### Brief Summary

Speed up CUDA CI: cap `device_memory_GB` at 0.3 for CUDA tests not marked `run_in_serial`, factor the per-arch test invocations into `run-it`/`RunIt` helpers, raise test-runner parallelism, and shrink or skip a few heavy tests.
feisuzhu authored and quadpixels committed May 13, 2023
1 parent 505f6f5 commit 5411efc
Showing 11 changed files with 63 additions and 38 deletions.
40 changes: 20 additions & 20 deletions .github/workflows/scripts/unix_test.sh
@@ -78,13 +78,23 @@ if [ -z "$TI_SKIP_CPP_TESTS" ]; then
     python3 tests/run_tests.py --cpp
 fi
 
+function run-it {
+    ARCH=$1
+    PARALLELISM=$2
+    KEYS=${3:-"not torch and not paddle"}
+
+    if [[ $TI_WANTED_ARCHS == *"$1"* ]]; then
+        python3 tests/run_tests.py -vr2 -t$PARALLELISM -k "$KEYS" -m "not run_in_serial" -a $ARCH
+        python3 tests/run_tests.py -vr2 -t1 -k "$KEYS" -m "run_in_serial" -a $ARCH
+    fi
+}
+
 if [ -z "$GPU_TEST" ]; then
     if [[ $PLATFORM == *"m1"* ]]; then
-        # Split per arch to avoid flaky test
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu
-        # Run metal and vulkan separately so that they don't use M1 chip simultaneously.
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a vulkan
-        python3 tests/run_tests.py -vr2 -t2 -k "not torch and not paddle" -a metal
+        run-it cpu 4
+        run-it vulkan 4
+        run-it metal 2
+
         python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS"
     else
         # Fail fast, give priority to the error-prone tests
@@ -94,21 +104,11 @@ if [ -z "$GPU_TEST" ]; then
         python3 tests/run_tests.py -vr2 -t4 -k "not paddle" -a "$TI_WANTED_ARCHS"
     fi
 else
-    # Split per arch to increase parallelism for linux GPU tests
-    if [[ $TI_WANTED_ARCHS == *"cuda"* ]]; then
-        # FIXME: suddenly tests exhibit OOM on nvidia driver 470 + RTX2060 cards, lower parallelism by 1 (4->3)
-        python3 tests/run_tests.py -vr2 -t3 -k "not torch and not paddle" -m "not run_in_serial" -a cuda
-        python3 tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a cuda
-    fi
-    if [[ $TI_WANTED_ARCHS == *"cpu"* ]]; then
-        python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a cpu
-    fi
-    if [[ $TI_WANTED_ARCHS == *"vulkan"* ]]; then
-        python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a vulkan
-    fi
-    if [[ $TI_WANTED_ARCHS == *"opengl"* ]]; then
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl
-    fi
+    run-it cuda 6
+    run-it cpu $(nproc)
+    run-it vulkan 8
+    run-it opengl 4
+
     python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS"
     # Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch, and CUDA version on linux will get error `Illegal Instruction`
 fi
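The helper codifies a convention the CI already used for CUDA: tests that hog the GPU carry a `run_in_serial` pytest marker and run one at a time (`-t1`), while everything else runs at full parallelism. A minimal sketch of how a test opts into the serial pass (test bodies here are placeholders):

```python
import pytest


@pytest.mark.run_in_serial
def test_huge_allocation():
    # Picked up by the second run-it invocation (-m "run_in_serial" -t1),
    # so it gets the GPU to itself.
    pass


def test_small_kernel():
    # Picked up by the first invocation (-m "not run_in_serial" -t$PARALLELISM)
    # and runs concurrently with other cheap tests.
    pass
```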
23 changes: 14 additions & 9 deletions .github/workflows/scripts/win_test.ps1
@@ -36,6 +36,13 @@ Invoke python tests/run_tests.py -vr2 -t1 -k "paddle" -a cpu
 # Disable paddle for the remaining test
 $env:TI_ENABLE_PADDLE = "0"
 
+function RunIt($arch, $parallelism) {
+    if ("$env:TI_WANTED_ARCHS".Contains($arch)) {
+        Invoke python tests/run_tests.py -vr2 -t"$parallelism" -k "not torch and not paddle" -m "not run_in_serial" -a $arch
+        Invoke python tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a $arch
+    }
+}
+
 if ("$env:TI_WANTED_ARCHS".Contains("cpu")) {
     # NOTE: Always test CPU with non-CUDA version of PyTorch,
     # since CUDA version of PyTorch will load a lot of CUDA libraries,
@@ -45,20 +52,18 @@ if ("$env:TI_WANTED_ARCHS".Contains("cpu")) {
     # This is a non-issue on Linux, since Linux overcommits.
     # TODO relax this when torch supports 3.10
     Invoke pip install "torch==1.12.1; python_version < '3.10'"
-    Invoke python tests/run_tests.py -vr2 "-t$(EstimateNumProcs)" -k "not torch and not paddle" -a cpu
+    RunIt cpu (EstimateNumProcs)
 }
 
 if ("$env:TI_WANTED_ARCHS".Contains("cuda")) {
     # TODO relax this when torch supports 3.10
     Invoke pip install "torch==1.10.1+cu113; python_version < '3.10'" -f https://download.pytorch.org/whl/cu113/torch_stable.html
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -m "not run_in_serial" -a cuda
-    Invoke python tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a cuda
-}
-if ("$env:TI_WANTED_ARCHS".Contains("opengl")) {
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl
-}
-if ("$env:TI_WANTED_ARCHS".Contains("vulkan")) {
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a vulkan
+    RunIt cuda 6
 }
 
+RunIt opengl 4
+RunIt vulkan 4
+
 Invoke python tests/run_tests.py -vr2 -t1 -k "torch" -a "$env:TI_WANTED_ARCHS"
 
 if ("$env:TI_RUN_RELEASE_TESTS" -eq "1" -and -not "$env:TI_LITE_TEST") {
1 change: 1 addition & 0 deletions requirements_test.txt
@@ -10,3 +10,4 @@ requests==2.26
 matplotlib
 cffi
 scipy
+setproctitle
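`setproctitle` is presumably added so the test runner can label its worker processes, which helps when many of them share a machine; a minimal sketch of the API (the title string is illustrative):

```python
from setproctitle import setproctitle

# Rename the current process as shown by ps/top, making parallel test
# workers distinguishable from one another.
setproctitle('taichi-test-worker')
```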
12 changes: 11 additions & 1 deletion tests/python/conftest.py
@@ -6,8 +6,18 @@
 
 
 @pytest.fixture(autouse=True)
-def wanted_arch(req_arch, req_options):
+def wanted_arch(request, req_arch, req_options):
     if req_arch is not None:
+        if req_arch == ti.cuda:
+            if not request.node.get_closest_marker('run_in_serial'):
+                # Optimization only applies to non-serial tests, since serial tests
+                # are picked out exactly because of extensive resource consumption.
+                # Separation of serial/non-serial tests is done by the test runner
+                # through `-m run_in_serial` / `-m not run_in_serial`.
+                req_options = {'device_memory_GB': 0.3, **req_options}
+            else:
+                # Serial tests run without aggressive resource optimization
+                req_options = {'device_memory_GB': 1, **req_options}
         ti.init(arch=req_arch, enable_fallback=False, **req_options)
     yield
     if req_arch is not None:
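`device_memory_GB` controls how much CUDA memory Taichi preallocates at `ti.init`, so capping it at 0.3 for the parallel group is what lets several test processes share one card. The init call the fixture performs for a non-serial CUDA test, written out standalone (a sketch, not repo code):

```python
import taichi as ti

# Non-serial CUDA test setup: preallocate only 0.3 GB of device memory so
# multiple pytest workers fit on a single GPU.
ti.init(arch=ti.cuda, enable_fallback=False, device_memory_GB=0.3)
```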
4 changes: 3 additions & 1 deletion tests/python/test_function.py
@@ -343,7 +343,9 @@ def test_ref_atomic():
     # Please remove this guard when you fix this issue
     cur_arch = ti.lang.impl.get_runtime().prog.config().arch
     if cur_arch == ti.cuda and ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip(
+            'Skip this test on Pascal (and potentially older) architecture, ask turbo0628/Proton for more information'
+        )
 
     @ti.experimental.real_func
     def foo(a: ti.ref(ti.f32)):
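Swapping `return` for `pytest.skip` changes reporting, not just style: an early `return` counts as a pass and silently hides that nothing ran, while `pytest.skip` raises an exception that shows up as SKIPPED in the summary. A self-contained sketch (`have_sm70` is a hypothetical stand-in for the compute-capability check):

```python
import pytest


def have_sm70() -> bool:
    # Stand-in for ti.lang.impl.get_cuda_compute_capability() >= 70.
    return False


def test_old_style():
    if not have_sm70():
        return  # silently counted as a pass
    assert False  # unreachable, yet the test "passes"


def test_new_style():
    if not have_sm70():
        pytest.skip('requires compute capability >= 7.0')  # shown as SKIPPED
```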
2 changes: 1 addition & 1 deletion tests/python/test_global_thread_idx.py
@@ -6,7 +6,7 @@
 
 @test_utils.test(arch=ti.cuda)
 def test_global_thread_idx():
-    n = 2048
+    n = 128
     x = ti.field(ti.i32, shape=n)
 
     @ti.kernel
2 changes: 2 additions & 0 deletions tests/python/test_memory.py
@@ -2,12 +2,14 @@
 import os
 
 import psutil
+import pytest
 from taichi.lang.misc import get_host_arch_list
 
 import taichi as ti
 from tests import test_utils
 
 
+@pytest.mark.run_in_serial
 @test_utils.test(arch=ti.cuda)
 def test_memory_allocate():
     HUGE_SIZE = 1024**2 * 128
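One assumption behind `@pytest.mark.run_in_serial` is that the marker is declared somewhere; otherwise recent pytest versions emit `PytestUnknownMarkWarning`. If it is not already listed in the project's pytest configuration, a conftest hook along these lines registers it (a sketch, not taken from the repo):

```python
# conftest.py
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'run_in_serial: resource-hungry test, executed by the runner with -t1')
```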
5 changes: 5 additions & 0 deletions tests/python/test_mpm88.py
@@ -1,5 +1,7 @@
 import os
 
+import pytest
+
 import taichi as ti
 from tests import test_utils
 
@@ -100,11 +102,13 @@ def substep():
                      rel=1e-2)
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test()
 def test_mpm88():
     run_mpm88_test()
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test(real_matrix=True, real_matrix_scalarize=True)
 def test_mpm88_real_matrix_scalarize():
     run_mpm88_test()
@@ -116,6 +120,7 @@ def _is_appveyor():
     return os.getenv('APPVEYOR', '').lower() == 'true'
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl])
 def test_mpm88_numpy_and_ndarray():
     import numpy as np
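The `skipif` guards above look buggy at first glance but are deliberate: `os.environ.get('TI_LITE_TEST') or '0'` always yields a string such as `'1'` or `'0'`, and pytest evaluates string conditions with `eval`, so the tests are skipped exactly when `TI_LITE_TEST` is set to a truthy number. A plain-boolean spelling of roughly the same guard, if string conditions feel too magical:

```python
import os

import pytest


@pytest.mark.skipif((os.environ.get('TI_LITE_TEST') or '0') != '0',
                    reason='Lite test')
def test_example():
    # Placeholder body; the guard expression is the point of the sketch.
    pass
```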
2 changes: 1 addition & 1 deletion tests/python/test_ndarray.py
@@ -319,7 +319,7 @@ def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())
     b = ti.Vector.ndarray(3, ti.f32, 12)
 
-    n = 1000
+    n = 100
     for i in range(n):
         c_a = copy.deepcopy(a)
         c_b = copy.deepcopy(b)
7 changes: 5 additions & 2 deletions tests/python/test_simt.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 from pytest import approx
 from taichi.lang.simt import subgroup
 
@@ -271,7 +272,8 @@ def foo():
 def test_match_any():
     # Skip match_any test for Pascal
     if ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip('match_any not supported on Pascal')
+
     a = ti.field(dtype=ti.i32, shape=32)
     b = ti.field(dtype=ti.u32, shape=32)
 
@@ -297,7 +299,8 @@ def foo():
 def test_match_all():
     # Skip match_all test for Pascal
     if ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip('match_all not supported on Pascal')
+
     a = ti.field(dtype=ti.i32, shape=32)
     b = ti.field(dtype=ti.u32, shape=32)
     c = ti.field(dtype=ti.u32, shape=32)
3 changes: 0 additions & 3 deletions tests/run_tests.py
@@ -195,9 +195,6 @@ def _test_python(args):
     except NotImplementedError:
         threads = 2
 
-    if not os.environ.get('TI_DEVICE_MEMORY_GB'):
-        os.environ['TI_DEVICE_MEMORY_GB'] = '1.0'  # Discussion: #769
-
     env_threads = os.environ.get('TI_TEST_THREADS', '')
     threads = args.threads or env_threads or threads
     print(f'Starting {threads} testing thread(s)...')
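Dropping the blanket `TI_DEVICE_MEMORY_GB=1.0` default works together with the 0.3 GB cap in conftest.py: per-worker preallocation is what bounds GPU-side parallelism. A back-of-envelope check, assuming roughly 6 GB usable on the RTX 2060 boards the removed FIXME mentioned:

```python
# Rough budget: preallocated device memory per worker caps how many test
# workers fit on one GPU. The 6 GB figure is an assumption, not a measurement.
USABLE_GB = 6.0

old_workers = round(USABLE_GB / 1.0)  # old 1.0 GB default: ~6 workers at best
new_workers = round(USABLE_GB / 0.3)  # 0.3 GB cap: ample headroom for -t6
print(old_workers, new_workers)  # 6 20
```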
