[ci] CUDA tests speed up (taichi-dev#6516)
Issue: taichi-dev#6445 

### Brief Summary

Speed up CUDA CI: cap `device_memory_GB` at 0.3 for CUDA tests not marked `run_in_serial`, factor the per-arch test invocations into `run-it`/`RunIt` helpers, raise test-runner parallelism, and shrink or skip a few heavy tests.
feisuzhu authored and quadpixels committed May 13, 2023
1 parent 505f6f5 commit 5411efc
Showing 11 changed files with 63 additions and 38 deletions.
40 changes: 20 additions & 20 deletions .github/workflows/scripts/unix_test.sh
@@ -78,13 +78,23 @@ if [ -z "$TI_SKIP_CPP_TESTS" ]; then
     python3 tests/run_tests.py --cpp
 fi
 
+function run-it {
+    ARCH=$1
+    PARALLELISM=$2
+    KEYS=${3:-"not torch and not paddle"}
+
+    if [[ $TI_WANTED_ARCHS == *"$1"* ]]; then
+        python3 tests/run_tests.py -vr2 -t$PARALLELISM -k "$KEYS" -m "not run_in_serial" -a $ARCH
+        python3 tests/run_tests.py -vr2 -t1 -k "$KEYS" -m "run_in_serial" -a $ARCH
+    fi
+}
+
 if [ -z "$GPU_TEST" ]; then
     if [[ $PLATFORM == *"m1"* ]]; then
-        # Split per arch to avoid flaky test
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu
-        # Run metal and vulkan separately so that they don't use M1 chip simultaneously.
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a vulkan
-        python3 tests/run_tests.py -vr2 -t2 -k "not torch and not paddle" -a metal
+        run-it cpu 4
+        run-it vulkan 4
+        run-it metal 2
+
         python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS"
     else
         # Fail fast, give priority to the error-prone tests
@@ -94,21 +104,11 @@ if [ -z "$GPU_TEST" ]; then
         python3 tests/run_tests.py -vr2 -t4 -k "not paddle" -a "$TI_WANTED_ARCHS"
     fi
 else
-    # Split per arch to increase parallelism for linux GPU tests
-    if [[ $TI_WANTED_ARCHS == *"cuda"* ]]; then
-        # FIXME: suddenly tests exhibit OOM on nvidia driver 470 + RTX2060 cards, lower parallelism by 1 (4->3)
-        python3 tests/run_tests.py -vr2 -t3 -k "not torch and not paddle" -m "not run_in_serial" -a cuda
-        python3 tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a cuda
-    fi
-    if [[ $TI_WANTED_ARCHS == *"cpu"* ]]; then
-        python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a cpu
-    fi
-    if [[ $TI_WANTED_ARCHS == *"vulkan"* ]]; then
-        python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a vulkan
-    fi
-    if [[ $TI_WANTED_ARCHS == *"opengl"* ]]; then
-        python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl
-    fi
+    run-it cuda 6
+    run-it cpu $(nproc)
+    run-it vulkan 8
+    run-it opengl 4
+
     python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS"
     # Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch, and CUDA version on linux will get error `Illegal Instruction`
 fi
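The helper codifies a convention the CI already used for CUDA: tests that hog the GPU carry a `run_in_serial` pytest marker and run one at a time (`-t1`), while everything else runs at full parallelism. A minimal sketch of how a test opts into the serial pass (test bodies here are placeholders):

```python
import pytest


@pytest.mark.run_in_serial
def test_huge_allocation():
    # Picked up by the second run-it invocation (-m "run_in_serial" -t1),
    # so it gets the GPU to itself.
    pass


def test_small_kernel():
    # Picked up by the first invocation (-m "not run_in_serial" -t$PARALLELISM)
    # and runs concurrently with other cheap tests.
    pass
```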
23 changes: 14 additions & 9 deletions .github/workflows/scripts/win_test.ps1
@@ -36,6 +36,13 @@ Invoke python tests/run_tests.py -vr2 -t1 -k "paddle" -a cpu
 # Disable paddle for the remaining test
 $env:TI_ENABLE_PADDLE = "0"
 
+function RunIt($arch, $parallelism) {
+    if ("$env:TI_WANTED_ARCHS".Contains($arch)) {
+        Invoke python tests/run_tests.py -vr2 -t"$parallelism" -k "not torch and not paddle" -m "not run_in_serial" -a $arch
+        Invoke python tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a $arch
+    }
+}
+
 if ("$env:TI_WANTED_ARCHS".Contains("cpu")) {
     # NOTE: Always test CPU with non-CUDA version of PyTorch,
     # since CUDA version of PyTorch will load a lot of CUDA libraries,
@@ -45,20 +52,18 @@ if ("$env:TI_WANTED_ARCHS".Contains("cpu")) {
     # This is a non-issue on Linux, since Linux overcommits.
     # TODO relax this when torch supports 3.10
     Invoke pip install "torch==1.12.1; python_version < '3.10'"
-    Invoke python tests/run_tests.py -vr2 "-t$(EstimateNumProcs)" -k "not torch and not paddle" -a cpu
+    RunIt cpu (EstimateNumProcs)
 }
 
 if ("$env:TI_WANTED_ARCHS".Contains("cuda")) {
     # TODO relax this when torch supports 3.10
     Invoke pip install "torch==1.10.1+cu113; python_version < '3.10'" -f https://download.pytorch.org/whl/cu113/torch_stable.html
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -m "not run_in_serial" -a cuda
-    Invoke python tests/run_tests.py -vr2 -t1 -k "not torch and not paddle" -m "run_in_serial" -a cuda
-}
-if ("$env:TI_WANTED_ARCHS".Contains("opengl")) {
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl
-}
-if ("$env:TI_WANTED_ARCHS".Contains("vulkan")) {
-    Invoke python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a vulkan
+    RunIt cuda 6
 }
 
+RunIt opengl 4
+RunIt vulkan 4
+
 Invoke python tests/run_tests.py -vr2 -t1 -k "torch" -a "$env:TI_WANTED_ARCHS"
 
 if ("$env:TI_RUN_RELEASE_TESTS" -eq "1" -and -not "$env:TI_LITE_TEST") {
1 change: 1 addition & 0 deletions requirements_test.txt
@@ -10,3 +10,4 @@ requests==2.26
 matplotlib
 cffi
 scipy
+setproctitle
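`setproctitle` is presumably added so the test runner can label its worker processes, which helps when many of them share a machine; a minimal sketch of the API (the title string is illustrative):

```python
from setproctitle import setproctitle

# Rename the current process as shown by ps/top, making parallel test
# workers distinguishable from one another.
setproctitle('taichi-test-worker')
```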
12 changes: 11 additions & 1 deletion tests/python/conftest.py
@@ -6,8 +6,18 @@
 
 
 @pytest.fixture(autouse=True)
-def wanted_arch(req_arch, req_options):
+def wanted_arch(request, req_arch, req_options):
     if req_arch is not None:
+        if req_arch == ti.cuda:
+            if not request.node.get_closest_marker('run_in_serial'):
+                # Optimization only applies to non-serial tests, since serial tests
+                # are picked out exactly because of extensive resource consumption.
+                # Separation of serial/non-serial tests is done by the test runner
+                # through `-m run_in_serial` / `-m not run_in_serial`.
+                req_options = {'device_memory_GB': 0.3, **req_options}
+            else:
+                # Serial tests run without aggressive resource optimization
+                req_options = {'device_memory_GB': 1, **req_options}
         ti.init(arch=req_arch, enable_fallback=False, **req_options)
     yield
     if req_arch is not None:
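`device_memory_GB` controls how much CUDA memory Taichi preallocates at `ti.init`, so capping it at 0.3 for the parallel group is what lets several test processes share one card. The init call the fixture performs for a non-serial CUDA test, written out standalone (a sketch, not repo code):

```python
import taichi as ti

# Non-serial CUDA test setup: preallocate only 0.3 GB of device memory so
# multiple pytest workers fit on a single GPU.
ti.init(arch=ti.cuda, enable_fallback=False, device_memory_GB=0.3)
```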
4 changes: 3 additions & 1 deletion tests/python/test_function.py
@@ -343,7 +343,9 @@ def test_ref_atomic():
     # Please remove this guard when you fix this issue
     cur_arch = ti.lang.impl.get_runtime().prog.config().arch
     if cur_arch == ti.cuda and ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip(
+            'Skip this test on Pascal (and potentially older) architecture, ask turbo0628/Proton for more information'
+        )
 
     @ti.experimental.real_func
     def foo(a: ti.ref(ti.f32)):
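Swapping `return` for `pytest.skip` changes reporting, not just style: an early `return` counts as a pass and silently hides that nothing ran, while `pytest.skip` raises an exception that shows up as SKIPPED in the summary. A self-contained sketch (`have_sm70` is a hypothetical stand-in for the compute-capability check):

```python
import pytest


def have_sm70() -> bool:
    # Stand-in for ti.lang.impl.get_cuda_compute_capability() >= 70.
    return False


def test_old_style():
    if not have_sm70():
        return  # silently counted as a pass
    assert False  # unreachable, yet the test "passes"


def test_new_style():
    if not have_sm70():
        pytest.skip('requires compute capability >= 7.0')  # shown as SKIPPED
```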
2 changes: 1 addition & 1 deletion tests/python/test_global_thread_idx.py
@@ -6,7 +6,7 @@
 
 @test_utils.test(arch=ti.cuda)
 def test_global_thread_idx():
-    n = 2048
+    n = 128
     x = ti.field(ti.i32, shape=n)
 
     @ti.kernel
2 changes: 2 additions & 0 deletions tests/python/test_memory.py
@@ -2,12 +2,14 @@
 import os
 
 import psutil
+import pytest
 from taichi.lang.misc import get_host_arch_list
 
 import taichi as ti
 from tests import test_utils
 
 
+@pytest.mark.run_in_serial
 @test_utils.test(arch=ti.cuda)
 def test_memory_allocate():
     HUGE_SIZE = 1024**2 * 128
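One assumption behind `@pytest.mark.run_in_serial` is that the marker is declared somewhere; otherwise recent pytest versions emit `PytestUnknownMarkWarning`. If it is not already listed in the project's pytest configuration, a conftest hook along these lines registers it (a sketch, not taken from the repo):

```python
# conftest.py
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'run_in_serial: resource-hungry test, executed by the runner with -t1')
```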
5 changes: 5 additions & 0 deletions tests/python/test_mpm88.py
@@ -1,5 +1,7 @@
 import os
 
+import pytest
+
 import taichi as ti
 from tests import test_utils
 
@@ -100,11 +102,13 @@ def substep():
                      rel=1e-2)
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test()
 def test_mpm88():
     run_mpm88_test()
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test(real_matrix=True, real_matrix_scalarize=True)
 def test_mpm88_real_matrix_scalarize():
     run_mpm88_test()
@@ -116,6 +120,7 @@ def _is_appveyor():
     return os.getenv('APPVEYOR', '').lower() == 'true'
 
 
+@pytest.mark.skipif(os.environ.get('TI_LITE_TEST') or '0', reason='Lite test')
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl])
 def test_mpm88_numpy_and_ndarray():
     import numpy as np
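The `skipif` guards above look buggy at first glance but are deliberate: `os.environ.get('TI_LITE_TEST') or '0'` always yields a string such as `'1'` or `'0'`, and pytest evaluates string conditions with `eval`, so the tests are skipped exactly when `TI_LITE_TEST` is set to a truthy number. A plain-boolean spelling of roughly the same guard, if string conditions feel too magical:

```python
import os

import pytest


@pytest.mark.skipif((os.environ.get('TI_LITE_TEST') or '0') != '0',
                    reason='Lite test')
def test_example():
    # Placeholder body; the guard expression is the point of the sketch.
    pass
```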
2 changes: 1 addition & 1 deletion tests/python/test_ndarray.py
@@ -319,7 +319,7 @@ def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())
     b = ti.Vector.ndarray(3, ti.f32, 12)
 
-    n = 1000
+    n = 100
     for i in range(n):
         c_a = copy.deepcopy(a)
         c_b = copy.deepcopy(b)
7 changes: 5 additions & 2 deletions tests/python/test_simt.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 from pytest import approx
 from taichi.lang.simt import subgroup
 
@@ -271,7 +272,8 @@ def foo():
 def test_match_any():
     # Skip match_any test for Pascal
     if ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip('match_any not supported on Pascal')
+
     a = ti.field(dtype=ti.i32, shape=32)
     b = ti.field(dtype=ti.u32, shape=32)
 
@@ -297,7 +299,8 @@ def foo():
 def test_match_all():
     # Skip match_all test for Pascal
     if ti.lang.impl.get_cuda_compute_capability() < 70:
-        return
+        pytest.skip('match_all not supported on Pascal')
+
     a = ti.field(dtype=ti.i32, shape=32)
     b = ti.field(dtype=ti.u32, shape=32)
     c = ti.field(dtype=ti.u32, shape=32)
3 changes: 0 additions & 3 deletions tests/run_tests.py
@@ -195,9 +195,6 @@ def _test_python(args):
     except NotImplementedError:
         threads = 2
 
-    if not os.environ.get('TI_DEVICE_MEMORY_GB'):
-        os.environ['TI_DEVICE_MEMORY_GB'] = '1.0'  # Discussion: #769
-
     env_threads = os.environ.get('TI_TEST_THREADS', '')
     threads = args.threads or env_threads or threads
     print(f'Starting {threads} testing thread(s)...')
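Dropping the blanket `TI_DEVICE_MEMORY_GB=1.0` default works together with the 0.3 GB cap in conftest.py: per-worker preallocation is what bounds GPU-side parallelism. A back-of-envelope check, assuming roughly 6 GB usable on the RTX 2060 boards the removed FIXME mentioned:

```python
# Rough budget: preallocated device memory per worker caps how many test
# workers fit on one GPU. The 6 GB figure is an assumption, not a measurement.
USABLE_GB = 6.0

old_workers = round(USABLE_GB / 1.0)  # old 1.0 GB default: ~6 workers at best
new_workers = round(USABLE_GB / 0.3)  # 0.3 GB cap: ample headroom for -t6
print(old_workers, new_workers)  # 6 20
```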
