Build DML in Windows GPU CI pipeline (microsoft#22869)

### Description Add a new stage that builds CUDA and DML together in the Windows GPU CI pipeline (PR checks) to prevent regressions introduced by new CUDA tests. Rename the test suites of all tests under test/providers/cuda/test_cases to use the CudaEp prefix so that they can be skipped easily. ### Motivation and Context 1. CudaNhwcEP is added by default when using the CUDA EP. 2. If onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS is enabled, the tests in test/providers/cuda/test_cases are added too. ### To do Add --enable_pybind to the new stage. Currently, --enable_pybind triggers some Python tests, such as onnxruntime_test_python.py, which uses the get_available_providers() API. More discussion is needed to decide how to make it work.
intel · Dec 11, 2024 · a192a8a · a192a8a
1 parent 23f88f3
commit a192a8a
Show file tree

Hide file tree

Showing 15 changed files with 81 additions and 38 deletions.
diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@@ -27,13 +27,15 @@
 import java.util.HashMap;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
 import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
 
 public class ProviderOptionsTest {
   private static final OrtEnvironment env = TestHelpers.getOrtEnvironment();
 
   @Test
   @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
+  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testCUDAOptions() throws OrtException {
     // Test standard options
     OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@@ -61,6 +63,7 @@ public void testCUDAOptions() throws OrtException {
 
   @Test
   @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
+  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testTensorRT() throws OrtException {
     // Test standard options
     OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);

diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
@@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();
 
 namespace test {
 namespace cuda {
-TEST(CUDA_EP_Unittest, All) {
+TEST(CudaEpUnittest, All) {
   onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
   ep.TestAll();
 }

diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
@@ -11,7 +11,7 @@
 namespace onnxruntime {
 namespace test {
 
-TEST(AllocatorTest, CUDAAllocatorTest) {
+TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   // ensure CUDA device is available.
@@ -77,7 +77,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
 }
 
 // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
-TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
+TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   size_t free = 0;

diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
@@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
 namespace onnxruntime {
 namespace test {
 
-TEST(AttentionKernelOptionsTest, NonZeroValue) {
+TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
   {
     AttentionKernelOptions options;
     int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
@@ -156,7 +156,7 @@ TEST(AttentionKernelOptionsTest, NonZeroValue) {
 }
 
 // Test all environment variables take effect when option value is 0.
-TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
+TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{
@@ -186,7 +186,7 @@ TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
 }
 
 // Test default min sequence lengths when environment variables are not set.
-TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
+TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{

diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
   }
 }
 
-TEST(TestBeamSearch, TopK) {
+TEST(CudaEpTestBeamSearch, TopK) {
   int32_t batch_size = 4;
   int32_t beam_size = 4;
   int32_t vocab_size = 50257;

diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
 }
 
 // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
-TEST(BlkQ4_GEMM, PrepackSm80Test) {
+TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -263,7 +263,7 @@ TEST(BlkQ4_GEMM, PrepackSm80Test) {
   testPrepack<true, false>(256, 256);
 }
 
-TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -292,7 +292,7 @@ TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
 }
 
-TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -305,7 +305,7 @@ TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
 }
 
-TEST(BlkQ4_GEMM, Sm80SmallMTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -326,7 +326,7 @@ TEST(BlkQ4_GEMM, Sm80SmallMTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
 }
 
-TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
+TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported

diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@@ -19,7 +19,7 @@ namespace cuda {
 namespace test {
 // TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
 // we should migrate this test from CudaEP unit test to CudaStream unit test.
-TEST(TestDeferredRelease, WithArena) {
+TEST(CudaEpTestDeferredRelease, WithArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
@@ -52,7 +52,7 @@ TEST(TestDeferredRelease, WithArena) {
   ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
 }
 
-TEST(TestDeferredRelease, WithoutArena) {
+TEST(CudaEpTestDeferredRelease, WithoutArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);

diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
 }
 }  // namespace
 
-TEST(CudaUtilsTest, FillCorrectness) {
+TEST(CudaEpUnittest, FillCorrectness) {
   TestFillCorrectness<int8_t>(1 << 20, 1);
   TestFillCorrectness<int16_t>(1 << 20, 2);
   TestFillCorrectness<int32_t>(1 << 20, 3);

diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@@ -10,7 +10,7 @@ namespace onnxruntime {
 namespace cuda {
 namespace test {
 
-TEST(CudaGemmOptions, TestDefaultOptions) {
+TEST(CudaEpGemmOptions, TestDefaultOptions) {
   HalfGemmOptions gemm_options;
   ASSERT_FALSE(gemm_options.IsCompute16F());
 #if defined(USE_CUDA)
@@ -22,7 +22,7 @@ TEST(CudaGemmOptions, TestDefaultOptions) {
 #endif
 }
 
-TEST(CudaGemmOptions, TestCompute16F) {
+TEST(CudaEpGemmOptions, TestCompute16F) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(1);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -35,7 +35,7 @@ TEST(CudaGemmOptions, TestCompute16F) {
 #endif
 }
 
-TEST(CudaGemmOptions, NoReducedPrecision) {
+TEST(CudaEpGemmOptions, NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(2);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -48,7 +48,7 @@ TEST(CudaGemmOptions, NoReducedPrecision) {
 #endif
 }
 
-TEST(CudaGemmOptions, Pedantic) {
+TEST(CudaEpGemmOptions, Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(4);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -61,7 +61,7 @@ TEST(CudaGemmOptions, Pedantic) {
 #endif
 }
 
-TEST(CudaGemmOptions, Compute16F_Pedantic) {
+TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(5);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -74,7 +74,7 @@ TEST(CudaGemmOptions, Compute16F_Pedantic) {
 #endif
 }
 
-TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
+TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(3);
   ASSERT_TRUE(gemm_options.IsCompute16F());

diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
@@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
   }
 }
 
-TEST(TestGreedySearch, TopOne) {
+TEST(CudaEpTestGreedySearch, TopOne) {
   int32_t batch_size = 4;
   int32_t vocab_size = 50257;
   int32_t batch_x_vocab = batch_size * vocab_size;

diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
 }
 }  // namespace
 
-TEST(ReductionFunctionsTest, ReduceRowToScalar) {
+TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
   TestReduceRowToScalarApis(3);
   TestReduceRowToScalarApis(19);
   TestReduceRowToScalarApis(123);
@@ -188,7 +188,7 @@ TEST(ReductionFunctionsTest, ReduceRowToScalar) {
   TestReduceRowToScalarApis(941736, 2e-4f);
 }
 
-TEST(ReductionFunctionsTest, ReduceRowsToRow) {
+TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceRowsToRow(m, n, true);
@@ -197,15 +197,15 @@ TEST(ReductionFunctionsTest, ReduceRowsToRow) {
   }
 }
 
-TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
+TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceColumnsToColumn(m, n);
     }
   }
 }
 
-TEST(ReductionFunctionsTest, BufferOffsets) {
+TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -240,7 +240,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
   }
 }
 
-TEST(ReductionFunctionsTest, InvalidBufferSize) {
+TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -262,7 +262,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
   ASSERT_FALSE(status.IsOK());
 }
 
-TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
+TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
   auto test_get_applicable_matrix_reduction =
       [](cudnnReduceTensorOp_t cudnn_op,
          const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,

diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
@@ -218,16 +218,32 @@ jobs:
       - powershell: |
          python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
          Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
-
         workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
         displayName: 'Install onnxruntime wheel'
 
   - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
-      - powershell: |
-         python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022"  --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests'
+      - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
+        - powershell: |
+           python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests excluding CUDA tests'
+          env:
+            NO_CUDA_TEST: '1'
+            GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
+            PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
+        - powershell: |
+            python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests excluding DML tests'
+          env:
+            NO_DML_TEST: '1'
+            GTEST_FILTER: '-*cpu_*models*'
+            PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
+      - ${{ else }}:
+        - powershell: |
+           python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+          workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+          displayName: 'Run tests'
 
   - ${{ if eq(parameters.GenerateDocumentation, true) }}:
     - task: PythonScript@0

diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -413,7 +413,7 @@ stages:
               workingDirectory: '$(Build.BinariesDirectory)'
             env:
               NO_CUDA_TEST: '1'
-              GTEST_FILTER: -*CudaNhwcTypedTest*
+              GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
           - task: PythonScript@0
             displayName: 'test excludes DML'
             condition: and(succeeded(), eq('${{ parameters.runTests}}', true))

diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
@@ -62,4 +62,28 @@ stages:
         RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
         ORT_EP_NAME: CUDA
         WITH_CACHE: true
-        MachinePool: onnxruntime-Win2022-GPU-A10
+        MachinePool: onnxruntime-Win2022-GPU-A10
+
+- stage: cuda_dml
+  dependsOn: []
+  jobs:
+    - template: templates/jobs/win-ci-vs-2022-job.yml
+      parameters:
+        BuildConfig: 'RelWithDebInfo'
+        EnvSetupScript: setup_env_cuda.bat
+        buildArch: x64
+        additionalBuildFlags: >-
+          --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
+          --enable_cuda_profiling --enable_transformers_tool_test
+          --use_dml
+          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+        msbuildPlatform: x64
+        isX86: false
+        job_name_suffix: x64_RelWithDebInfo
+        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+        ORT_EP_NAME: CUDA
+        EnablePython: false
+        WITH_CACHE: true
+        MachinePool: onnxruntime-Win2022-GPU-A10
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
@@ -43,11 +43,11 @@ stages:
         BuildConfig: 'RelWithDebInfo'
         EnvSetupScript: setup_env.bat
         buildArch: x64
-        additionalBuildFlags: --enable_pybind --use_dml --enable_wcos  --use_winml
+        additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
         RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
         ORT_EP_NAME: DML
         WITH_CACHE: false
-        MachinePool: onnxruntime-Win2022-GPU-dml-A10
+        MachinePool: onnxruntime-Win2022-GPU-dml-A10