microsoft · tianleiwu · Mar 19, 2024 · Mar 18, 2024 · Mar 18, 2024 · Mar 19, 2024
diff --git a/onnxruntime/test/common/cuda_op_test_utils.cc b/onnxruntime/test/common/cuda_op_test_utils.cc
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#ifdef USE_CUDA
+#include "cuda_runtime_api.h"
+#endif
+
+namespace onnxruntime {
+namespace test {
+
+// Returns the architecture of the current CUDA device encoded as 100 * major + 10 * minor,
+// or -1 when USE_CUDA is not defined or the device could not be queried.
+//
+// A successful result is cached so we only call cudaGetDeviceProperties once.
+// Usually, we test on a single GPU or multiple GPUs of same architecture, so it's fine to cache the result.
+// A failed query leaves the cached value at -1, so the next call tries again.
+int GetCudaArchitecture() {
+  static int cuda_arch = -1;
+
+#ifdef USE_CUDA
+  if (cuda_arch == -1) {
+    // No error is raised on any failure below: we just return -1.
+    // If cuda device has issue, test will fail anyway so no need to raise error here.
+    int current_device_id = 0;
+    if (cudaSuccess != cudaGetDevice(&current_device_id)) {
+      // Do not fall through and query device 0: it may not be the device the tests run on.
+      return cuda_arch;
+    }
+
+    // Must wait GPU idle, otherwise cudaGetDeviceProperties might fail.
+    // The status of the synchronize call is deliberately ignored (best effort); the outcome of
+    // cudaGetDeviceProperties below decides whether a value gets cached.
+    cudaDeviceSynchronize();
+
+    cudaDeviceProp prop;
+    if (cudaSuccess == cudaGetDeviceProperties(&prop, current_device_id)) {
+      cuda_arch = prop.major * 100 + prop.minor * 10;
+    }
+  }
+#endif
+
+  return cuda_arch;
+}
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h
@@ -4,37 +4,20 @@
 #pragma once
 
 #include "test/util/include/default_providers.h"
-#ifdef USE_CUDA
-#include "cuda_runtime_api.h"
-#endif
 
 namespace onnxruntime {
 namespace test {
 
+// Returns the compute capability of the current CUDA device encoded as 100 * major + 10 * minor (e.g. 800 for 8.0),
+// or -1 when it cannot be queried or USE_CUDA is not defined. Please call this function after CUDA EP is enabled.
+int GetCudaArchitecture();
+
 inline bool HasCudaEnvironment(int min_cuda_architecture) {
   if (DefaultCudaExecutionProvider().get() == nullptr) {
     return false;
   }
 
-  if (min_cuda_architecture == 0) {
-    return true;
-  }
-
-  int cuda_architecture = 0;
-
-#ifdef USE_CUDA
-  int currentCudaDevice = 0;
-  cudaGetDevice(&currentCudaDevice);
-  cudaDeviceSynchronize();
-  cudaDeviceProp prop;
-  if (cudaSuccess != cudaGetDeviceProperties(&prop, currentCudaDevice)) {
-    return false;
-  }
-
-  cuda_architecture = prop.major * 100 + prop.minor * 10;
-#endif
-
-  return cuda_architecture >= min_cuda_architecture;
+  return GetCudaArchitecture() >= min_cuda_architecture;  // -1 (arch unknown) fails any non-negative minimum
 }
 
 inline bool NeedSkipIfCudaArchLowerThan(int min_cuda_architecture) {

diff --git a/onnxruntime/test/common/trt_op_test_utils.h b/onnxruntime/test/common/trt_op_test_utils.h
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "test/common/cuda_op_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+// TensorRT EP Segmentation fault on A100: https://github.com/microsoft/onnxruntime/issues/19530
+// Returns {kTensorrtExecutionProvider} when DefaultTensorrtExecutionProvider() is non-null and
+// GetCudaArchitecture() reports 800; otherwise returns an empty set.
+inline const std::unordered_set<std::string> ExcludeTrtOnA100() {
+  std::unordered_set<std::string> excluded;
+
+  // Note: GetCudaArchitecture needs USE_CUDA to be defined. Currently, it is defined when TRT EP is enabled.
+  // If TRT EP is ever made independent of CUDA EP, the implementation of GetCudaArchitecture has to change.
+  const bool trt_available = DefaultTensorrtExecutionProvider() != nullptr;
+  if (trt_available && GetCudaArchitecture() == 800) {
+    excluded.insert(kTensorrtExecutionProvider);
+  }
+
+  return excluded;
+}
+
+// Inserts the TensorRT EP into the caller's excluded provider set when DefaultTensorrtExecutionProvider()
+// is non-null and GetCudaArchitecture() reports 800 (A100). The set is modified in place.
+inline const std::unordered_set<std::string>& ExcludeTrtOnA100(std::unordered_set<std::string>& excluded_providers) {
+  const bool trt_available = DefaultTensorrtExecutionProvider() != nullptr;
+  if (trt_available && GetCudaArchitecture() == 800) {
+    excluded_providers.insert(kTensorrtExecutionProvider);
+  }
+
+  // The same set is handed back whether or not anything was inserted.
+  return excluded_providers;
+}
+
+}  // namespace test
+}  // namespace onnxruntime