diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
index 1cd3532846114..052dd05574ab1 100644
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@@ -5,7 +5,9 @@
 #include <stdint.h>
 #include <vector>
 #include <mutex>
+#include <limits>
 #include <assert.h>
+#include <cmath>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include "core/providers/cuda/cuda_common.h"
@@ -345,9 +347,29 @@ __device__ __inline__ half _Pow(half a, half b) { return half(powf((float)a, (float)b)); }
 
 template <typename T>
 __device__ __inline__ T _Min(T a, T b) { return a < b ? a : b; }
 
+template <>
+__device__ __inline__ float _Min(float a, float b) {
+  return (isnan(a) || isnan(b)) ? std::numeric_limits<float>::quiet_NaN() : ( a < b ? a : b );
+}
+
+template <>
+__device__ __inline__ double _Min(double a, double b) {
+  return (isnan(a) || isnan(b)) ? std::numeric_limits<double>::quiet_NaN() : ( a < b ? a : b );
+}
+
 template <typename T>
 __device__ __inline__ T _Max(T a, T b) { return a > b ? a : b; }
 
+template <>
+__device__ __inline__ float _Max(float a, float b) {
+  return (isnan(a) || isnan(b)) ? std::numeric_limits<float>::quiet_NaN() : ( a > b ? a : b );
+}
+
+template <>
+__device__ __inline__ double _Max(double a, double b) {
+  return (isnan(a) || isnan(b)) ? std::numeric_limits<double>::quiet_NaN() : ( a > b ? a : b );
+}
+
 template <typename T>
 __device__ __inline__ T _Abs(T a) { return a > (T)0 ? a : -a; }
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index c73dfcbce1b53..c02486a2ec26f 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -9,6 +9,7 @@
 #include "test/common/trt_op_test_utils.h"
 #include "core/util/math.h"
 #include <algorithm>
+#include <limits>
 #include <math.h>
 
 namespace onnxruntime {
@@ -1508,6 +1509,34 @@ TEST(MathOpTest, Min_12_Float_2_Input) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
 
+TEST(MathOpTest, Min_12_Float_Nan) {
+  OpTester test("Min", 12);
+  test.AddInput<float>("data_2", {3, 3},
+                       {std::numeric_limits<float>::quiet_NaN(),
+                        std::numeric_limits<float>::quiet_NaN(),
+                        std::numeric_limits<float>::quiet_NaN(),
+                        -0.5f, 0.0f, -2.0f,
+                        0.5f, 0.0f, 2.0f});
+  test.AddInput<float>("data_1", {3, 1},
+                       {0.0f, -1.0f, 1.0f});
+  test.AddOutput<float>("min", {3, 3},
+                        {std::numeric_limits<float>::quiet_NaN(),
+                         std::numeric_limits<float>::quiet_NaN(),
+                         std::numeric_limits<float>::quiet_NaN(),
+                         -1.0f, -1.0f, -2.0f,
+                         0.5f, 0.0f, 1.0f});
+  if (nullptr != DefaultCpuExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+  if (nullptr != DefaultCudaExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+}
+
 TEST(MathOpTest, Min_12_Double) {
   OpTester test("Min", 12);
   test.AddInput<double>("data_0", {1, 3},
@@ -1525,6 +1554,34 @@ TEST(MathOpTest, Min_12_Double) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
 
+TEST(MathOpTest, Min_12_Double_Nan) {
+  OpTester test("Min", 12);
+  test.AddInput<double>("data_2", {3, 3},
+                        {std::numeric_limits<double>::quiet_NaN(),
+                         std::numeric_limits<double>::quiet_NaN(),
+                         std::numeric_limits<double>::quiet_NaN(),
+                         -0.5, 0.0, -2.0,
+                         0.5, 0.0, 2.0});
+  test.AddInput<double>("data_1", {3, 1},
+                        {0.0, -1.0, 1.0});
+  test.AddOutput<double>("min", {3, 3},
+                         {std::numeric_limits<double>::quiet_NaN(),
+                          std::numeric_limits<double>::quiet_NaN(),
+                          std::numeric_limits<double>::quiet_NaN(),
+                          -1.0, -1.0, -2.0,
+                          0.5, 0.0, 1.0});
+  if (nullptr != DefaultCpuExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+  if (nullptr != DefaultCudaExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+}
+
 TEST(MathOpTest, Min_12_Int32) {
   OpTester test("Min", 12);
   test.AddInput<int32_t>("data_0", {1, 3},
@@ -1631,6 +1688,7 @@ TEST(MathOpTest, Min_12_MLFLoat16_Scalar1) {
                          MakeMLFloat16({-10.f, -10.f, -10.f}));
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
+
 TEST(MathOpTest, Max_6) {
   OpTester test("Max", 6);
   std::vector<int64_t> dims{3, 3};
@@ -1719,6 +1777,34 @@ TEST(MathOpTest, Max_12_Float) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
 
+TEST(MathOpTest, Max_12_Float_Nan) {
+  OpTester test("Max", 12);
+  test.AddInput<float>("data_2", {3, 3},
+                       {std::numeric_limits<float>::quiet_NaN(),
+                        std::numeric_limits<float>::quiet_NaN(),
+                        std::numeric_limits<float>::quiet_NaN(),
+                        -0.5f, 0.0f, -2.0f,
+                        0.5f, 0.0f, 2.0f});
+  test.AddInput<float>("data_1", {3, 1},
+                       {0.0f, -1.0f, 1.0f});
+  test.AddOutput<float>("max", {3, 3},
+                        {std::numeric_limits<float>::quiet_NaN(),
+                         std::numeric_limits<float>::quiet_NaN(),
+                         std::numeric_limits<float>::quiet_NaN(),
+                         -0.5f, 0.0f, -1.0f,
+                         1.0f, 1.0f, 2.0f});
+  if (nullptr != DefaultCpuExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+  if (nullptr != DefaultCudaExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+}
+
 TEST(MathOpTest, Max_12_Double) {
   OpTester test("Max", 12);
   test.AddInput<double>("data_0", {1, 3},
@@ -1736,6 +1822,34 @@ TEST(MathOpTest, Max_12_Double) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
 
+TEST(MathOpTest, Max_12_Double_Nan) {
+  OpTester test("Max", 12);
+  test.AddInput<double>("data_2", {3, 3},
+                        {std::numeric_limits<double>::quiet_NaN(),
+                         std::numeric_limits<double>::quiet_NaN(),
+                         std::numeric_limits<double>::quiet_NaN(),
+                         -0.5, 0.0, -2.0,
+                         0.5, 0.0, 2.0});
+  test.AddInput<double>("data_1", {3, 1},
+                        {0.0, -1.0, 1.0});
+  test.AddOutput<double>("max", {3, 3},
+                         {std::numeric_limits<double>::quiet_NaN(),
+                          std::numeric_limits<double>::quiet_NaN(),
+                          std::numeric_limits<double>::quiet_NaN(),
+                          -0.5, 0.0, -1.0,
+                          1.0, 1.0, 2.0});
+  if (nullptr != DefaultCpuExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+  if (nullptr != DefaultCudaExecutionProvider().get()) {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+}
+
 TEST(MathOpTest, Max_12_Int32) {
   OpTester test("Max", 12);
   test.AddInput<int32_t>("data_0", {1, 3},
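Note (not part of the patch): the new `_Min`/`_Max` specializations make the CUDA kernels propagate NaN for float and double, which is the behavior the new Min_12_*_Nan/Max_12_*_Nan tests pin down on both the CPU and CUDA execution providers. The explicit isnan check is needed because a bare `a < b ? a : b` returns `b` whenever either operand is NaN (every comparison against NaN is false), so a NaN in `a` would be silently dropped; `fminf`/`fmaxf` would likewise return the non-NaN operand. The following is a minimal host-side C++ sketch of the same semantics; `nan_propagating_min` is a hypothetical name used only for illustration, not an ONNX Runtime API.

// Host-side sketch (illustrative only; not ONNX Runtime code) of the
// NaN-propagating min adopted by the CUDA specializations above:
// if either operand is NaN, the result is NaN.
#include <cassert>
#include <cmath>
#include <limits>

template <typename T>
T nan_propagating_min(T a, T b) {  // hypothetical helper, for illustration
  return (std::isnan(a) || std::isnan(b)) ? std::numeric_limits<T>::quiet_NaN()
                                          : (a < b ? a : b);
}

int main() {
  const float qnan = std::numeric_limits<float>::quiet_NaN();
  assert(std::isnan(nan_propagating_min(qnan, 1.0f)));  // NaN in either slot
  assert(std::isnan(nan_propagating_min(1.0f, qnan)));  // propagates through
  assert(nan_propagating_min(-0.5f, 0.0f) == -0.5f);    // normal path intact
  assert(std::fmin(qnan, 1.0f) == 1.0f);  // by contrast, fmin suppresses NaN
  return 0;
}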