[QNN EP] Improve QDQ model accuracy tests (microsoft#16916)
### Description
- Improves how unit tests measure the accuracy of QDQ models on QNN EP.
- Adds tests for ops: Add, Mul, Abs<sup>1</sup>, And<sup>1</sup>,
Or<sup>1</sup>, Ceil<sup>1</sup>, Cos<sup>1</sup>

<sup>1</sup>: Not previously supported due to missing node unit
handling.

### Motivation and Context
The new approach for testing QDQ operator accuracy requires running 3
inferences:

1. float model on CPU EP (baseline)
2. qdq model on CPU EP
3. qdq model on QNN EP

The unit tests check that running the QDQ model on QNN EP (3) is at least as accurate (within a small tolerance) as running the QDQ model on CPU EP (2). Accuracy is measured by comparing each result against the float32 baseline (1).

This is essentially what we care about: is QNN EP as accurate as CPU EP? If not, the difference is worth investigating as a potential bug.
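
The check boils down to the comparison sketched below (a simplified illustration, not the actual `TestQDQModelAccuracy` implementation; the helper names and the use of mean absolute error as the metric are assumptions):

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Mean absolute error of a result against the float32 baseline (1).
static float MeanAbsError(const std::vector<float>& baseline, const std::vector<float>& actual) {
  assert(baseline.size() == actual.size());
  float total = 0.0f;
  for (size_t i = 0; i < baseline.size(); ++i) {
    total += std::fabs(baseline[i] - actual[i]);
  }
  return baseline.empty() ? 0.0f : total / static_cast<float>(baseline.size());
}

// Returns true if the QDQ model on QNN EP (3) is at least as accurate as the
// QDQ model on CPU EP (2), within a small tolerance.
static bool QnnIsAtLeastAsAccurate(const std::vector<float>& cpu_f32_output,  // (1) baseline
                                   const std::vector<float>& cpu_qdq_output,  // (2) QDQ on CPU EP
                                   const std::vector<float>& qnn_qdq_output,  // (3) QDQ on QNN EP
                                   float tolerance = 1e-5f) {
  const float cpu_err = MeanAbsError(cpu_f32_output, cpu_qdq_output);
  const float qnn_err = MeanAbsError(cpu_f32_output, qnn_qdq_output);
  return qnn_err <= cpu_err + tolerance;
}
```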
adrianlizarraga authored and kleiti committed Mar 22, 2024
1 parent 51e216f commit 97a9bd2
Showing 24 changed files with 2,234 additions and 1,589 deletions.
@@ -64,10 +64,13 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() {
{"Atan", {}},
{"Asin", {}},
{"Sin", {}},
{"Cos", {}},
{"Sign", {}},
{"Tanh", {}},
{"Exp", {}},
{"LRN", {}}};
{"LRN", {}},
{"Ceil", {}},
{"Abs", {}}};
}
static const OpVersionsAndSelector::OpVersionsMap GetBinaryOpVersionsMap() {
return {{"Add", {}},
22 changes: 8 additions & 14 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -145,34 +145,28 @@ bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapp
if (it != node_unit_supported_result.cend()) {
return it->second;
} else {
// quantized required, filter out the non-quantized nodes, filter in the QDQ nodes
auto IsQdqNode = [](const NodeUnit& node_unit) {
if ("QuantizeLinear" == node_unit.OpType() || "DequantizeLinear" == node_unit.OpType()) {
return true;
} else {
return false;
}
};
const std::string& op_type = node_unit.OpType();
const bool is_qdq_node = op_type == "QuantizeLinear" || op_type == "DequantizeLinear";

// Is NPU backend, is single node, case by case
// Q/DQ nodes -- supported
// Transpose nodes -- supported
// Cast nodes -- need to call CastOpBuilder::IsOpSupported
if (is_npu_backend && NodeUnit::Type::SingleNode == node_unit.UnitType()) {
if (IsQdqNode(node_unit)) { // Qnn has Quantize & Dequantize Op
if (is_qdq_node) { // Qnn has Quantize & Dequantize Op
LOGS(logger, VERBOSE) << "Single Q/DQ node is supported for NPU backend. Node name: " << node_unit.Name();
return true;
}

// Tranpose only changes the data layout. NPU still supports it.
if ("Transpose" == node_unit.OpType()) {
if ("Transpose" == op_type) {
LOGS(logger, VERBOSE) << "Single Transpose node is supported for NPU backend. Node name: " << node_unit.Name();
return true;
}

// For Cast, need to call IsOpSupported (below) to validate input and output types.
// For Cast, And, and Or, we need to call IsOpSupported (below) to validate input and output types.
// For other single non-qdq nodes, immediately return not supported.
if (node_unit.OpType() != "Cast") {
if (op_type != "Cast" && op_type != "And" && op_type != "Or") {
LOGS(logger, WARNING) << "Non-QDQ " << node_unit.OpType()
<< " operators are not supported on HTP or DSP backends. " << node_unit.OpType()
<< " node `" << node_unit.Name() << " will not be assigned to QNN EP.";
@@ -181,14 +175,14 @@ bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapp
}

// Non-NPU backend, quantized model not supported, but a QDQ node encountered
if (!is_npu_backend && IsQdqNode(node_unit)) {
if (!is_npu_backend && is_qdq_node) {
LOGS(logger, ERROR) << "QDQ models are only supported on HTP or DSP backends. "
<< node_unit.OpType() << " node `" << node_unit.Name() << "` will not be assigned to QNN EP.";
return false;
}

bool supported = false;
const auto* op_builder = qnn::GetOpBuilder(node_unit.OpType());
const auto* op_builder = qnn::GetOpBuilder(op_type);
if (op_builder == nullptr) {
LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP."
<< node_unit.OpType() << " node `" << node_unit.Name()
9 changes: 9 additions & 0 deletions onnxruntime/test/optimizer/graph_transform_test_builder.h
@@ -219,6 +219,15 @@ class ModelTestBuilder {
return &graph_.GetOrCreateNodeArg(name, nullptr);
}

NodeArg* MakeRandInitializerBool(const std::vector<int64_t>& shape) {
std::vector<uint8_t> data_uint8 = rand_gen_.Uniform<uint8_t>(shape, 0, 1);
std::vector<bool> data;
for (uint8_t x : data_uint8) {
data.push_back(x != 0);
}
return MakeInitializerBool(shape, data);
}

template <typename T>
NodeArg* MakeInitializer(const std::vector<int64_t>& shape, T min, T max) {
return MakeInitializer<T>(shape, rand_gen_.Uniform<T>(shape, min, max));
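For reference, here is a hypothetical sketch of how the new `MakeRandInitializerBool` helper could feed a Boolean op test. Only builder methods that appear in this diff are used; the exact wiring is illustrative and is not the code added by this commit.

```cpp
// Build a tiny graph with a single "And" node whose inputs are random
// Boolean initializers created by the new helper.
auto build_bool_and_case = [](ModelTestBuilder& builder) {
  auto* input_a = builder.MakeRandInitializerBool({1, 2, 2});  // random {0, 1} values as bool
  auto* input_b = builder.MakeRandInitializerBool({1, 2, 2});
  auto* output = builder.MakeOutput();
  builder.AddNode("And", {input_a, input_b}, {output});
};
```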
96 changes: 0 additions & 96 deletions onnxruntime/test/optimizer/qdq_test_utils.h
@@ -91,102 +91,6 @@ GetQDQTestCaseFn BuildQDQConvTransposeTestCase(const std::vector<int64_t>& input
};
}

// Creates the following graph:
// _______________________
// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32)
// axes (int32, initializer) -> | Gather |
// |_______________________|
//
template <typename QuantType, typename IndicesType>
GetQDQTestCaseFn BuildQDQGatherOpTestCase(const std::vector<int64_t>& input_shape,
const std::vector<IndicesType> indices,
const std::vector<int64_t>& indices_shape,
int64_t axis) {
return [input_shape, indices, indices_shape, axis](ModelTestBuilder& builder) {
auto* input_data = builder.MakeInput<float>(input_shape, -1.0f, 1.0f);
auto* final_output = builder.MakeOutput();

// input_data -> Q/DQ ->
auto* input_qdq_output = AddQDQNodePair<QuantType>(builder, input_data, .003f, 1);

auto* indices_input = builder.MakeInitializer<IndicesType>(indices_shape, indices);

auto* gather_output = builder.MakeIntermediate();
Node& gather_node = builder.AddNode("Gather", {input_qdq_output, indices_input}, {gather_output});
gather_node.AddAttribute("axis", axis);

// -> Q/DQ -> final_output
auto* q_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<QuantType>(gather_output, .003f, 1,
q_output);

builder.AddDequantizeLinearNode<QuantType>(q_output, .003f, 1,
final_output);
};
}

// Creates the following graph:
// _______________________
// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32)
// axes (int32, initializer) -> | Gather |
// |_______________________|
//
template <typename QuantType, typename IndicesType>
GetQDQTestCaseFn BuildQDQGatherOpScalarIndicesTestCase(const std::vector<int64_t>& input_shape,
const IndicesType indices,
int64_t axis) {
return [input_shape, indices, axis](ModelTestBuilder& builder) {
auto* input_data = builder.MakeInput<float>(input_shape, -1.0f, 1.0f);
auto* final_output = builder.MakeOutput();

// input_data -> Q/DQ ->
auto* input_qdq_output = AddQDQNodePair<QuantType>(builder, input_data, .003f, 1);

auto* indices_input = builder.MakeScalarInitializer<IndicesType>(indices);

auto* gather_output = builder.MakeIntermediate();
Node& gather_node = builder.AddNode("Gather", {input_qdq_output, indices_input}, {gather_output});
gather_node.AddAttribute("axis", axis);

// -> Q/DQ -> final_output
auto* q_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<QuantType>(gather_output, .003f, 1,
q_output);

builder.AddDequantizeLinearNode<QuantType>(q_output, .003f, 1,
final_output);
};
}

// Creates the following graph:
// _______________________
// | |
// input (f32) -> Q -> DQ -> | LeakyRelu | -> Q -> DQ -> output (f32)
// |_______________________|
//
template <typename QuantType>
GetQDQTestCaseFn BuildQDQLeakyReluOpTestCase(const std::vector<int64_t>& input_shape) {
return [input_shape](ModelTestBuilder& builder) {
auto* input_data = builder.MakeInput<float>(input_shape, -1.0f, 1.0f);
auto* final_output = builder.MakeOutput();

// input_data -> Q/DQ ->
auto* input_qdq_output = AddQDQNodePair<QuantType>(builder, input_data, 0.0473f, 137);

auto* leakyrelu_output = builder.MakeIntermediate();
Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input_qdq_output}, {leakyrelu_output});
leakyrelu_node.AddAttribute("alpha", 0.2f);

// -> Q/DQ -> final_output
auto* q_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<QuantType>(leakyrelu_output, 0.02696f, 48,
q_output);

builder.AddDequantizeLinearNode<QuantType>(q_output, 0.02696f, 48,
final_output);
};
}

template <typename InputType, typename WeightType, typename BiasType, typename OutputType>
GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape) {
return [input_shape, weights_shape](ModelTestBuilder& builder) {
32 changes: 21 additions & 11 deletions onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
@@ -20,21 +20,29 @@ static GetTestModelFn BuildArgMxxTestCase(const std::string& op_type, TestInputD
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
return [op_type, input_def, attrs](ModelTestBuilder& builder) {
auto* input = MakeTestInput(builder, input_def);
auto* output = builder.MakeOutput();

Node& argm_node = builder.AddNode(op_type, {input}, {output});
auto* argm_output = builder.MakeIntermediate();
Node& argm_node = builder.AddNode(op_type, {input}, {argm_output});
for (const auto& attr : attrs) {
argm_node.AddAttributeProto(attr);
}

// Add cast to uint32
auto* output = builder.MakeOutput();
Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
};
}

// Builds a QDQ model with ArgMin/ArgMax and a Cast to uint32. The quantization parameters are computed from the provided
// input definition.
template <typename QType = uint8_t>
static GetTestModelFn BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
return [op_type, input_def, attrs](ModelTestBuilder& builder) {
static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
return [op_type, input_def, attrs](ModelTestBuilder& builder,
std::vector<QuantParams<QType>>& output_qparams) {
ORT_UNUSED_PARAMETER(output_qparams);
QuantParams<QType> input_qparams = GetTestInputQuantParams(input_def);

auto* input = MakeTestInput(builder, input_def);
@@ -75,8 +83,8 @@ static void RunCPUArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
expected_ep_assignment);
}

// Runs an ArgMax/ArgMin model on the QNN CPU backend. Checks the graph node assignment, and that inference
// outputs for QNN EP and CPU EP match.
// Runs a QDQ ArgMax/ArgMin model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment, and that inference
// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
template <typename QType = uint8_t>
static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
@@ -90,10 +98,12 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
provider_options["backend_path"] = "libQnnHtp.so";
#endif

RunQnnModelTest(BuildQDQArgMxxTestCase(op_type, input_def, attrs),
provider_options,
opset,
expected_ep_assignment);
TestQDQModelAccuracy(BuildArgMxxTestCase(op_type, input_def, attrs), // baseline float32 model
BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model
provider_options,
opset,
expected_ep_assignment,
1e-5f);
}

//