diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 5b958c686d007..f492a4cbbcb4b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -61,6 +61,8 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) { return QuantizedOpType::QLinearMatMul; else if (op_type == "QLinearAdd") return QuantizedOpType::QLinearAdd; + else if (op_type == "QLinearMul") + return QuantizedOpType::QLinearMul; else if (op_type == "QLinearSigmoid") return QuantizedOpType::QLinearSigmoid; else if (op_type == "QLinearAveragePool") @@ -72,6 +74,10 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) { return QuantizedOpType::QDQResize; else if (op_type == "AveragePool") return QuantizedOpType::QDQAveragePool; + else if (op_type == "Add") + return QuantizedOpType::QDQAdd; + else if (op_type == "Mul") + return QuantizedOpType::QDQMul; } else { // throw? // Do we want to throw here? seems got neglected last time @@ -114,25 +120,13 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type) { bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) { return quant_op_type == QuantizedOpType::QLinearMatMul || quant_op_type == QuantizedOpType::QLinearAdd || + quant_op_type == QuantizedOpType::QLinearMul || + quant_op_type == QuantizedOpType::QDQAdd || + quant_op_type == QuantizedOpType::QDQMul || IsQuantizedConv(quant_op_type); } -bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) { - int32_t input_type; - if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) - return false; - - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() - << "] Input type: [" << input_type - << "] is not supported for now"; - return false; - } - - return true; -} - -bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { +bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) { auto quant_op_type = GetQuantizedOpType(node_unit); int32_t a_input_type, b_input_type; if (!IsQuantizedBinaryOp(quant_op_type)) { @@ -146,16 +140,17 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { if (!GetType(inputs[1].node_arg, b_input_type)) return false; - // QlinearConv supports u8u8 or u8s8 - // QLinearMatMul/Add only support u8u8 - bool is_quant_conv = IsQuantizedConv(quant_op_type); + // QlinearConv/MatMul supports u8u8 or u8s8 + // QLinearAdd/QLinearMul only support u8u8 + bool is_quant_conv_or_matmul = IsQuantizedConv(quant_op_type) || (quant_op_type == QuantizedOpType::QLinearMatMul); + bool has_valid_qlinear_conv_weight = (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 || b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8); if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || - (!is_quant_conv && a_input_type != b_input_type) || - (is_quant_conv && !has_valid_qlinear_conv_weight)) { + (!is_quant_conv_or_matmul && a_input_type != b_input_type) || + (is_quant_conv_or_matmul && !has_valid_qlinear_conv_weight)) { LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] A Input type: [" << a_input_type << "] B Input type: [" << b_input_type @@ -166,182 +161,6 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { return true; } -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, const OpSupportCheckParams& 
params, bool is_input) { - const auto& op_type = node_unit.OpType(); - auto quant_op_type = GetQuantizedOpType(node_unit); - bool is_quant_conv = IsQuantizedConv(quant_op_type); - bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); - for (const auto idx : indices) { - if (idx >= io_defs.size()) { - LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index, " << idx - << " >= size, " << io_defs.size() - << " of NodeUnit: " << node_unit.Name(); - return false; - } - - const auto& io_def = io_defs[idx]; - if (!io_def.quant_param.has_value()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " has no quant_param"; - return false; - } - - const auto scale_name = io_def.quant_param->scale.Name(); - - if (!Contains(initializers, scale_name)) { - LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; - return false; - } - - // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) - bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; - bool is_conv_matmul_u8s8_weight = false; - - if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - const auto& scale_tensor = *initializers.at(scale_name); - int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; - if (!is_conv_matmul_u8s8_weight) { - if (scales_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } else if (scales_dim != 1) { - // For u8s8 Qlinear[Conv/MatMul], we support - // 1. Per-tensor, the weight will be transformed to uint8 later - // 2. Per-channel, only from Android API level 29 - if (is_quant_matmul) { - LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; - return false; - } - - if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { - LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " - << "system NNAPI feature level: " << params.android_feature_level; - return false; - } - - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - if (weight_tensor.dims()[0] != scales_dim) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " scale dimension " << scales_dim; - return false; - } - } - } - - return true; -} - -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, bool is_input) { - const auto& op_type = node_unit.OpType(); - auto quant_op_type = GetQuantizedOpType(node_unit); - bool is_quant_conv = IsQuantizedConv(quant_op_type); - bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); - - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); - for (const auto idx : indices) { - if (idx >= io_defs.size()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, " - << (is_input ? 
"Input" : "Output") << " index, " << idx - << " >= size, " << io_defs.size(); - return false; - } - - const auto& io_def = io_defs[idx]; - if (!io_def.quant_param.has_value()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " has no quant_param"; - return false; - } - - // zero point is optional here - if (!io_def.quant_param->zero_point) - return true; - - const auto& zero_point_name = io_def.quant_param->zero_point->Name(); - if (!Contains(initializers, zero_point_name)) { - LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; - return false; - } - - bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; - bool is_conv_matmul_u8s8_weight = false; - - if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - const auto& zero_tensor = *initializers.at(zero_point_name); - int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; - - if (!is_conv_matmul_u8s8_weight) { - if (zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } else { - // For u8s8 Qlinear[Conv/MatMul], we support - // 1. Per-tensor, the weight will be transformed to uint8 later - // 2. Per-channel, only from Android API level 29 - if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " - << "actual zero point type: [" << zero_tensor.data_type() << "]"; - return false; - } - - if (zero_dim != 1) { - if (is_quant_matmul) { - LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; - return false; - } - } - - // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, - // or a tensor with same channel as weight, for NNAPI we only support it be - // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel - // quantization is 0 there is no input for it - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " zero point dimension " << zero_dim; - return false; - } - - std::vector unpacked_tensor; - auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name - << ", error msg: " << status.ErrorMessage(); - return false; - } - - // Verify all onnx weight zero point(s) are 0(s) - const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); - for (size_t i = 0; i < unpacked_tensor.size(); i++) { - if (zero_points[i] != 0) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " - << "zero_points[" << i << "] has value: " << zero_points[i]; - return false; - } - } - } - } - - return true; -} - common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point) { @@ -387,8 +206,8 @@ 
common::Status GetQuantizationScaleAndZeroPoint( common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, - float& scale, int32_t& zero_point, bool is_input) { - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); + float& scale, int32_t& zero_point, IOKind io_kind) { + const auto& io_defs = io_kind == IOKind::Input ? node_unit.Inputs() : node_unit.Outputs(); for (const auto& io_def : io_defs) { if (io_def.node_arg.Name() == name) return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.ModelPath(), diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 2d99bce246184..73ea329d0b31b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -82,12 +82,14 @@ enum class QuantizedOpType : uint8_t { QLinearAdd, QLinearSigmoid, QLinearAveragePool, + QLinearMul, // Not yet supported - // QLinearMul, // QLinearReduceMean, QDQConv, QDQResize, QDQAveragePool, + QDQAdd, + QDQMul, // TODO, add other QDQ NodeUnit types }; @@ -97,6 +99,11 @@ enum class ConvType : uint8_t { Grouped, }; +enum class IOKind : uint8_t { + Input, + Output, +}; + QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit); // Return the type of the conv ops, @@ -113,18 +120,8 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type); // Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,... bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type); -// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool] -bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit); // Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add] -bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit); - -// Check if a qlinear op has valid scales for given indices -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, const OpSupportCheckParams& params, bool is_input); - -// Check if a qlinear op has valid zero points for given indices -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, bool is_input); +bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit); common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, @@ -132,7 +129,7 @@ common::Status GetQuantizationScaleAndZeroPoint( common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, - float& scale, int32_t& zero_point, bool is_input = true); + float& scale, int32_t& zero_point, IOKind io_kind = IOKind::Input); // Get Shape/Type of a NodeArg // TODO, move to shared_utils diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 0037e1a1d487d..3f95048335358 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -210,7 +210,7 @@ static Status GetInputDataType( // TODO, verify the scale and zero point match if there are multiple op using same input const auto* node_unit = 
all_quantized_op_inputs.at(name)[0]; ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( - initializers, *node_unit, name, scale, zero_point, true /* is_input */)); + initializers, *node_unit, name, scale, zero_point, IOKind::Input)); break; } // case ONNX_NAMESPACE::TensorProto_DataType_INT8: diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 8a61e2a105127..47e8fe2eef749 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -452,7 +452,7 @@ static Status HandleAutoPad(const Shape& input_shape, } // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) -// QLinearConv, QLinearMatmul, QLinearAdd +// QLinearConv, QLinearMatmul, QLinearAdd, QLinearMul // a, b are inputs, and y is output static Status GetBinaryOpQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, @@ -656,8 +656,11 @@ class BinaryOpBuilder : public BaseOpBuilder { }; /* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { - // TODO, add support for QDQ NodeUnit - return node_unit.OpType() == "QLinearAdd"; + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QLinearAdd || + quant_type == QuantizedOpType::QLinearMul || + quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; } void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -680,6 +683,7 @@ void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N "Mul", "Div", "QLinearAdd", + "QLinearMul", "Pow", }); } @@ -690,12 +694,12 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int32_t op_code; bool add_activation = true; - bool op_is_qlinear = op_type == "QLinearAdd"; - if (op_type == "Add" || op_is_qlinear) { + bool is_quant_op = IsQuantizedOp(node_unit); + if (op_type == "Add" || op_type == "QLinearAdd") { // Add/QLinearAdd/QDQAdd op_code = ANEURALNETWORKS_ADD; } else if (op_type == "Sub") { op_code = ANEURALNETWORKS_SUB; - } else if (op_type == "Mul") { + } else if (op_type == "Mul" || op_type == "QLinearMul") { // Mul/QLinearMul/QDQMul op_code = ANEURALNETWORKS_MUL; } else if (op_type == "Div") { op_code = ANEURALNETWORKS_DIV; @@ -721,7 +725,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const b_zero_point = 0, y_zero_point = 0; - if (op_is_qlinear) { + if (is_quant_op) { ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint( model_builder.GetInitializerTensors(), node_unit, a_scale, b_scale, y_scale, @@ -729,7 +733,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } // Verify if the scale and zero point matchs from onnx input and nnapi input match - if (op_is_qlinear) { + if (is_quant_op) { ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point)); ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point)); } @@ -2717,6 +2721,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder); NNAPI_EP_ADD_SHARED_OP_BUILDER("Pow", BinaryOpBuilder); NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder); + NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearMul", BinaryOpBuilder); 
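Reviewer note on the BinaryOpBuilder hunks above: QLinearAdd/QLinearMul and their QDQ counterparts are now classified once and share the same lowering path, with only the NNAPI op code differing. The standalone sketch below is illustrative only -- the enum, Classify and ToNnapiOpCode names are local stand-ins for the real GetQuantizedOpType/IsQuantizedOp logic, and the returned integers are placeholders for ANEURALNETWORKS_ADD / ANEURALNETWORKS_MUL, not their actual values.

    // Illustrative sketch of the classification/dispatch shape the patch adds.
    #include <cstdint>
    #include <cstdio>
    #include <string>

    enum class QuantizedBinaryKind : uint8_t { None, QLinearAdd, QLinearMul, QDQAdd, QDQMul };

    // In ORT, is_qdq_group would come from NodeUnit::UnitType() == Type::QDQGroup.
    QuantizedBinaryKind Classify(const std::string& op_type, bool is_qdq_group) {
      if (!is_qdq_group) {
        if (op_type == "QLinearAdd") return QuantizedBinaryKind::QLinearAdd;
        if (op_type == "QLinearMul") return QuantizedBinaryKind::QLinearMul;
      } else {
        if (op_type == "Add") return QuantizedBinaryKind::QDQAdd;
        if (op_type == "Mul") return QuantizedBinaryKind::QDQMul;
      }
      return QuantizedBinaryKind::None;
    }

    // Both Add variants map to one NNAPI op code and both Mul variants to another;
    // the real code returns ANEURALNETWORKS_ADD / ANEURALNETWORKS_MUL here.
    int32_t ToNnapiOpCode(QuantizedBinaryKind kind) {
      switch (kind) {
        case QuantizedBinaryKind::QLinearAdd:
        case QuantizedBinaryKind::QDQAdd: return 0;   // placeholder for ANEURALNETWORKS_ADD
        case QuantizedBinaryKind::QLinearMul:
        case QuantizedBinaryKind::QDQMul: return 1;   // placeholder for ANEURALNETWORKS_MUL
        default: return -1;                           // not a supported quantized binary op
      }
    }

    int main() {
      std::printf("%d\n", ToNnapiOpCode(Classify("Mul", /*is_qdq_group*/ true)));  // QDQMul path
      return 0;
    }
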
NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 950fb302f1b53..01b51fed399f6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -22,7 +22,21 @@ struct OpSupportCheckerRegistrations { std::unordered_map op_support_checker_map; }; -bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) { +template +void CreateSharedOpSupportCheckerImpl(const std::string& op_type, + OpSupportCheckerRegistrations& op_registrations, + const std::vector& op_types) { + // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations + if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend()) + return; + + op_registrations.support_checkers.push_back(std::make_unique()); + for (const auto& op : op_types) { + op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get()); + } +} + +static bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) { const auto is_ext_initializer = [&](const NodeArg& node_arg) { const auto& input_name(node_arg.Name()); @@ -58,18 +72,200 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node return false; } -template -void CreateSharedOpSupportCheckerImpl(const std::string& op_type, - OpSupportCheckerRegistrations& op_registrations, - const std::vector& op_types) { - // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations - if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend()) - return; +static bool IsQuantizationScaleSupported(const InitializedTensorSet& initializers, + const NodeUnitIODef& io_def, + const OpSupportCheckParams& params, + const std::string& op_type, + bool is_quant_matmul, + bool is_conv_matmul_u8s8_weight) { + const auto scale_name = io_def.quant_param->scale.Name(); + auto it = initializers.find(scale_name); + if (it == initializers.cend()) { + LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; + return false; + } + + const auto& scale_tensor = *it->second; + int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; + if (!is_conv_matmul_u8s8_weight) { + if (scales_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else if (scales_dim != 1) { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. 
Per-channel, only from Android API level 29 + if (is_quant_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } - op_registrations.support_checkers.push_back(std::make_unique()); - for (const auto& op : op_types) { - op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get()); + if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { + LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " + << "system NNAPI feature level: " << params.android_feature_level; + return false; + } + + Shape weight_shape; + if (!GetShape(io_def.node_arg, weight_shape)) + return false; + + if (weight_shape[0] != scales_dim) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_shape[0] + << " scale dimension " << scales_dim; + return false; + } + } + + return true; +} + +static bool IsQuantizationZeroPointSupported(const InitializedTensorSet& initializers, + const NodeUnitIODef& io_def, + const std::string& op_type, + const Path& model_path, + bool is_quant_matmul, + bool is_conv_matmul_u8s8_weight) { + // zero point is optional here + if (!io_def.quant_param->zero_point) + return true; + + const auto& zero_point_name = io_def.quant_param->zero_point->Name(); + if (!Contains(initializers, zero_point_name)) { + LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; + return false; + } + + const auto& zero_tensor = *initializers.at(zero_point_name); + int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; + + if (!is_conv_matmul_u8s8_weight) { + if (zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. 
Per-channel, only from Android API level 29 + if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " + << "actual zero point type: [" << zero_tensor.data_type() << "]"; + return false; + } + + if (zero_dim != 1) { + if (is_quant_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } + } + + // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, + // or a tensor with same channel as weight, for NNAPI we only support it be + // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel + // quantization is 0 there is no input for it + Shape weight_shape; + if (!GetShape(io_def.node_arg, weight_shape)) + return false; + + if (weight_shape[0] != zero_dim && zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_shape[0] + << " zero point dimension " << zero_dim; + return false; + } + + std::vector unpacked_tensor; + auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, model_path, unpacked_tensor); + if (!status.IsOK()) { + LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name + << ", error msg: " << status.ErrorMessage(); + return false; + } + + // Verify all onnx weight zero point(s) are 0(s) + const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); + for (size_t i = 0; i < unpacked_tensor.size(); i++) { + if (zero_points[i] != 0) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " + << "zero_points[" << i << "] has value: " << zero_points[i]; + return false; + } + } } + + return true; +} + +// Check if the given quantized input(s) or output(s) is supported +static bool IsQuantizedIOSupported(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, const OpSupportCheckParams& params, IOKind io_kind) { + const auto& op_type = node_unit.OpType(); + auto quant_op_type = GetQuantizedOpType(node_unit); + + ORT_ENFORCE(quant_op_type != QuantizedOpType::Unknown, "[", op_type, "] is not a quantized op"); + + bool is_input = io_kind == IOKind::Input; + bool is_quant_conv = IsQuantizedConv(quant_op_type); + bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); + const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); + + for (const auto idx : indices) { + if (idx >= io_defs.size()) { + LOGS_DEFAULT(VERBOSE) << (is_input ? 
"Input" : "Output") << " index, " << idx + << " >= size, " << io_defs.size() + << " of NodeUnit: " << node_unit.Name(); + return false; + } + + const auto& io_def = io_defs[idx]; + ORT_ENFORCE(io_def.quant_param.has_value(), "Input index, ", idx, " has no quant_param"); + + // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) + bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; + bool is_conv_matmul_u8s8_weight = false; + + if (is_conv_matmul_weight) { + int32_t weight_type; + if (!GetType(io_def.node_arg, weight_type)) + return false; + is_conv_matmul_u8s8_weight = weight_type == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } + + int32_t input_type; + if (!GetType(io_def.node_arg, input_type)) + return false; + + // We only support u8 for most of the inputs and all outputs, with the exception for Quantized MatMul and Conv, + // which allows s8 weight (u8s8) + // TODO, add support of s8s8 + if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && + !(input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8 && is_conv_matmul_u8s8_weight)) { + LOGS_DEFAULT(VERBOSE) << op_type << "NodeUnit [" << node_unit.Name() + << "], type [" << op_type << "]'s " + << (is_input ? "Input" : "Output") << " index [" << idx + << "] has unsupported type [" << input_type << "]"; + return false; + } + + // Check scale and zero point + if (!IsQuantizationScaleSupported(initializers, io_def, params, op_type, + is_quant_matmul, is_conv_matmul_u8s8_weight)) { + return false; + } + + if (!IsQuantizationZeroPointSupported(initializers, io_def, op_type, node_unit.ModelPath(), + is_quant_matmul, is_conv_matmul_u8s8_weight)) { + return false; + } + } + + return true; } #pragma endregion helpers @@ -100,7 +296,9 @@ class BaseOpSupportChecker : public IOpSupportChecker { return ANEURALNETWORKS_FEATURE_LEVEL_1; } - virtual bool HasSupportedInputsImpl(const NodeUnit& node_unit) const; + virtual bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const; virtual int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const { return 1; } virtual int GetMaxSupportedOpSet(const NodeUnit& /* node_unit */) const { return 15; } @@ -112,7 +310,8 @@ class BaseOpSupportChecker : public IOpSupportChecker { private: bool HasSupportedOpSet(const NodeUnit& node_unit) const; - bool HasSupportedInputs(const NodeUnit& node_unit) const; + bool HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const; }; /* static */ void BaseOpSupportChecker::CreateSharedOpSupportChecker( @@ -138,7 +337,7 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer if (!IsNodeUnitTypeSupported(node_unit)) return false; - if (!HasSupportedInputs(node_unit)) + if (!HasSupportedInputOutputs(initializers, node_unit, params)) return false; // We do not support external initializers for now @@ -151,7 +350,8 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer return IsOpSupportedImpl(initializers, node_unit, params); } -bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const { +bool BaseOpSupportChecker::HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { // We do not support unknown(null) input shape auto has_supported_shape 
= [](const NodeArg& node_arg, const std::string& name, const std::string op_type) { const auto* shape_proto = node_arg.Shape(); @@ -185,10 +385,12 @@ bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const { return false; } } - return HasSupportedInputsImpl(node_unit); + return HasSupportedInputOutputsImpl(initializers, node_unit, params); } -bool BaseOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool BaseOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { // We only check the type of input 0 by default // specific op builder can override this const auto& input = node_unit.Inputs()[0].node_arg; @@ -245,8 +447,13 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker { const OpSupportCheckParams& params) const override; bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; + + bool IsNodeUnitTypeSupported(const NodeUnit& node_unit) const override; + static bool IsQuantizedOp(const NodeUnit& node_unit); }; /* static */ void BinaryOpSupportChecker::CreateSharedOpSupportChecker( @@ -259,10 +466,29 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker { "Mul", "Div", "QLinearAdd", + "QLinearMul", "Pow", }); } +bool BinaryOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const { + if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; + } + + return true; +} + +/* static */ bool BinaryOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QLinearAdd || + quant_type == QuantizedOpType::QLinearMul || + quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; +} + int32_t BinaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel( const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { const auto& op(node_unit.OpType()); @@ -281,21 +507,29 @@ int BinaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) cons const auto& op(node_unit.OpType()); // Add/Sub/Mul/Div/Pow opset 6- has broadcast attributes we do not support now - if (op != "QLinearAdd") + if (op != "QLinearAdd" && op != "QLinearMul") return 7; return 1; } -bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - bool is_qlinear_add = node_unit.OpType() == "QLinearAdd"; +bool BinaryOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { + bool is_quantized_op = IsQuantizedOp(node_unit); bool is_pow = node_unit.OpType() == "Pow"; - if (!is_qlinear_add && !is_pow) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + if (!is_quantized_op && !is_pow) + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); - if (is_qlinear_add) { - // QLinearAdd - if 
(!HasValidBinaryOpQuantizedInputs(node_unit)) + if (is_quantized_op) { + // QLinearAdd/QDQAdd/QLinearMul/QDQMul + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; } @@ -320,11 +554,10 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c return true; } -bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { +bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { const auto& op_type(node_unit.OpType()); const auto& inputs = node_unit.Inputs(); - bool op_is_qlinear = op_type == "QLinearAdd"; Shape input1_shape, input2_shape; if (!GetShape(inputs[0].node_arg, input1_shape) || !GetShape(inputs[1].node_arg, input2_shape)) @@ -339,32 +572,6 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi return false; } - if (op_is_qlinear) { - // For QLinearAdd, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } - return true; } @@ -382,7 +589,9 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker { return ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; }; bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, @@ -401,7 +610,9 @@ bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* return true; } -bool TransposeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool TransposeOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -561,8 +772,10 @@ class PoolOpSupportChecker : public BaseOpSupportChecker { return params.use_nchw ? 
ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; - bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; + bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override; static bool IsQuantizedOp(const NodeUnit& node_unit); }; @@ -579,12 +792,21 @@ class PoolOpSupportChecker : public BaseOpSupportChecker { }); } +bool PoolOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const { + if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QDQAveragePool; + } + + return true; +} + /* static */ bool PoolOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) { return IsQuantizedPool(GetQuantizedOpType(node_unit)); } bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { + const OpSupportCheckParams& /* params */) const { const auto& op_name = node_unit.Name(); const auto& op_type = node_unit.OpType(); const auto& inputs = node_unit.Inputs(); @@ -601,7 +823,8 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } bool is_quant_pool = IsQuantizedOp(node_unit); - if (op_type == "AveragePool" || op_type == "MaxPool" || op_type == "QLinearAveragePool") { + bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool"; + if (is_average_pool || op_type == "MaxPool") { NodeAttrHelper helper(node_unit); const auto count_include_pad = helper.Get("count_include_pad", 0); @@ -642,20 +865,7 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } // We need to check if we have valid scales and zero points for QLinearAveragePool - if (is_quant_pool) { - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - + if (is_average_pool && is_quant_pool) { // NNAPI requires Quantized Average Pool has same scale and zero point for both input and output float input_scale = 0.0f; int32_t input_zp = 0; @@ -697,14 +907,23 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return true; } -bool PoolOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - bool is_max_pool = node_unit.OpType() == "MaxPool"; +bool PoolOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { + const auto& op_type = node_unit.OpType(); bool is_quant_pool = IsQuantizedOp(node_unit); - if (!is_max_pool && !is_quant_pool) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + bool is_max_pool = op_type == "MaxPool"; + bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool"; + bool is_quant_average_pool = 
is_quant_pool && is_average_pool; + if (!is_max_pool && !is_quant_average_pool) + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); + + if (is_quant_average_pool) { + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; - if (is_quant_pool) { - return HasValidUnaryOpQuantizedInputs(node_unit); + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; } // is_max_pool @@ -742,7 +961,9 @@ class ConvOpSupportChecker : public BaseOpSupportChecker { return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } static bool IsQuantizedOp(const NodeUnit& node_unit); }; @@ -761,12 +982,20 @@ class ConvOpSupportChecker : public BaseOpSupportChecker { return IsQuantizedConv(GetQuantizedOpType(node_unit)); } -bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ConvOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { if (!IsQuantizedOp(node_unit)) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); // QLinearConv only supports input of uint8 for now - if (!HasValidBinaryOpQuantizedInputs(node_unit)) + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; return true; @@ -813,34 +1042,10 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } if (is_quant_conv) { - // For QLinearConv, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - if (inputs.size() > 2 && !Contains(initializers, inputs[2].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "Bias of QLinearConv must be known"; return false; } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; } return true; @@ -931,16 +1136,26 @@ class GemmOpSupportChecker : public BaseOpSupportChecker { private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool 
HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; }; -bool GemmOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool GemmOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { if (node_unit.OpType() != "QLinearMatMul") - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); // QLinearMatMul - if (!HasValidBinaryOpQuantizedInputs(node_unit)) + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; return true; @@ -1077,33 +1292,6 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial LOGS_DEFAULT(VERBOSE) << "B of MatMul must be known"; return false; } - - if (is_qlinear_matmul) { - // For QLinearMatMul, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // All scale/zero points are initializer scalars - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } } else { LOGS_DEFAULT(VERBOSE) << "GemmOpSupportChecker, unknown op: " << op_type; } @@ -1127,7 +1315,9 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker { int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; @@ -1176,12 +1366,20 @@ int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit& return ANEURALNETWORKS_FEATURE_LEVEL_1; } -bool UnaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool UnaryOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { // We only need to override input check for QLinearSigmoid if (node_unit.OpType() != "QLinearSigmoid") - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); - return HasValidUnaryOpQuantizedInputs(node_unit); + if 
(!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; + + return true; } // All ops except "Sin" opset 5- uses consumed_inputs attribute which is not supported for now @@ -1195,24 +1393,11 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const } /* static */ bool UnaryOpSupportChecker::IsQuantizedOpSupported( - const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) { + const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) { const auto& op_type = node_unit.OpType(); ORT_ENFORCE(op_type == "QLinearSigmoid"); - const auto& op_name = node_unit.Name(); - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - // NNAPI requires the scale be 1.f/256 and zero point to be 0 // See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180 float output_scale = 0.0f; @@ -1249,7 +1434,9 @@ class ConcatOpSupportChecker : public BaseOpSupportChecker { bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; }; bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, @@ -1268,7 +1455,9 @@ bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* in return true; } -bool ConcatOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ConcatOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -1331,37 +1520,17 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker { private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const override; - int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& /* params */) const override { return ANEURALNETWORKS_FEATURE_LEVEL_3; } -}; -bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() 
- << "] output type: [" << output_type - << "] is not supported for now"; - return false; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override { + return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output); } - - // For QuantizeLinear only output is quantized - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - - return true; -} +}; #pragma endregion @@ -1369,42 +1538,17 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker { private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const override; - int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& /* params */) const override { return ANEURALNETWORKS_FEATURE_LEVEL_1; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; -}; -bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { - // For DequantizeLinear only input is quantized - // Check input scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - return true; -} - -bool DequantizeLinearOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - int32_t input_type; - if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) - return false; - - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() - << "] Input type: [" << input_type - << "] is not supported for now"; - return false; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override { + return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input); } - - return true; -} +}; #pragma endregion @@ -1480,7 +1624,9 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker { // We only support Resize opset 11+ here int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 11; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder }; @@ -1609,33 +1755,6 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi } } - if (IsQuantizedOp(node_unit)) { - // For QDQResize, we only support uint8 output now - // TODO, add int8 support to NNAPI, and maybe move all the output type check into a virtual function - // similar to HasSupportedInputsImpl - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, 
output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[Resize] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } - return true; } @@ -1653,7 +1772,9 @@ int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit& return ANEURALNETWORKS_FEATURE_LEVEL_2; } -bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ResizeOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -1666,6 +1787,14 @@ bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c return false; } + if (IsQuantizedOp(node_unit)) { + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; + } + return true; } @@ -1870,6 +1999,7 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() { NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker); + NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearMul", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker); } diff --git a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc index 8dbc58c12ce74..c63d79806e777 100644 --- a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc @@ -43,61 +43,87 @@ void RunQLinearMathTestFromFloat( const quantization::Params& a_params, const std::vector& b, const std::vector& b_shape_origin, const quantization::Params& b_params, - const quantization::Params& c_params, - bool input_b_is_initializer = false, - bool all_initializer_scale_zero_point = false) { - size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size()); - std::vector a_shape = PrefixingDims(a_shape_origin, number_dims); - std::vector b_shape = PrefixingDims(b_shape_origin, number_dims); - // calc broadcasting shaped - std::vector c_shape(number_dims, 1); - for (size_t axis = 0; axis < number_dims; ++axis) { - if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) { - ORT_THROW("Shapes can not be broadcasted"); + const quantization::Params& c_params) { + const auto run_test = [&](bool input_b_is_initializer, + bool all_initializer_scale_zero_point) { + size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size()); + std::vector a_shape = PrefixingDims(a_shape_origin, number_dims); + std::vector b_shape = PrefixingDims(b_shape_origin, number_dims); + // calc broadcasting 
shaped + std::vector c_shape(number_dims, 1); + for (size_t axis = 0; axis < number_dims; ++axis) { + if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) { + ORT_THROW("Shapes can not be broadcasted"); + } + c_shape[axis] = std::max(a_shape[axis], b_shape[axis]); } - c_shape[axis] = std::max(a_shape[axis], b_shape[axis]); - } - std::vector a_strides, b_strides, c_strides; - auto c_size = CalcStrides(c_shape, c_strides, false); - auto a_size = CalcStrides(a_shape, a_strides, true); - auto b_size = CalcStrides(b_shape, b_strides, true); - if (a_size != static_cast(a.size()) || b_size != static_cast(b.size())) { - ORT_THROW("Input size not match input shape!"); - } - constexpr int qmax = std::numeric_limits::max(); - constexpr int qmin = std::numeric_limits::min(); - - OpTester test(op_name, 1, onnxruntime::kMSDomain); - std::vector a_quantized = QuantizeTestVector(a, a_params); - test.template AddInput("A", a_shape_origin, a_quantized); - test.AddInput("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point); - - std::vector b_quantized = QuantizeTestVector(b, b_params); - test.template AddInput("B", b_shape_origin, b_quantized, input_b_is_initializer); - test.AddInput("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point); - - test.AddInput("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point); - std::vector c(c_size); - for (int64_t offset = 0; offset < c_size; ++offset) { - int64_t remain = offset, a_offset = 0, b_offset = 0; - for (size_t axis = 0; axis < number_dims; ++axis) { - int64_t index = remain / c_strides[axis]; - remain = remain % c_strides[axis]; - a_offset += index * a_strides[axis]; - b_offset += index * b_strides[axis]; + std::vector a_strides, b_strides, c_strides; + auto c_size = CalcStrides(c_shape, c_strides, false); + auto a_size = CalcStrides(a_shape, a_strides, true); + auto b_size = CalcStrides(b_shape, b_strides, true); + if (a_size != static_cast(a.size()) || b_size != static_cast(b.size())) { + ORT_THROW("Input size not match input shape!"); + } + constexpr int qmax = std::numeric_limits::max(); + constexpr int qmin = std::numeric_limits::min(); + + OpTester test(op_name, 1, onnxruntime::kMSDomain); + std::vector a_quantized = QuantizeTestVector(a, a_params); + test.template AddInput("A", a_shape_origin, a_quantized); + test.AddInput("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point); + + std::vector b_quantized = QuantizeTestVector(b, b_params); + test.template AddInput("B", b_shape_origin, b_quantized, input_b_is_initializer); + test.AddInput("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point); + + test.AddInput("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point); + std::vector c(c_size); + for (int64_t offset = 0; offset < c_size; ++offset) { + int64_t remain = offset, a_offset = 0, b_offset = 0; + for (size_t axis = 0; axis < number_dims; 
++axis) { + int64_t index = remain / c_strides[axis]; + remain = remain % c_strides[axis]; + a_offset += index * a_strides[axis]; + b_offset += index * b_strides[axis]; + } + + float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params); + float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params); + c[offset] = clampi(static_cast(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax); } - float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params); - float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params); - c[offset] = clampi(static_cast(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax); - } - test.template AddOutput("C", c_shape, c); + float abs_error = 0.0f; + + // For quantized models, NNAPI's rounding is different from the CPU provider's + // Sometimes the result is within +/-1 of the CPU provider's result + // For ONNX, we use rounding to nearest, ties to even. + // For NNAPI, it uses std::round, which is HALF_AWAY_FROM_ZERO, see + // https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp + // Use 1 as abs_error, which is the smallest possible difference for uint8_t + // + // NOTE: for now the tolerance will only apply if NNAPI is actually used, + // if for any reason the execution falls back to CPU, we still expect an exact match + // See 'void Check(...)' in onnxruntime/test/providers/provider_test_utils.cc +#ifdef USE_NNAPI + abs_error = 1.0f; +#endif + + test.template AddOutput("C", c_shape, c, false /* sort_output */, 0.0f /* rel_error */, abs_error); + + test.Run(); + }; + + run_test(false /* input_b_is_initializer */, false /* all_initializer_scale_zero_point */); - test.Run(); + // NNAPI will require all the scales and zero points to be initializers + run_test(false /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */); + + // We also want to test the case where input B is an initializer + run_test(true /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */); } // total 32 + 31 elements to cover all path @@ -145,22 +171,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorFull) { A, {63}, A_params, B, {63}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {63}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {63}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) { @@ -180,22 +190,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) { A, {3, 3, 7}, A_params, B, {3, 1, 7}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 3, 7}, A_params, - B, {3, 1, 7}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 3, 7}, A_params, - B, {3, 1, 7}, B_params, - C_params, - true /* 
input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) { @@ -212,22 +206,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) { B, {1}, B_params, A, {63}, A_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {1}, B_params, - A, {63}, A_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {1}, B_params, - A, {63}, A_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) { @@ -244,22 +222,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) { B, {3, 1, 1}, B_params, A, {3, 7, 3}, A_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {3, 1, 1}, B_params, - A, {3, 7, 3}, A_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {3, 1, 1}, B_params, - A, {3, 7, 3}, A_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) { @@ -276,22 +238,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) { A, {63}, A_params, B, {1}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {1}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {1}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) { @@ -308,22 +254,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) { A, {3, 7, 3}, A_params, B, {1, 1, 3}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 7, 3}, A_params, - B, {1, 1, 3}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 7, 3}, A_params, - B, {1, 1, 3}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) { diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 7a054c9a64352..fbbb0b8af1028 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -81,10 +81,27 @@ GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector& input_shape, c template GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector& input_shape) { return [input_shape](ModelTestBuilder& builder) { + +#ifdef USE_NNAPI // NNAPI require consistent 
scales/ZPs for DQ -> Pool -> Q + float dq_scale = 0.0038f; + float pool_output_scale = 0.0038f; + float q_scale = 0.0038f; + InputType dq_zp = std::numeric_limits::max() / 2; + InputType pool_output_zp = std::numeric_limits::max() / 2; + InputType q_zp = std::numeric_limits::max() / 2; +#else + float dq_scale = 0.0035f; + float pool_output_scale = 0.0038f; + float q_scale = 0.0039f; + InputType dq_zp = 7; + InputType pool_output_zp = std::numeric_limits::max() / 2; + InputType q_zp = std::numeric_limits::max() / 2; +#endif + auto* input_arg = builder.MakeInput(input_shape, -1.f, 1.f); auto* output_arg = builder.MakeOutput(); // add QDQ + AveragePool - auto* dq_output = AddQDQNodePair(builder, input_arg, .0035f, 7); + auto* dq_output = AddQDQNodePair(builder, input_arg, dq_scale, dq_zp); auto* averagepool_output = builder.MakeIntermediate(); Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output}); std::vector pads((input_shape.size() - 2) * 2, 1); @@ -95,12 +112,12 @@ GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector& input_s // add QDQ output auto* q_output = builder.MakeIntermediate(); builder.AddQuantizeLinearNode(averagepool_output, - .0038f, - std::numeric_limits::max() / 2, + pool_output_scale, + pool_output_zp, q_output); builder.AddDequantizeLinearNode(q_output, - .0039f, - std::numeric_limits::max() / 2, + q_scale, + q_zp, output_arg); }; } @@ -110,5 +127,65 @@ GetQDQTestCaseFn BuildQDQResizeTestCase(const std::vector& input_shape, const std::string& mode = "nearest", const std::string& coordinate_transformation_mode = "half_pixel"); +template +GetQDQTestCaseFn BuildBinaryOpTestCase(const std::vector& input_shape, + const std::string& op_type) { + return [input_shape, op_type](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput(input_shape, -1.f, 1.f); + auto* input2_arg = builder.MakeInput(input_shape, -1.f, 1.f); + auto* output_arg = builder.MakeOutput(); + +#ifdef USE_NNAPI // NNAPI requires consistent scales for DQ -> bin_op_input and bin_op_output -> Q + float q_scale = 0.008f; + float op_input_scale = 0.008f; + float op_output_scale = 0.0076f; + float dq_scale = 0.0076f; +#else + float q_scale = 0.008f; + float op_input_scale = 0.0079f; + float op_output_scale = 0.0076f; + float dq_scale = 0.0078f; +#endif + + // add QDQ 1 + auto* q1_output = builder.MakeIntermediate(); + auto* dq1_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(input1_arg, + q_scale, + std::numeric_limits::max() / 2, + q1_output); + builder.AddDequantizeLinearNode(q1_output, + op_input_scale, + std::numeric_limits::max() / 2, + dq1_output); + + // add QDQ 2 + auto* q2_output = builder.MakeIntermediate(); + auto* dq2_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(input2_arg, + q_scale, + std::numeric_limits::max() / 2, + q2_output); + builder.AddDequantizeLinearNode(q2_output, + op_input_scale, + std::numeric_limits::max() / 2, + dq2_output); + + // add binary operator + auto* binary_op_output = builder.MakeIntermediate(); + builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output}); + + // add QDQ output + auto* q3_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(binary_op_output, + op_output_scale, + std::numeric_limits::max() / 2, + q3_output); + builder.AddDequantizeLinearNode(q3_output, + dq_scale, + std::numeric_limits::max() / 2, + output_arg); + }; +} } // namespace test } // namespace onnxruntime \ No newline at end of file diff --git 
a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 7ea62a2817100..73b07e8adbd6b 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -39,7 +39,7 @@ namespace test { template void QDQTransformerConvTests() { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { - auto check_conv_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value && std::is_same::value && @@ -57,7 +57,7 @@ void QDQTransformerConvTests() { }; TransformerTester(BuildQDQConvTestCase(input_shape, weights_shape), - check_conv_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -136,7 +136,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { builder.AddQuantizeLinearNode(reshape_output, .0039f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -146,7 +146,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset_version); @@ -197,7 +197,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -206,7 +206,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({1, 12, 37}, {32, 12, 5}); @@ -217,7 +217,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { template void QDQTransformerAveragePoolTests() { auto test_case = [&](const std::vector& input_shape) { - auto check_averagepool_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value) { EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -233,7 +233,7 @@ void QDQTransformerAveragePoolTests() { }; TransformerTester(BuildQDQAveragePoolTestCase(input_shape), - check_averagepool_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -266,52 +266,7 @@ TEST(QDQTransformerTests, AveragePool_U8S8) { template void QDQTransformerBinaryOpTests(const std::string& op_type) { auto test_case = [&](const std::vector& input_shape) { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input1_arg = builder.MakeInput(input_shape, -1.f, 1.f); - auto* input2_arg = builder.MakeInput(input_shape, -1.f, 1.f); - auto* output_arg = builder.MakeOutput(); - - // add QDQ 1 - auto* q1_output = builder.MakeIntermediate(); - auto* dq1_output = builder.MakeIntermediate(); - 
builder.AddQuantizeLinearNode(input1_arg, - .004f, - std::numeric_limits::max() / 2, - q1_output); - builder.AddDequantizeLinearNode(q1_output, - .0039f, - std::numeric_limits::max() / 2, - dq1_output); - - // add QDQ 2 - auto* q2_output = builder.MakeIntermediate(); - auto* dq2_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(input2_arg, - .004f, - std::numeric_limits::max() / 2, - q2_output); - builder.AddDequantizeLinearNode(q2_output, - .0039f, - std::numeric_limits::max() / 2, - dq2_output); - - // add binary operator - auto* binary_op_output = builder.MakeIntermediate(); - builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output}); - - // add QDQ output - auto* q3_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(binary_op_output, - .0038f, - std::numeric_limits::max() / 2, - q3_output); - builder.AddDequantizeLinearNode(q3_output, - .0039f, - std::numeric_limits::max() / 2, - output_arg); - }; - - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (std::is_same::value && std::is_same::value) { @@ -327,8 +282,8 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) { } }; - TransformerTester(build_test_case, - check_binary_op_graph, + TransformerTester(BuildBinaryOpTestCase(input_shape, op_type), + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -426,7 +381,7 @@ void QDQTransformerMatMulTests(bool has_output_q) { } }; - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (has_output_q) { if constexpr (std::is_same::value && @@ -459,7 +414,7 @@ void QDQTransformerMatMulTests(bool has_output_q) { }; TransformerTester(build_test_case, - check_binary_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -696,14 +651,14 @@ TEST(QDQTransformerTests, Gather) { builder.AddQuantizeLinearNode(gather_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Gather"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {24, 12}); @@ -728,14 +683,14 @@ TEST(QDQTransformerTests, Transpose) { builder.AddQuantizeLinearNode(transpose_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Transpose"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({2, 13, 12, 37}, {0, 3, 1, 2}); @@ -760,13 +715,13 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) { 
builder.AddQuantizeLinearNode(transpose_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QuantizeLinear"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({2, 13, 12, 37}, {0, 3, 1, 2}); @@ -775,7 +730,7 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) { TEST(QDQTransformerTests, Resize) { auto test_case = [&](const std::vector& input1_shape, const std::vector& sizes_shape) { - auto check_resize_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); @@ -783,7 +738,7 @@ TEST(QDQTransformerTests, Resize) { }; TransformerTester(BuildQDQResizeTestCase(input1_shape, sizes_shape), - check_resize_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -828,7 +783,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) { builder.AddQuantizeLinearNode(resize_output, .003f, 1, output_arg); }; - auto check_qdq_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["Concat"], 1); @@ -836,7 +791,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_qdq_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -867,7 +822,7 @@ TEST(QDQTransformerTests, ResizeReshape) { builder.AddNode("Reshape", {qdq_resize_output, reshape_shape}, {output_arg}); }; - auto check_qdq_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["Reshape"], 1); @@ -875,7 +830,7 @@ TEST(QDQTransformerTests, ResizeReshape) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_qdq_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -905,13 +860,13 @@ TEST(QDQTransformerTests, ArgMax) { argmax_node.AddAttribute("select_last_index", static_cast(select_last_index)); }; - auto check_argmax_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["ArgMax"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_argmax_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, /* opset_version */ 13); @@ -939,14 +894,14 @@ TEST(QDQTransformerTests, QLinearMatMul) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = 
CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearMatMul"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 2); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -970,7 +925,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MatMul"], 1); EXPECT_EQ(op_to_count["QLinearMatMul"], 0); @@ -978,7 +933,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -1006,7 +961,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MatMul"], 1); EXPECT_EQ(op_to_count["QLinearMatMul"], 0); @@ -1014,7 +969,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) { EXPECT_EQ(op_to_count["DequantizeLinear"], 2); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -1043,7 +998,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) { builder.AddNode("MatMul", {dq_output_1, dq_output_2}, {output_arg}); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); @@ -1051,7 +1006,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) { }; TransformerTester(build_test_case, - check_matmul_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1086,7 +1041,7 @@ TEST(QDQTransformerTests, ConvRelu) { builder.AddQuantizeLinearNode(relu_output, .0039f, is_zp_zero ? 
0 : 1, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (is_zp_zero) { EXPECT_EQ(op_to_count["QLinearConv"], 1); @@ -1104,7 +1059,7 @@ TEST(QDQTransformerTests, ConvRelu) { } }; - TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({1, 12, 37}, {32, 12, 5}, true); @@ -1150,7 +1105,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) { builder.AddDequantizeLinearNode(q_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -1160,7 +1115,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1213,7 +1168,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -1223,7 +1178,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1277,7 +1232,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Conv"], 1); EXPECT_EQ(op_to_count["QLinearConv"], 0); @@ -1288,7 +1243,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1325,7 +1280,7 @@ void QDQTransformerLeakyReluTests() { output_arg); }; - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value) { EXPECT_EQ(op_to_count["com.microsoft.QLinearLeakyRelu"], 1); @@ -1341,7 +1296,7 @@ void QDQTransformerLeakyReluTests() { }; TransformerTester(build_test_case, - check_binary_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1401,7 +1356,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["Transpose"], 1); @@ -1410,7 +1365,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + 
check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1461,7 +1416,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -1472,7 +1427,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1512,7 +1467,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["Transpose"], 1); @@ -1521,7 +1476,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1572,7 +1527,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -1583,7 +1538,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1704,7 +1659,7 @@ TEST(QDQTransformerTests, Concat) { } }; - auto check_mp_reshape_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) { + auto check_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (has_input_float || has_input_int8 || has_output_int8) { EXPECT_EQ(op_to_count["com.microsoft.QLinearConcat"], 0); @@ -1716,7 +1671,7 @@ TEST(QDQTransformerTests, Concat) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1763,7 +1718,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) { builder.AddNode("Reshape", {maxpool_output, reshape_shape}, {output_arg}); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MaxPool"], 1); EXPECT_EQ(op_to_count["Reshape"], 1); @@ -1773,7 +1728,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1799,7 +1754,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) { builder.AddQuantizeLinearNode(reshape_output, same_scale ? .004f : .0039f, same_zp ? 
129 : 128, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Reshape"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], same_scale && same_zp ? 1 : 2); @@ -1807,7 +1762,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1833,7 +1788,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) { builder.AddQuantizeLinearNode(transpose_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); EXPECT_EQ(graph_viewer.GetNode(node_topology_list[0])->OpType(), "QuantizeLinear"); @@ -1841,7 +1796,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1866,7 +1821,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) { transpose_node.AddAttribute("perm", perms); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -1875,7 +1830,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1902,7 +1857,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) { transpose_node.AddAttribute("perm", perms); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -1911,7 +1866,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1935,14 +1890,14 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) { builder.AddQuantizeLinearNode(dq_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QuantizeLinear"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 4316ff7a51d9e..834f0a7378275 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -271,7 +271,9 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) { << "No node should be taken by the NNAPI EP"; } 
-static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* test_description) { +static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, + const char* test_description, + const EPVerificationParams& params = EPVerificationParams()) { onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger()); Graph& graph = model.MainGraph(); ModelTestBuilder helper(graph); @@ -286,7 +288,7 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* #if defined(__ANDROID__) RunAndVerifyOutputsWithEP(model_data, "NnapiExecutionProviderTest.TestQDQModel", std::make_unique(0), - helper.feeds_); + helper.feeds_, params); #else // test load only SessionOptions so; @@ -306,7 +308,8 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) { uint8_t /* OutputType */>( {1, 1, 5, 5} /*input_shape*/, {1, 1, 3, 3} /*weights_shape*/), - "nnapi_qdq_test_graph_conv"); + "nnapi_qdq_test_graph_conv", + {true /* verify_entire_graph_use_ep */}); } TEST(NnapiExecutionProviderTest, TestQDQResize) { @@ -316,14 +319,44 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) { {1, 3, 32, 32} /* sizes_data */, "linear" /* mode */, "asymmetric" /* coordinate_transformation_mode */), - "nnapi_qdq_test_graph_resize"); + "nnapi_qdq_test_graph_resize", + {true /* verify_entire_graph_use_ep */}); } TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { + // NNAPI uses different rounding, which may cause a ~1% difference in the result RunQDQModelTest(BuildQDQAveragePoolTestCase( {1, 3, 32, 32} /* input_shape */), - "nnapi_qdq_test_graph_averagepool"); + "nnapi_qdq_test_graph_averagepool", + { + true /* verify_entire_graph_use_ep */, + 1e-2f /* fp32_abs_err */, + }); +} + +TEST(NnapiExecutionProviderTest, TestQDQAdd) { + RunQDQModelTest(BuildBinaryOpTestCase( + {1, 23, 13, 13} /* input_shape */, + "Add" /* op_type */), + "nnapi_qdq_test_graph_add", + {true /* verify_entire_graph_use_ep */}); +} + +TEST(NnapiExecutionProviderTest, TestQDQMul) { + // NNAPI uses different rounding, which may cause a ~1% difference in the result + RunQDQModelTest(BuildBinaryOpTestCase( + {1, 23, 13, 13} /* input_shape */, + "Mul" /* op_type */), + "nnapi_qdq_test_graph_mul", + { + true /* verify_entire_graph_use_ep */, + 1e-2f /* fp32_abs_err */, + }); } #endif // !(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h index 60d2a3e8caba2..50859d826fa2d 100644 --- a/onnxruntime/test/util/include/test_utils.h +++ b/onnxruntime/test/util/include/test_utils.h @@ -15,6 +15,18 @@ class Graph; namespace test { +// struct to hold some verification params for RunAndVerifyOutputsWithEP +struct EPVerificationParams { + // Verify that the entire graph is taken by the EP. + // If this is set to false, verify that at least one node is assigned to 'execution_provider'. + bool verify_entire_graph_use_ep{false}; + + // Some EPs may use different rounding than the ORT CPU EP, which may cause a bigger abs error than + // the default of 1e-5f, especially for scenarios such as [Q -> Quantized op -> DQ]. + // Set this only if necessary. + float fp32_abs_err = 1e-5f; +}; + // return number of nodes in the Graph and any subgraphs that are assigned to the specified execution provider int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type); @@ -23,13 +35,14 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type); void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id, 
std::unique_ptr execution_provider, - const NameMLValMap& feeds); + const NameMLValMap& feeds, + const EPVerificationParams& params = EPVerificationParams()); // helper function that takes in model_data -// used in nnapi qdq model tests void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds); + const NameMLValMap& feeds, + const EPVerificationParams& params = EPVerificationParams()); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc index 220b1342f12d9..b069b08810cb8 100644 --- a/onnxruntime/test/util/test_utils.cc +++ b/onnxruntime/test/util/test_utils.cc @@ -18,7 +18,8 @@ namespace onnxruntime { namespace test { static void VerifyOutputs(const std::vector& output_names, const std::vector& expected_fetches, - const std::vector& fetches) { + const std::vector& fetches, + const EPVerificationParams& params) { ASSERT_EQ(expected_fetches.size(), fetches.size()); for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) { @@ -40,10 +41,8 @@ static void VerifyOutputs(const std::vector& output_names, << " mismatch for " << output_names[i]; break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - constexpr float abs_err = 1e-5f; - EXPECT_THAT(ltensor.DataAsSpan(), - ::testing::Pointwise(::testing::FloatNear(abs_err), rtensor.DataAsSpan())); + ::testing::Pointwise(::testing::FloatNear(params.fp32_abs_err), rtensor.DataAsSpan())); break; } default: @@ -72,16 +71,18 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type) { void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds) { + const NameMLValMap& feeds, + const EPVerificationParams& params) { // read raw data from model provided by the model_path std::ifstream stream(model_path, std::ios::in | std::ios::binary); std::string model_data((std::istreambuf_iterator(stream)), std::istreambuf_iterator()); - RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds); + RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds, params); } void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds) { + const NameMLValMap& feeds, + const EPVerificationParams& params) { SessionOptions so; so.session_logid = log_id; RunOptions run_options; @@ -122,12 +123,17 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id // make sure that some nodes are assigned to the EP, otherwise this test is pointless... const auto& graph2 = session_object2.GetGraph(); auto ep_nodes = CountAssignedNodes(graph2, provider_type); - ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; + if (params.verify_entire_graph_use_ep) { + // Verify the entire graph is assigned to the EP + ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type; + } else { + ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; + } // Run with EP and verify the result std::vector fetches; ASSERT_STATUS_OK(session_object2.Run(run_options, feeds, output_names, &fetches)); - VerifyOutputs(output_names, expected_fetches, fetches); + VerifyOutputs(output_names, expected_fetches, fetches, params); } #if !defined(DISABLE_SPARSE_TENSORS)
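
Note: the abs_error / fp32_abs_err tolerances introduced above follow from the rounding mismatch already described in the test comments: the CPU reference path in the tests rounds with std::nearbyintf (ties to even), while NNAPI's Quantize implementation uses std::round (half away from zero). The standalone sketch below is illustrative only and is not part of this change; the values are made up to show where the two rounding modes disagree by exactly one unit.

// Illustrative only: why quantized NNAPI outputs can differ from the CPU EP by +/-1.
#include <cfenv>
#include <cmath>
#include <iostream>

int main() {
  std::fesetround(FE_TONEAREST);  // round-half-to-even, as used by std::nearbyintf on the CPU reference path
  for (float v : {0.5f, 1.5f, 2.5f}) {
    const int ties_to_even = static_cast<int>(std::nearbyintf(v));  // CPU reference style
    const int half_away = static_cast<int>(std::round(v));          // NNAPI style
    std::cout << v << " -> ties_to_even: " << ties_to_even
              << ", half_away_from_zero: " << half_away << '\n';
  }
  // 0.5 and 2.5 round differently (0 vs 1, 2 vs 3), so a uint8 result can be off by one,
  // which is why the tests above allow abs_error = 1 (or ~1% for float outputs).
  return 0;
}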
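The reference computation in RunQLinearMathTestFromFloat above follows the usual dequantize -> compute -> requantize pattern: dequantize A and B with their own scale/zero point, apply the float operation, divide by the output scale, round, add the output zero point, and clamp to the uint8 range. The minimal per-element sketch below mirrors that pattern only; all scales, zero points, and values are made up for illustration and are not taken from the tests.

// Minimal per-element QLinearAdd-style reference: dequantize -> add -> requantize.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const float a_scale = 0.02f, b_scale = 0.03f, c_scale = 0.05f;
  const int a_zp = 128, b_zp = 100, c_zp = 128;

  const uint8_t a_q = 200, b_q = 90;
  const float a = (a_q - a_zp) * a_scale;  // dequantize A
  const float b = (b_q - b_zp) * b_scale;  // dequantize B

  // Requantize the float result into C's scale/zero point and clamp to the uint8 range.
  const int rounded = static_cast<int>(std::nearbyintf((a + b) / c_scale)) + c_zp;
  const uint8_t c_q = static_cast<uint8_t>(std::clamp(rounded, 0, 255));

  std::cout << "quantized result: " << static_cast<int>(c_q) << '\n';
  return 0;
}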