diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
index 4f24fa26d8896..5dcf27c9b5d2e 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
@@ -64,10 +64,13 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() {
           {"Atan", {}},
           {"Asin", {}},
           {"Sin", {}},
+          {"Cos", {}},
           {"Sign", {}},
           {"Tanh", {}},
           {"Exp", {}},
-          {"LRN", {}}};
+          {"LRN", {}},
+          {"Ceil", {}},
+          {"Abs", {}}};
 }

 static const OpVersionsAndSelector::OpVersionsMap GetBinaryOpVersionsMap() {
   return {{"Add", {}},
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 43998084618c0..d80594d8f72c7 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -145,34 +145,28 @@ bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapp
   if (it != node_unit_supported_result.cend()) {
     return it->second;
   } else {
-    // quantized required, filter out the non-quantized nodes, filter in the QDQ nodes
-    auto IsQdqNode = [](const NodeUnit& node_unit) {
-      if ("QuantizeLinear" == node_unit.OpType() || "DequantizeLinear" == node_unit.OpType()) {
-        return true;
-      } else {
-        return false;
-      }
-    };
+    const std::string& op_type = node_unit.OpType();
+    const bool is_qdq_node = op_type == "QuantizeLinear" || op_type == "DequantizeLinear";

     // Is NPU backend, is single node, case by case
     // Q/DQ nodes -- supported
     // Transpose nodes -- supported
     // Cast nodes -- need to call CastOpBuilder::IsOpSupported
     if (is_npu_backend && NodeUnit::Type::SingleNode == node_unit.UnitType()) {
-      if (IsQdqNode(node_unit)) {  // Qnn has Quantize & Dequantize Op
+      if (is_qdq_node) {  // Qnn has Quantize & Dequantize Op
         LOGS(logger, VERBOSE) << "Single Q/DQ node is supported for NPU backend. Node name: " << node_unit.Name();
         return true;
       }

       // Transpose only changes the data layout. NPU still supports it.
-      if ("Transpose" == node_unit.OpType()) {
+      if ("Transpose" == op_type) {
         LOGS(logger, VERBOSE) << "Single Transpose node is supported for NPU backend. Node name: " << node_unit.Name();
         return true;
       }

-      // For Cast, need to call IsOpSupported (below) to validate input and output types.
+      // For Cast, And, and Or, we need to call IsOpSupported (below) to validate input and output types.
       // For other single non-qdq nodes, immediately return not supported.
-      if (node_unit.OpType() != "Cast") {
+      if (op_type != "Cast" && op_type != "And" && op_type != "Or") {
         LOGS(logger, WARNING) << "Non-QDQ " << node_unit.OpType()
                               << " operators are not supported on HTP or DSP backends. " << node_unit.OpType()
                               << " node `" << node_unit.Name() << "` will not be assigned to QNN EP.";
@@ -181,14 +175,14 @@ bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapp
     }

     // Non-NPU backend, quantized model not supported, but a QDQ node encountered
-    if (!is_npu_backend && IsQdqNode(node_unit)) {
+    if (!is_npu_backend && is_qdq_node) {
       LOGS(logger, ERROR) << "QDQ models are only supported on HTP or DSP backends. "
" << node_unit.OpType() << " node `" << node_unit.Name() << "` will not be assigned to QNN EP."; return false; } bool supported = false; - const auto* op_builder = qnn::GetOpBuilder(node_unit.OpType()); + const auto* op_builder = qnn::GetOpBuilder(op_type); if (op_builder == nullptr) { LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP." << node_unit.OpType() << " node `" << node_unit.Name() diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index d0be5aa201671..361903c386dd5 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -219,6 +219,15 @@ class ModelTestBuilder { return &graph_.GetOrCreateNodeArg(name, nullptr); } + NodeArg* MakeRandInitializerBool(const std::vector& shape) { + std::vector data_uint8 = rand_gen_.Uniform(shape, 0, 1); + std::vector data; + for (uint8_t x : data_uint8) { + data.push_back(x != 0); + } + return MakeInitializerBool(shape, data); + } + template NodeArg* MakeInitializer(const std::vector& shape, T min, T max) { return MakeInitializer(shape, rand_gen_.Uniform(shape, min, max)); diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 62dd322f292f2..7f6865a89e6e6 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -91,102 +91,6 @@ GetQDQTestCaseFn BuildQDQConvTransposeTestCase(const std::vector& input }; } -// Creates the following graph: -// _______________________ -// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) -// axes (int32, initializer) -> | Gather | -// |_______________________| -// -template -GetQDQTestCaseFn BuildQDQGatherOpTestCase(const std::vector& input_shape, - const std::vector indices, - const std::vector& indices_shape, - int64_t axis) { - return [input_shape, indices, indices_shape, axis](ModelTestBuilder& builder) { - auto* input_data = builder.MakeInput(input_shape, -1.0f, 1.0f); - auto* final_output = builder.MakeOutput(); - - // input_data -> Q/DQ -> - auto* input_qdq_output = AddQDQNodePair(builder, input_data, .003f, 1); - - auto* indices_input = builder.MakeInitializer(indices_shape, indices); - - auto* gather_output = builder.MakeIntermediate(); - Node& gather_node = builder.AddNode("Gather", {input_qdq_output, indices_input}, {gather_output}); - gather_node.AddAttribute("axis", axis); - - // -> Q/DQ -> final_output - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(gather_output, .003f, 1, - q_output); - - builder.AddDequantizeLinearNode(q_output, .003f, 1, - final_output); - }; -} - -// Creates the following graph: -// _______________________ -// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) -// axes (int32, initializer) -> | Gather | -// |_______________________| -// -template -GetQDQTestCaseFn BuildQDQGatherOpScalarIndicesTestCase(const std::vector& input_shape, - const IndicesType indices, - int64_t axis) { - return [input_shape, indices, axis](ModelTestBuilder& builder) { - auto* input_data = builder.MakeInput(input_shape, -1.0f, 1.0f); - auto* final_output = builder.MakeOutput(); - - // input_data -> Q/DQ -> - auto* input_qdq_output = AddQDQNodePair(builder, input_data, .003f, 1); - - auto* indices_input = builder.MakeScalarInitializer(indices); - - auto* gather_output = builder.MakeIntermediate(); - Node& gather_node = builder.AddNode("Gather", 
-                                        {input_qdq_output, indices_input}, {gather_output});
-    gather_node.AddAttribute("axis", axis);
-
-    // -> Q/DQ -> final_output
-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<QuantType>(gather_output, .003f, 1,
-                                             q_output);
-
-    builder.AddDequantizeLinearNode<QuantType>(q_output, .003f, 1,
-                                               final_output);
-  };
-}
-
-// Creates the following graph:
-//                                 _______________________
-//    input (f32) -> Q -> DQ ->   |                       | -> Q -> DQ -> output (f32)
-// axes (int32, initializer) ->   |        Gather         |
-//                                |_______________________|
-//
-template <typename QuantType, typename IndicesType>
-GetQDQTestCaseFn BuildQDQGatherOpScalarIndicesTestCase(const std::vector<int64_t>& input_shape,
-                                                       const IndicesType indices,
-                                                       int64_t axis) {
-  return [input_shape, indices, axis](ModelTestBuilder& builder) {
-    auto* input_data = builder.MakeInput<float>(input_shape, -1.0f, 1.0f);
-    auto* final_output = builder.MakeOutput();
-
-    // input_data -> Q/DQ ->
-    auto* input_qdq_output = AddQDQNodePair<QuantType>(builder, input_data, .003f, 1);
-
-    auto* indices_input = builder.MakeScalarInitializer<IndicesType>(indices);
-
-    auto* gather_output = builder.MakeIntermediate();
-    Node& gather_node = builder.AddNode("Gather", {input_qdq_output, indices_input}, {gather_output});
-    gather_node.AddAttribute("axis", axis);
-
-    // -> Q/DQ -> final_output
-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<QuantType>(gather_output, .003f, 1,
-                                             q_output);
-
-    builder.AddDequantizeLinearNode<QuantType>(q_output, .003f, 1,
-                                               final_output);
-  };
-}
-
-// Creates the following graph:
-//                                 _______________________
-//                                |                       |
-//    input (f32) -> Q -> DQ ->   |       LeakyRelu       | -> Q -> DQ -> output (f32)
-//                                |_______________________|
-//
-template <typename InputQType>
-GetQDQTestCaseFn BuildQDQLeakyReluOpTestCase(const std::vector<int64_t>& input_shape) {
-  return [input_shape](ModelTestBuilder& builder) {
-    auto* input_data = builder.MakeInput<float>(input_shape, -1.0f, 1.0f);
-    auto* final_output = builder.MakeOutput();
-
-    // input_data -> Q/DQ ->
-    auto* input_qdq_output = AddQDQNodePair<InputQType>(builder, input_data, 0.0473f, 137);
-
-    auto* leakyrelu_output = builder.MakeIntermediate();
-    Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input_qdq_output}, {leakyrelu_output});
-    leakyrelu_node.AddAttribute("alpha", 0.2f);
-
-    // -> Q/DQ -> final_output
-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<InputQType>(leakyrelu_output, 0.02696f, 48,
-                                              q_output);
-
-    builder.AddDequantizeLinearNode<InputQType>(q_output, 0.02696f, 48,
-                                                final_output);
-  };
-}
-
 template <typename InputType, typename WeightType, typename BiasType, typename OutputType>
 GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape) {
   return [input_shape, weights_shape](ModelTestBuilder& builder) {
diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
index 66b53109d7f05..e579e3274e699 100644
--- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
+++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
@@ -20,21 +20,29 @@ static GetTestModelFn BuildArgMxxTestCase(const std::string& op_type, TestInputD
                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
   return [op_type, input_def, attrs](ModelTestBuilder& builder) {
     auto* input = MakeTestInput(builder, input_def);
-    auto* output = builder.MakeOutput();
-    Node& argm_node = builder.AddNode(op_type, {input}, {output});
+    auto* argm_output = builder.MakeIntermediate();
+    Node& argm_node = builder.AddNode(op_type, {input}, {argm_output});

     for (const auto& attr : attrs) {
       argm_node.AddAttributeProto(attr);
     }
+
+    // Add cast to uint32
+    auto* output = builder.MakeOutput();
+    Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
+    const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
+    cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
   };
 }

 // Builds a QDQ model with ArgMin/ArgMax and a Cast to uint32. The quantization parameters are computed from the provided
 // input definition.
 template <typename QType>
-static GetTestModelFn BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
-                                             const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [op_type, input_def, attrs](ModelTestBuilder& builder) {
+static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
+                                                       const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
+  return [op_type, input_def, attrs](ModelTestBuilder& builder,
+                                     std::vector<QuantParams<QType>>& output_qparams) {
+    ORT_UNUSED_PARAMETER(output_qparams);
     QuantParams<QType> input_qparams = GetTestInputQuantParams<QType>(input_def);

     auto* input = MakeTestInput(builder, input_def);
@@ -75,8 +83,8 @@ static void RunCPUArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
                   expected_ep_assignment);
 }

-// Runs an ArgMax/ArgMin model on the QNN CPU backend.
-// Runs an ArgMax/ArgMin model on the QNN CPU backend. Checks the graph node assignment, and that inference
-// outputs for QNN EP and CPU EP match.
+// Runs a QDQ ArgMax/ArgMin model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment, and that
+// inference running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline
+// float32 model).
 template <typename QType>
 static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> input_def,
                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
@@ -90,10 +98,12 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif

-  RunQnnModelTest(BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment);
+  TestQDQModelAccuracy(BuildArgMxxTestCase(op_type, input_def, attrs),         // baseline float32 model
+                       BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs),  // QDQ model
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       1e-5f);
 }

 //
diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc
index c501622b5bb61..114802d56cfd3 100644
--- a/onnxruntime/test/providers/qnn/average_pool_test.cc
+++ b/onnxruntime/test/providers/qnn/average_pool_test.cc
@@ -17,16 +17,15 @@ namespace onnxruntime {
 namespace test {

 // Returns a function that creates a graph with a single AveragePool operator.
-static GetTestModelFn BuildAveragePoolTestCase(const std::vector<int64_t>& shape,
+static GetTestModelFn BuildAveragePoolTestCase(const TestInputDef<float>& input_def,
                                                const std::vector<int64_t>& kernel_shape,
                                                const std::vector<int64_t>& strides,
                                                const std::vector<int64_t>& pads,
                                                int64_t count_include_pad,
                                                const std::string& auto_pad = "NOTSET") {
-  return [shape, kernel_shape, strides, pads,
+  return [input_def, kernel_shape, strides, pads,
          count_include_pad, auto_pad](ModelTestBuilder& builder) {
-    // Random input data
-    auto input = builder.MakeInput<float>(shape, 0.0f, 10.0f);
+    auto* input = MakeTestInput(builder, input_def);

     auto* output = builder.MakeOutput();
     Node& pool_node = builder.AddNode("AveragePool", {input}, {output});
@@ -51,26 +50,20 @@ static GetTestModelFn BuildAveragePoolTestCase(const std::vector<int64_t>& shape

 // Returns a function that creates a graph with a QDQ AveragePool operator.
 template <typename QuantType>
-GetQDQTestCaseFn BuildAveragePoolQDQTestCase(const std::vector<int64_t>& shape,
-                                             const std::vector<int64_t>& kernel_shape,
-                                             const std::vector<int64_t>& strides,
-                                             const std::vector<int64_t>& pads,
-                                             int64_t count_include_pad,
-                                             const std::string& auto_pad = "NOTSET") {
-  return [shape, kernel_shape, strides, pads,
-          count_include_pad, auto_pad](ModelTestBuilder& builder) {
-    float dq_scale = 0.0038f;
-    float pool_output_scale = 0.0038f;
-    float q_scale = 0.0038f;
-    QuantType dq_zp = std::numeric_limits<QuantType>::max() / 2;
-    QuantType pool_output_zp = std::numeric_limits<QuantType>::max() / 2;
-    QuantType q_zp = std::numeric_limits<QuantType>::max() / 2;
-
-    auto* input_arg = builder.MakeInput<float>(shape, -1.0f, 1.0f);
-    auto* output_arg = builder.MakeOutput();
+GetTestQDQModelFn<QuantType> BuildAveragePoolQDQTestCase(const TestInputDef<float>& input_def,
+                                                         const std::vector<int64_t>& kernel_shape,
+                                                         const std::vector<int64_t>& strides,
+                                                         const std::vector<int64_t>& pads,
+                                                         int64_t count_include_pad,
+                                                         const std::string& auto_pad = "NOTSET") {
+  return [input_def, kernel_shape, strides, pads,
+          count_include_pad, auto_pad](ModelTestBuilder& builder,
+                                       std::vector<QuantParams<QuantType>>& output_qparams) {
+    auto* input_arg = MakeTestInput(builder, input_def);

     // add QDQ + AveragePool
-    auto* dq_output = AddQDQNodePair<QuantType>(builder, input_arg, dq_scale, dq_zp);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    auto* dq_output = AddQDQNodePair<QuantType>(builder, input_arg, input_qparams.scale, input_qparams.zero_point);
     auto* averagepool_output = builder.MakeIntermediate();
     Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output});
@@ -90,22 +83,15 @@ GetQDQTestCaseFn BuildAveragePoolQDQTestCase(const std::vector<int64_t>& shape,
       pool_node.AddAttribute("count_include_pad", count_include_pad);
     }

-    // add QDQ output
-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<QuantType>(averagepool_output,
-                                             pool_output_scale,
-                                             pool_output_zp,
-                                             q_output);
-    builder.AddDequantizeLinearNode<QuantType>(q_output,
-                                               q_scale,
-                                               q_zp,
-                                               output_arg);
+    // op_output -> Q -> DQ -> output
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, averagepool_output,
+                                                     output_qparams[0].scale, output_qparams[0].zero_point);
   };
 }

 // Runs an AveragePool model on the QNN CPU backend. Checks the graph node assignment, and that inference
 // outputs for QNN and CPU match.
-static void RunAveragePoolOpTest(const std::vector<int64_t>& shape,
+static void RunAveragePoolOpTest(const TestInputDef<float>& input_def,
                                  const std::vector<int64_t>& kernel_shape,
                                  const std::vector<int64_t>& strides,
                                  const std::vector<int64_t>& pads,
@@ -120,16 +106,16 @@ static void RunAveragePoolOpTest(const std::vector<int64_t>& shape,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif

-  RunQnnModelTest(BuildAveragePoolTestCase(shape, kernel_shape, strides, pads, count_include_pad, auto_pad),
+  RunQnnModelTest(BuildAveragePoolTestCase(input_def, kernel_shape, strides, pads, count_include_pad, auto_pad),
                   provider_options,
                   opset,
                   expected_ep_assignment);
 }

-// Runs a QDQ AveragePool model on the QNN HTP backend. Checks the graph node assignment, and that inference
-// outputs for QNN and CPU match.
+// Runs a QDQ AveragePool model on the QNN HTP backend. Checks the graph node assignment, and that accuracy
+// on QNN EP is at least as good as on CPU EP.
 template <typename QuantType>
-static void RunQDQAveragePoolOpTest(const std::vector<int64_t>& shape,
+static void RunQDQAveragePoolOpTest(const TestInputDef<float>& input_def,
                                     const std::vector<int64_t>& kernel_shape,
                                     const std::vector<int64_t>& strides,
                                     const std::vector<int64_t>& pads,
@@ -144,12 +130,13 @@ static void RunQDQAveragePoolOpTest(const std::vector<int64_t>& shape,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif

-  RunQnnModelTest(BuildAveragePoolQDQTestCase<QuantType>(shape, kernel_shape, strides, pads, count_include_pad,
-                                                         auto_pad),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment,
-                  fp32_abs_err);
+  TestQDQModelAccuracy(BuildAveragePoolTestCase(input_def, kernel_shape, strides, pads, count_include_pad, auto_pad),
+                       BuildAveragePoolQDQTestCase<QuantType>(input_def, kernel_shape, strides, pads, count_include_pad,
+                                                              auto_pad),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       fp32_abs_err);
 }

 //
@@ -157,45 +144,45 @@ static void RunQDQAveragePoolOpTest(const std::vector<int64_t>& shape,
 //

 // AveragePool with kernel size equal to the spatial dimension of input tensor.
-TEST_F(QnnCPUBackendTests, TestAveragePool_Global) {
-  RunAveragePoolOpTest({1, 2, 3, 3},  // shape
-                       {3, 3},        // kernel_shape
-                       {3, 3},        // strides
-                       {0, 0, 0, 0},  // pads
-                       0,             // count_include_pad
+TEST_F(QnnCPUBackendTests, AveragePool_Global) {
+  RunAveragePoolOpTest(TestInputDef<float>({1, 2, 3, 3}, false, -10.0f, 10.0f),  // random input
+                       {3, 3},                                                   // kernel_shape
+                       {3, 3},                                                   // strides
+                       {0, 0, 0, 0},                                             // pads
+                       0,                                                        // count_include_pad
                        "NOTSET",
                        ExpectedEPNodeAssignment::All);
 }

 // AveragePool that counts padding.
-TEST_F(QnnCPUBackendTests, TestAveragePool_CountIncludePad) {
-  RunAveragePoolOpTest({1, 2, 3, 3},  // shape
-                       {1, 1},        // kernel_shape
-                       {1, 1},        // strides
-                       {0, 0, 0, 0},  // pads
-                       1,             // count_include_pad
+TEST_F(QnnCPUBackendTests, AveragePool_CountIncludePad) {
+  RunAveragePoolOpTest(TestInputDef<float>({1, 2, 3, 3}, false, -10.0f, 10.0f),  // random input
+                       {1, 1},                                                   // kernel_shape
+                       {1, 1},                                                   // strides
+                       {0, 0, 0, 0},                                             // pads
+                       1,                                                        // count_include_pad
                        "NOTSET",
                        ExpectedEPNodeAssignment::All);
 }

 // AveragePool that use auto_pad 'SAME_UPPER'.
-TEST_F(QnnCPUBackendTests, TestAveragePool_AutopadSameUpper) {
-  RunAveragePoolOpTest({1, 2, 3, 3},  // shape
-                       {1, 1},        // kernel_shape
-                       {1, 1},        // strides
-                       {},            // pads
-                       1,             // count_include_pad
+TEST_F(QnnCPUBackendTests, AveragePool_AutopadSameUpper) {
+  RunAveragePoolOpTest(TestInputDef<float>({1, 2, 3, 3}, false, -10.0f, 10.0f),  // random input
+                       {1, 1},                                                   // kernel_shape
+                       {1, 1},                                                   // strides
+                       {},                                                       // pads
+                       1,                                                        // count_include_pad
                        "SAME_UPPER",
                        ExpectedEPNodeAssignment::All);
 }

 // AveragePool that use auto_pad 'SAME_LOWER'.
-TEST_F(QnnCPUBackendTests, TestAveragePool_AutopadSameLower) {
-  RunAveragePoolOpTest({1, 2, 3, 3},  // shape
-                       {1, 1},        // kernel_shape
-                       {1, 1},        // strides
-                       {},            // pads
-                       1,             // count_include_pad
+TEST_F(QnnCPUBackendTests, AveragePool_AutopadSameLower) {
+  RunAveragePoolOpTest(TestInputDef<float>({1, 2, 3, 3}, false, -10.0f, 10.0f),  // random input
+                       {1, 1},                                                   // kernel_shape
+                       {1, 1},                                                   // strides
+                       {},                                                       // pads
+                       1,                                                        // count_include_pad
                        "SAME_LOWER",
                        ExpectedEPNodeAssignment::All);
 }

@@ -206,8 +193,10 @@ TEST_F(QnnCPUBackendTests, TestAveragePool_AutopadSameLower) {
 //

 // QDQ AveragePool with kernel size equal to the spatial dimension of input tensor.
-TEST_F(QnnHTPBackendTests, TestAveragePool_Global_HTP_u8) {
-  RunQDQAveragePoolOpTest<uint8_t>({1, 2, 3, 3},  // shape
+TEST_F(QnnHTPBackendTests, AveragePool_Global_HTP) {
+  std::vector<float> input = {32.1289f, -59.981f, -17.2799f, 62.7263f, 33.6205f, -19.3515f, -54.0113f, 37.5648f, 61.5357f,
+                              -52.5769f, 27.3637f, -9.01382f, -65.5612f, 19.9497f, -47.9228f, 26.9813f, 83.064f, 0.362503f};
+  RunQDQAveragePoolOpTest<uint8_t>(TestInputDef<float>({1, 2, 3, 3}, false, input),
                                    {3, 3},        // kernel_shape
                                    {3, 3},        // strides
                                    {0, 0, 0, 0},  // pads
@@ -217,39 +206,48 @@ TEST_F(QnnHTPBackendTests, TestAveragePool_Global_HTP_u8) {
 }

 // QDQ AveragePool that counts padding.
-TEST_F(QnnHTPBackendTests, TestAveragePool_CountIncludePad_HTP_u8) {
-  RunQDQAveragePoolOpTest<uint8_t>({1, 2, 3, 3},  // shape
+TEST_F(QnnHTPBackendTests, AveragePool_CountIncludePad_HTP_u8) {
+  std::vector<float> input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f,
+                              1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+
+  RunQDQAveragePoolOpTest<uint8_t>(TestInputDef<float>({1, 2, 3, 3}, false, input),
                                    {1, 1},        // kernel_shape
                                    {1, 1},        // strides
                                    {0, 0, 0, 0},  // pads
                                    1,             // count_include_pad
                                    "NOTSET",
                                    ExpectedEPNodeAssignment::All,
-                                   18, 0.00381f);
+                                   18);
 }

 // QDQ AveragePool that use auto_pad 'SAME_UPPER'.
-TEST_F(QnnHTPBackendTests, TestAveragePool_AutopadSameUpper_HTP_u8) {
-  RunQDQAveragePoolOpTest<uint8_t>({1, 2, 3, 3},  // shape
-                                   {1, 1},        // kernel_shape
-                                   {1, 1},        // strides
-                                   {},            // pads
-                                   0,             // count_include_pad
+TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameUpper_HTP_u8) {
+  std::vector<float> input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f,
+                              1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+
+  RunQDQAveragePoolOpTest<uint8_t>(TestInputDef<float>({1, 2, 3, 3}, false, input),
+                                   {1, 1},  // kernel_shape
+                                   {1, 1},  // strides
+                                   {},      // pads
+                                   0,       // count_include_pad
                                    "SAME_UPPER",
                                    ExpectedEPNodeAssignment::All,
-                                   18, 0.00381f);
+                                   18);
 }

 // QDQ AveragePool that use auto_pad 'SAME_LOWER'.
-TEST_F(QnnHTPBackendTests, TestAveragePool_AutopadSameLower_HTP_u8) {
-  RunQDQAveragePoolOpTest<uint8_t>({1, 2, 3, 3},  // shape
-                                   {1, 1},        // kernel_shape
-                                   {1, 1},        // strides
-                                   {},            // pads
-                                   0,             // count_include_pad
+TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameLower_HTP_u8) {
+  std::vector<float> input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f,
+                              1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+
+  RunQDQAveragePoolOpTest<uint8_t>(TestInputDef<float>({1, 2, 3, 3}, false, input),
+                                   {1, 1},  // kernel_shape
+                                   {1, 1},  // strides
+                                   {},      // pads
+                                   0,       // count_include_pad
                                    "SAME_LOWER",
                                    ExpectedEPNodeAssignment::All,
-                                   18, 0.00381f);
+                                   18);
 }

 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc
index d69ce53d41b5c..9a4021c5563c8 100644
--- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc
@@ -15,53 +15,133 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)

-// Creates the graph:
-//                                  _______________________
-//    input_u8 -> DQ ->            |                       |
-//    scale_u8 (initializer) -> DQ |                       |
-//    bias_u8 (initializer) -> DQ  |  BatchNormalization   | -> Q -> output_u8
-//    mean_u8 (initializer) -> DQ  |                       |
-//    var_u8 (initializer) -> DQ   |_______________________|
-//
-// Currently used to test QNN EP.
+// Computes the mean and variance of inputs within a channel.
+// Requires an input with rank >= 3
+static void ComputeChannelMeanAndVar(const std::vector<float>& input_data, const std::vector<int64_t>& input_shape,
+                                     std::vector<float>& mean_vals, std::vector<float>& var_vals) {
+  const size_t input_rank = input_shape.size();
+  const size_t num_batches = input_shape[0];
+  const size_t num_channels = input_shape[1];
+
+  size_t batch_stride = 1;
+  for (size_t i = 1; i < input_rank; i++) {
+    batch_stride *= input_shape[i];
+  }
+  const size_t channel_stride = batch_stride / num_channels;
+
+  assert(mean_vals.size() == num_channels);
+  assert(var_vals.size() == num_channels);
+  for (size_t i = 0; i < num_channels; i++) {
+    mean_vals[i] = 0.0f;
+    var_vals[i] = 0.0f;
+  }
+
+  // Compute running sum of elements within each channel. The running sum is stored in the mean_vals array directly.
+  for (size_t b = 0; b < num_batches; b++) {
+    const size_t batch_start = b * batch_stride;
+
+    for (size_t c = 0; c < num_channels; c++) {
+      const size_t chan_start = batch_start + (c * channel_stride);
+
+      for (size_t i = chan_start; i < chan_start + channel_stride; i++) {
+        mean_vals[c] += input_data[i];
+      }
+    }
+  }
+
+  // Divide sums by the number of elements in a channel to get the mean.
+  for (size_t c = 0; c < num_channels; c++) {
+    mean_vals[c] /= static_cast<float>(num_batches * channel_stride);
+  }
+
+  // Compute running sum of squared deviations from the mean within each channel.
+  // The running sum is stored in the var_vals array directly.
+  for (size_t b = 0; b < num_batches; b++) {
+    const size_t batch_start = b * batch_stride;
+
+    for (size_t c = 0; c < num_channels; c++) {
+      const size_t chan_start = batch_start + (c * channel_stride);
+
+      for (size_t i = chan_start; i < chan_start + channel_stride; i++) {
+        const float deviation = input_data[i] - mean_vals[c];
+        var_vals[c] += (deviation * deviation);
+      }
+    }
+  }
+
+  // Divide sums by the number of elements in a channel to get the variance.
+  for (size_t c = 0; c < num_channels; c++) {
+    var_vals[c] /= static_cast<float>(num_batches * channel_stride);
+  }
+}
+
+static GetTestModelFn BuildBatchNormTestCase(const TestInputDef<float>& input_def,
+                                             const TestInputDef<float>& scale_def,
+                                             const TestInputDef<float>& bias_def) {
+  ORT_ENFORCE(input_def.IsRawData());            // Need raw data to compute mean and variance inputs.
+  ORT_ENFORCE(input_def.GetShape().size() > 2);  // Need at least rank 3 data for convenience.
+
+  return [input_def, scale_def, bias_def](ModelTestBuilder& builder) {
+    const auto& input_shape = input_def.GetShape();
+    const auto& input_data = input_def.GetRawData();
+    const int64_t num_channels = input_shape[1];
+
+    NodeArg* input = MakeTestInput(builder, input_def);
+    NodeArg* scale = MakeTestInput(builder, scale_def);
+    NodeArg* bias = MakeTestInput(builder, bias_def);
+
+    std::vector<float> mean_vals(num_channels);
+    std::vector<float> var_vals(num_channels);
+    ComputeChannelMeanAndVar(input_data, input_shape, mean_vals, var_vals);
+
+    NodeArg* mean = builder.MakeInitializer<float>({num_channels}, mean_vals);
+    NodeArg* var = builder.MakeInitializer<float>({num_channels}, var_vals);
+    NodeArg* output = builder.MakeOutput();
+    builder.AddNode("BatchNormalization", {input, scale, bias, mean, var}, {output});
+  };
+}
+
 template <typename InputQType, typename ScaleQType, typename BiasQType>
-GetQDQTestCaseFn BuildQDQBatchNormTestCase(const std::vector<int64_t>& input_shape) {
-  return [input_shape](ModelTestBuilder& builder) {
+GetTestQDQModelFn<InputQType> BuildQDQBatchNormTestCase(const TestInputDef<float>& input_def,
+                                                        const TestInputDef<float>& scale_def,
+                                                        const TestInputDef<float>& bias_def) {
+  ORT_ENFORCE(input_def.IsRawData());  // Need raw data to compute mean and variance inputs.
+  ORT_ENFORCE(input_def.GetShape().size() > 2);  // Need at least rank 3 data for convenience.
+
+  return [input_def, scale_def, bias_def](ModelTestBuilder& builder,
+                                          std::vector<QuantParams<InputQType>>& output_qparams) {
+    const auto& input_shape = input_def.GetShape();
+    const auto& input_data = input_def.GetRawData();
     const int64_t num_channels = input_shape[1];

-    const InputQType quant_zero_point = 0;
-    const float quant_scale = 1.0f;
-
-    auto* input = builder.MakeInput<InputQType>(input_shape, std::numeric_limits<InputQType>::min(),
-                                                std::numeric_limits<InputQType>::max());
-    auto* dq_input = builder.MakeIntermediate();
-    builder.AddDequantizeLinearNode<InputQType>(input, 0.0039f, quant_zero_point, dq_input);
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<InputQType>(builder, input, input_qparams.scale, input_qparams.zero_point);

-    auto* dq_scale_output = builder.MakeIntermediate();
-    auto* scale = builder.MakeInitializer<ScaleQType>({num_channels}, static_cast<ScaleQType>(1),
-                                                      static_cast<ScaleQType>(127));
-    builder.AddDequantizeLinearNode<ScaleQType>(scale, 0.0028f, quant_zero_point, dq_scale_output);
+    NodeArg* scale = MakeTestInput(builder, scale_def);
+    QuantParams<ScaleQType> scale_qparams = GetTestInputQuantParams<ScaleQType>(scale_def);
+    NodeArg* scale_qdq = AddQDQNodePair<ScaleQType>(builder, scale, scale_qparams.scale, scale_qparams.zero_point);

-    auto* dq_bias_output = builder.MakeIntermediate();
-    auto* bias = builder.MakeInitializer({num_channels}, std::vector<BiasQType>(num_channels));
-    builder.AddDequantizeLinearNode<BiasQType>(bias, quant_scale, quant_zero_point, dq_bias_output);
+    NodeArg* bias = MakeTestInput(builder, bias_def);
+    QuantParams<BiasQType> bias_qparams = GetTestInputQuantParams<BiasQType>(bias_def);
+    NodeArg* bias_qdq = AddQDQNodePair<BiasQType>(builder, bias, bias_qparams.scale, bias_qparams.zero_point);

-    auto* dq_mean_output = builder.MakeIntermediate();
-    auto* mean = builder.MakeInitializer({num_channels}, std::vector<InputQType>(num_channels));
-    builder.AddDequantizeLinearNode<InputQType>(mean, quant_scale, quant_zero_point, dq_mean_output);
+    std::vector<float> mean_vals(num_channels);
+    std::vector<float> var_vals(num_channels);
+    ComputeChannelMeanAndVar(input_data, input_shape, mean_vals, var_vals);

-    auto* dq_var_output = builder.MakeIntermediate();
-    auto* var = builder.MakeInitializer({num_channels}, std::vector<InputQType>(num_channels, 255));
-    builder.AddDequantizeLinearNode<InputQType>(var, 0.003921f, 0, dq_var_output);
+    NodeArg* mean = builder.MakeInitializer<float>({num_channels}, mean_vals);
+    QuantParams<InputQType> mean_qparams = GetDataQuantParams<InputQType>(mean_vals);
+    NodeArg* mean_qdq = AddQDQNodePair<InputQType>(builder, mean, mean_qparams.scale, mean_qparams.zero_point);

-    auto* batchnorm_output = builder.MakeIntermediate();
-    builder.AddNode("BatchNormalization", {dq_input, dq_scale_output, dq_bias_output, dq_mean_output, dq_var_output}, {batchnorm_output});
+    NodeArg* var = builder.MakeInitializer<float>({num_channels}, var_vals);
+    QuantParams<InputQType> var_qparams = GetDataQuantParams<InputQType>(var_vals);
+    NodeArg* var_qdq = AddQDQNodePair<InputQType>(builder, var, var_qparams.scale, var_qparams.zero_point);

-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<InputQType>(batchnorm_output, 0.00377f, quant_zero_point, q_output);
+    auto* batchnorm_output = builder.MakeIntermediate();
+    builder.AddNode("BatchNormalization", {input_qdq, scale_qdq, bias_qdq, mean_qdq, var_qdq},
+                    {batchnorm_output});

-    auto* final_output = builder.MakeOutput();
-    builder.AddDequantizeLinearNode<InputQType>(q_output, 0.00377f,
-                                                quant_zero_point,
-                                                final_output);
+    AddQDQNodePairWithOutputAsGraphOutput<InputQType>(builder, batchnorm_output, output_qparams[0].scale, output_qparams[0].zero_point);
   };
 }

@@ -72,7 +152,9 @@ GetQDQTestCaseFn BuildQDQBatchNormTestCase(const std::vector<int64_t>& input_sha
  * \param input_shape The input's shape.
  * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None).
  */
-static void RunBatchNormQDQTest(const std::vector<int64_t>& input_shape,
+static void RunBatchNormQDQTest(const TestInputDef<float>& input_def,
+                                const TestInputDef<float>& scale_def,
+                                const TestInputDef<float>& bias_def,
                                 ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
@@ -82,28 +164,49 @@ static void RunBatchNormQDQTest(const std::vector<int64_t>& input_shape,
 #endif

   // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
-  RunQnnModelTest(BuildQDQBatchNormTestCase<uint8_t, uint8_t, uint8_t>(input_shape),
-                  provider_options,
-                  11,
-                  expected_ep_assignment);
+  TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def),
+                       BuildQDQBatchNormTestCase<uint8_t, uint8_t, uint8_t>(input_def, scale_def, bias_def),
+                       provider_options,
+                       11,
+                       expected_ep_assignment,
+                       1e-5f);
 }

+// TODO: FIX TRANSLATION!!!
 // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQBatchNorm1D) {
-  RunBatchNormQDQTest({1, 2, 3}, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BatchNorm1D) {
+  constexpr int64_t num_channels = 2;
+
+  RunBatchNormQDQTest(TestInputDef<float>({1, num_channels, 3}, false, {-5.0f, -4.0f, -3.0f, 0.0f, 2.0f, 5.0f}),  // Input data
+                      TestInputDef<float>({num_channels}, true, {1.0f, 2.0f}),                                    // Scale initializer
+                      TestInputDef<float>({num_channels}, true, {1.1f, 2.1f}),                                    // Bias initializer
+                      ExpectedEPNodeAssignment::All);
 }

+// TODO: FIX TRANSLATION!!!
 // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit.
 // Use an input of rank 4.
-TEST_F(QnnHTPBackendTests, TestQDQBatchNorm2D) {
-  RunBatchNormQDQTest({2, 3, 4, 5}, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BatchNorm2D) {
+  constexpr int64_t num_channels = 2;
+  std::vector<float> input_data = {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f,
+                                   -7.0f, -5.0f, -3.0f, -1.0f, 0.0f, 2.1f, 4.3f, 7.0f};
+
+  RunBatchNormQDQTest(TestInputDef<float>({2, num_channels, 2, 2}, false, input_data),  // Input data
+                      TestInputDef<float>({num_channels}, true, {1.0f, 2.0f}),          // Scale initializer
+                      TestInputDef<float>({num_channels}, true, {1.1f, 2.1f}),          // Bias initializer
+                      ExpectedEPNodeAssignment::All);
 }

 // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit.
 // Use an input of rank 5. QNN BatchNormalization doesn't support 5D on HTP.
-TEST_F(QnnHTPBackendTests, TestQDQBatchNorm3D) {
-  RunBatchNormQDQTest({1, 2, 3, 4, 5}, ExpectedEPNodeAssignment::None);
+TEST_F(QnnHTPBackendTests, BatchNorm3D) {
+  constexpr int64_t num_channels = 2;
+  constexpr int64_t num_elems = 1 * num_channels * 3 * 4 * 5;
+  RunBatchNormQDQTest(TestInputDef<float>({1, num_channels, 3, 4, 5}, false, std::vector<float>(num_elems)),  // Input data (all zeros)
+                      TestInputDef<float>({num_channels}, true, {1.0f, 2.0f}),                                // Scale initializer
+                      TestInputDef<float>({num_channels}, true, {1.1f, 2.1f}),                                // Bias initializer
+                      ExpectedEPNodeAssignment::None);
 }

 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index ddaf7bbf59ad7..147c1dda13e66 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -142,65 +142,36 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef

 // Creates a graph with a single Q/DQ Conv operator. Used for testing HTP backend.
 template <typename InputQType, typename WeightQType>
-static GetTestModelFn BuildQDQConvTestCase(const std::string& conv_op_type, const TestInputDef<float>& input_def,
-                                           const TestInputDef<float>& weights_def,
-                                           const TestInputDef<float>& bias_def,
-                                           const std::vector<int64_t>& strides,
-                                           const std::vector<int64_t>& pads,
-                                           const std::vector<int64_t>& dilations,
-                                           const std::string& auto_pad = "NOTSET") {
-  return [conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, auto_pad](ModelTestBuilder& builder) {
-    auto* output = builder.MakeOutput();
-
-    using InputQLimits = std::numeric_limits<InputQType>;
-
-    const float input_scale = 0.004f;
-    const float weight_scale = 0.004f;
-    const InputQType io_zp = (InputQLimits::min() + InputQLimits::max()) / 2 + 1;
-
+static GetTestQDQModelFn<InputQType> BuildQDQConvTestCase(const std::string& conv_op_type, const TestInputDef<float>& input_def,
+                                                          const TestInputDef<float>& weights_def,
+                                                          const TestInputDef<float>& bias_def,
+                                                          const std::vector<int64_t>& strides,
+                                                          const std::vector<int64_t>& pads,
+                                                          const std::vector<int64_t>& dilations,
+                                                          const std::string& auto_pad = "NOTSET") {
+  return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
+          dilations, auto_pad](ModelTestBuilder& builder,
+                               std::vector<QuantParams<InputQType>>& output_qparams) {
     std::vector<NodeArg*> conv_inputs;

     // input -> Q/DQ ->
     auto* input = MakeTestInput(builder, input_def);
-    auto* input_qdq = AddQDQNodePair<InputQType>(builder, input, input_scale, io_zp);
+    QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
+    auto* input_qdq = AddQDQNodePair<InputQType>(builder, input, input_qparams.scale, input_qparams.zero_point);
     conv_inputs.push_back(input_qdq);

     // weights -> Q/DQ ->
     auto* weights = MakeTestInput(builder, weights_def);
-    auto* weights_qdq = AddQDQNodePair<WeightQType>(builder, weights, weight_scale, io_zp);
+    QuantParams<WeightQType> weights_qparams = GetTestInputQuantParams<WeightQType>(weights_def);
+    auto* weights_qdq = AddQDQNodePair<WeightQType>(builder, weights, weights_qparams.scale, weights_qparams.zero_point);
     conv_inputs.push_back(weights_qdq);

     // bias ->
     if (!bias_def.GetShape().empty()) {
-      NodeArg* bias_int32 = nullptr;
-      const float bias_scale = input_scale * weight_scale;  // Taken from python quantization tool: onnx_quantizer.py::quantize_bias_static()
-
-      // Bias must be int32 to be detected as a QDQ node unit.
-      // We must quantize the data.
-      if (bias_def.IsRandomData()) {
-        // Create random initializer def that is quantized to int32
-        const auto& rand_info = bias_def.GetRandomDataInfo();
-        TestInputDef<int32_t> bias_int32_def(bias_def.GetShape(), bias_def.IsInitializer(), static_cast<int32_t>(rand_info.min / bias_scale),
-                                             static_cast<int32_t>(rand_info.max / bias_scale));
-        bias_int32 = MakeTestInput(builder, bias_int32_def);
-      } else {
-        assert(bias_def.IsRawData());
-        // Create raw data initializer def that is quantized to int32
-        const auto& bias_f32_raw = bias_def.GetRawData();
-        const size_t num_elems = bias_f32_raw.size();
-
-        std::vector<int32_t> bias_int32_raw(num_elems);
-        for (size_t i = 0; i < num_elems; i++) {
-          bias_int32_raw[i] = static_cast<int32_t>(bias_f32_raw[i] / bias_scale);
-        }
-
-        TestInputDef<int32_t> bias_int32_def(bias_def.GetShape(), bias_def.IsInitializer(), bias_int32_raw);
-        bias_int32 = MakeTestInput(builder, bias_int32_def);
-      }
-
-      auto* bias = builder.MakeIntermediate();
-      builder.AddDequantizeLinearNode<int32_t>(bias_int32, bias_scale, 0, bias);
-      conv_inputs.push_back(bias);
+      // Bias requirement taken from python quantization tool: onnx_quantizer.py::quantize_bias_static()
+      const float bias_scale = input_qparams.scale * weights_qparams.scale;
+
+      conv_inputs.push_back(MakeTestQDQBiasInput(builder, bias_def, bias_scale));
     }

     auto* conv_output = builder.MakeIntermediate();
@@ -218,9 +189,7 @@ static GetTestModelFn BuildQDQConvTestCase(const std::string& conv_op_type, cons
       conv_node.AddAttribute("dilations", dilations);
     }

-    auto* q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode<InputQType>(conv_output, input_scale, io_zp, q_output);
-    builder.AddDequantizeLinearNode<InputQType>(q_output, input_scale, io_zp, output);
+    AddQDQNodePairWithOutputAsGraphOutput<InputQType>(builder, conv_output, output_qparams[0].scale, output_qparams[0].zero_point);
   };
 }

@@ -245,18 +214,19 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif

-  RunQnnModelTest(BuildQDQConvTestCase<InputQType, WeightQType>(conv_op_type, input_def, weights_def, bias_def,
-                                                                strides, pads, dilations, auto_pad),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment,
-                  fp32_abs_err);
+  TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, auto_pad),
+                       BuildQDQConvTestCase<InputQType, WeightQType>(conv_op_type, input_def, weights_def, bias_def,
+                                                                     strides, pads, dilations, auto_pad),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       fp32_abs_err);
 }

 // Check that QNN compiles DQ -> Conv -> Q as a single unit.
 // Tests bias as a dynamic input.
 // TODO: Segfaults when calling graphFinalize().
-TEST_F(QnnCPUBackendTests, DISABLED_TestCPUConvf32_dynamic_bias) {
+TEST_F(QnnCPUBackendTests, DISABLED_Convf32_dynamic_bias) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 1, 3, 3}, false, 0.0f, 10.0f),  // Random dynamic input
                    TestInputDef<float>({2, 1, 2, 2}, true, 0.0f, 1.0f),    // Random static weights
@@ -270,7 +240,7 @@ TEST_F(QnnCPUBackendTests, DISABLED_TestCPUConvf32_dynamic_bias) {

 // Check that QNN compiles DQ -> Conv -> Q as a single unit.
 // Tests bias as an initializer.
-TEST_F(QnnCPUBackendTests, TestCPUConvf32_bias_initializer) {
+TEST_F(QnnCPUBackendTests, Convf32_bias_initializer) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 1, 3, 3}, false, 0.0f, 10.0f),  // Random dynamic input
                    TestInputDef<float>({2, 1, 2, 2}, true, 0.0f, 1.0f),    // Random static weights
@@ -283,7 +253,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvf32_bias_initializer) {
 }

 // Tests Conv's auto_pad value "SAME_UPPER" (compares to CPU EP).
-TEST_F(QnnCPUBackendTests, TestCPUConvf32_AutoPadUpper) {
+TEST_F(QnnCPUBackendTests, Convf32_AutoPadUpper) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 1, 3, 3}, false, -3.0f, 3.0f),  // Random dynamic input
                    TestInputDef<float>({2, 1, 2, 2}, true, -1.0f, 1.0f),   // Random static weights
@@ -296,7 +266,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvf32_AutoPadUpper) {
 }

 // Tests ConvTranspose's auto_pad value "SAME_UPPER" (compares to CPU EP).
-TEST_F(QnnCPUBackendTests, TestCPUConvTransposef32_AutoPadUpper) {
+TEST_F(QnnCPUBackendTests, ConvTransposef32_AutoPadUpper) {
   RunCPUConvOpTest("ConvTranspose",
                    TestInputDef<float>({1, 1, 3, 3}, false, -3.0f, 3.0f),  // Random dynamic input
                    TestInputDef<float>({1, 2, 2, 2}, true, -1.0f, 1.0f),   // Random static weights
@@ -309,7 +279,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvTransposef32_AutoPadUpper) {
 }

 // Tests Conv's auto_pad value "SAME_LOWER" (compares to CPU EP).
-TEST_F(QnnCPUBackendTests, TestCPUConvf32_AutoPadLower) {
+TEST_F(QnnCPUBackendTests, Convf32_AutoPadLower) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 1, 3, 3}, false, -3.0f, 3.0f),  // Random dynamic input
                    TestInputDef<float>({2, 1, 2, 2}, false, -1.0f, 1.0f),  // Random dynamic weights
@@ -322,7 +292,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvf32_AutoPadLower) {
 }

 // Tests ConvTranspose's auto_pad value "SAME_LOWER" (compares to CPU EP).
-TEST_F(QnnCPUBackendTests, TestCPUConvTransposef32_AutoPadLower) {
+TEST_F(QnnCPUBackendTests, ConvTransposef32_AutoPadLower) {
   RunCPUConvOpTest("ConvTranspose",
                    TestInputDef<float>({1, 1, 3, 3}, false, -3.0f, 3.0f),  // Random dynamic input
                    TestInputDef<float>({1, 2, 2, 2}, false, -1.0f, 1.0f),  // Random dynamic weights
@@ -335,7 +305,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvTransposef32_AutoPadLower) {
 }

 // large input,output, pads
-TEST_F(QnnCPUBackendTests, TestCPUConvf32_large_input1_pad_bias_initializer) {
+TEST_F(QnnCPUBackendTests, Convf32_large_input1_pad_bias_initializer) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 3, 60, 452}, false, 0.0f, 10.0f),  // Random dynamic input
                    TestInputDef<float>({16, 3, 3, 3}, true, 0.0f, 1.0f),      // Random dynamic weights
@@ -349,7 +319,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvf32_large_input1_pad_bias_initializer) {
                    1e-4f);
 }

-TEST_F(QnnCPUBackendTests, TestCPUConvf32_large_input2_nopad_bias_initializer) {
+TEST_F(QnnCPUBackendTests, Convf32_large_input2_nopad_bias_initializer) {
 #if defined(_WIN32)
   // Tolerance needs to be > 1.52588e-05 on Windows x64
   // TODO: Investigate why
@@ -372,7 +342,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvf32_large_input2_nopad_bias_initializer) {
 }

 // Test 1D Conv with static weights (implemented in QNN EP as 2D convolution with height of 1).
-TEST_F(QnnCPUBackendTests, TestCPUConv1Df32_StaticWeights_DefaultBias) {
+TEST_F(QnnCPUBackendTests, Conv1Df32_StaticWeights_DefaultBias) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 2, 4}, false, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}),  // Dynamic input
                    TestInputDef<float>({1, 2, 2}, true, {1.0f, 2.0f, 3.0f, 4.0f}),                           // Static weights
@@ -385,7 +355,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConv1Df32_StaticWeights_DefaultBias) {
 }

 // Test 1D Conv with dynamic weights (implemented in QNN EP as 2D convolution with height of 1).
-TEST_F(QnnCPUBackendTests, TestCPUConv1Df32_DynamicWeights_DefaultBias) {
+TEST_F(QnnCPUBackendTests, Conv1Df32_DynamicWeights_DefaultBias) {
   RunCPUConvOpTest("Conv",
                    TestInputDef<float>({1, 2, 4}, false, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}),  // Dynamic input
                    TestInputDef<float>({1, 2, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f}),                          // Dynamic weights
@@ -398,7 +368,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConv1Df32_DynamicWeights_DefaultBias) {
 }

 // Test 1D ConvTranspose with static weights (implemented in QNN EP as 2D convolution with height of 1).
-TEST_F(QnnCPUBackendTests, TestCPUConvTranspose1Df32_StaticWeights_DefaultBias) {
+TEST_F(QnnCPUBackendTests, ConvTranspose1Df32_StaticWeights_DefaultBias) {
   RunCPUConvOpTest("ConvTranspose",
                    TestInputDef<float>({1, 2, 4}, false, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}),  // Dynamic input
                    TestInputDef<float>({2, 1, 2}, true, {1.0f, 2.0f, 3.0f, 4.0f}),                           // Static weights
@@ -411,7 +381,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvTranspose1Df32_StaticWeights_DefaultBias)
 }

 // Test 1D ConvTranspose with dynamic weights (implemented in QNN EP as 2D convolution with height of 1).
-TEST_F(QnnCPUBackendTests, TestCPUConvTranspose1Df32_DynamicWeights_DefaultBias) {
+TEST_F(QnnCPUBackendTests, ConvTranspose1Df32_DynamicWeights_DefaultBias) {
   RunCPUConvOpTest("ConvTranspose",
                    TestInputDef<float>({1, 2, 4}, false, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}),  // Dynamic input
                    TestInputDef<float>({2, 1, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f}),                          // Dynamic weights
@@ -427,7 +397,7 @@ TEST_F(QnnCPUBackendTests, TestCPUConvTranspose1Df32_DynamicWeights_DefaultBias)

 // Check that QNN compiles DQ -> Conv -> Q as a single unit.
 // Tests bias as a dynamic input.
-TEST_F(QnnHTPBackendTests, TestQDQConvU8S32_bias_dynamic_input) {
+TEST_F(QnnHTPBackendTests, ConvU8S32_bias_dynamic_input) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 1, 5, 5}, false, 0.0f, 10.0f),    // Random dynamic input
                                      TestInputDef<float>({1, 1, 3, 3}, true, -10.0f, 10.0f),   // Random static input
@@ -441,35 +411,35 @@ TEST_F(QnnHTPBackendTests, TestQDQConvU8S32_bias_dynamic_input) {

 // Test that dynamic weights with default bias works for Conv. This was previously not working
 // on older versions of QNN sdk.
-TEST_F(QnnHTPBackendTests, TestQDQConvU8S32_DynamicWeight_NoBias) {
+TEST_F(QnnHTPBackendTests, ConvU8S32_DynamicWeight_NoBias) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
-                                     TestInputDef<float>({1, 3, 32, 32}, false, 0.0f, 10.0f),   // Random dynamic input
-                                     TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),   // Random dynamic weights
-                                     TestInputDef<float>(),                                     // Default bias
-                                     {1, 1},                                                    // Strides
-                                     {0, 0, 0, 0},                                              // Pads
-                                     {1, 1},                                                    // Dilations
+                                     TestInputDef<float>({1, 3, 32, 32}, false, -10.0f, 10.0f),  // Random dynamic input
+                                     TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),    // Random dynamic weights
+                                     TestInputDef<float>(),                                      // Default bias
+                                     {1, 1},                                                     // Strides
+                                     {0, 0, 0, 0},                                               // Pads
+                                     {1, 1},                                                     // Dilations
                                      "NOTSET",
                                      ExpectedEPNodeAssignment::All);
 }

 // Test that dynamic weights with default bias works for ConvTranspose. This was previously not working
 // on older versions of QNN sdk.
-TEST_F(QnnHTPBackendTests, TestQDQConvTransposeU8S32_DynamicWeight_NoBias) {
+TEST_F(QnnHTPBackendTests, ConvTransposeU8S32_DynamicWeight_NoBias) {
   RunHTPConvOpTest<uint8_t, uint8_t>("ConvTranspose",
-                                     TestInputDef<float>({1, 3, 32, 32}, false, 0.0f, 100.0f),  // Random dynamic input
-                                     TestInputDef<float>({3, 1, 4, 4}, false, -10.0f, 10.0f),   // Random dynamic weights
-                                     TestInputDef<float>(),                                     // Default bias
-                                     {1, 1},                                                    // Strides
-                                     {0, 0, 0, 0},                                              // Pads
-                                     {1, 1},                                                    // Dilations
+                                     TestInputDef<float>({1, 3, 32, 32}, false, -10.0f, 10.0f),  // Random dynamic input
+                                     TestInputDef<float>({3, 1, 4, 4}, false, -10.0f, 10.0f),    // Random dynamic weights
+                                     TestInputDef<float>(),                                      // Default bias
+                                     {1, 1},                                                     // Strides
+                                     {0, 0, 0, 0},                                               // Pads
+                                     {1, 1},                                                     // Dilations
                                      "NOTSET",
                                      ExpectedEPNodeAssignment::All);
 }

 // Check that QNN compiles DQ -> Conv -> Q as a single unit.
 // Tests bias as an initializer.
-TEST_F(QnnHTPBackendTests, TestQDQConvU8U8S32_bias_initializer) {
+TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_initializer) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 1, 5, 5}, false, 0.0f, 10.0f),   // Random dynamic input
                                      TestInputDef<float>({1, 1, 3, 3}, true, -10.0f, 10.0f),  // Random static weight
@@ -482,7 +452,7 @@ TEST_F(QnnHTPBackendTests, TestQDQConvU8U8S32_bias_initializer) {
 }

 // Tests 1D Conv with bias as an initializer.
-TEST_F(QnnHTPBackendTests, TestQDQConv1DU8S32_bias_initializer) {
+TEST_F(QnnHTPBackendTests, Conv1DU8S32_bias_initializer) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({1, 2, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -495,7 +465,7 @@ TEST_F(QnnHTPBackendTests, TestQDQConv1DU8S32_bias_initializer) {
 }

 // Tests 1D ConvTranspose with bias as an initializer.
-TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8S32_bias_initializer) {
+TEST_F(QnnHTPBackendTests, ConvTranspose1DU8S32_bias_initializer) {
   RunHTPConvOpTest<uint8_t, uint8_t>("ConvTranspose",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({2, 1, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -508,7 +478,7 @@ TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8S32_bias_initializer) {
 }

 // Tests auto_pad value "SAME_UPPER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConvU8S32_AutoPadUpper) {
+TEST_F(QnnHTPBackendTests, ConvU8S32_AutoPadUpper) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 1, 5, 5}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({1, 1, 4, 4}, true, -1.f, 1.f),   // Static weights
@@ -518,12 +488,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConvU8S32_AutoPadUpper) {
                                      {1, 1},        // dilations
                                      "SAME_UPPER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests Conv1d auto_pad value "SAME_UPPER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConv1DU8U8S32_AutoPadUpper) {
+TEST_F(QnnHTPBackendTests, Conv1DU8U8S32_AutoPadUpper) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({1, 2, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -533,12 +502,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConv1DU8U8S32_AutoPadUpper) {
                                      {1},           // dilations
                                      "SAME_UPPER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests TransposeConv1d auto_pad value "SAME_UPPER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8U8S32_AutoPadUpper) {
+TEST_F(QnnHTPBackendTests, ConvTranspose1DU8U8S32_AutoPadUpper) {
   RunHTPConvOpTest<uint8_t, uint8_t>("ConvTranspose",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({2, 1, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -548,12 +516,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8U8S32_AutoPadUpper) {
                                      {1},           // dilations
                                      "SAME_UPPER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests Conv's auto_pad value "SAME_LOWER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConvU8U8S32_AutoPadLower) {
+TEST_F(QnnHTPBackendTests, ConvU8U8S32_AutoPadLower) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 1, 5, 5}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({1, 1, 4, 4}, true, -1.f, 1.f),   // Static weights
@@ -563,12 +530,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConvU8U8S32_AutoPadLower) {
                                      {1, 1},        // dilations
                                      "SAME_LOWER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests ConvTranspose's auto_pad value "SAME_LOWER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConvTransposeU8U8S32_AutoPadLower) {
+TEST_F(QnnHTPBackendTests, ConvTransposeU8U8S32_AutoPadLower) {
   RunHTPConvOpTest<uint8_t, uint8_t>("ConvTranspose",
                                      TestInputDef<float>({1, 1, 5, 5}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({1, 1, 4, 4}, true, -1.f, 1.f),   // Static weights
@@ -578,12 +544,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConvTransposeU8U8S32_AutoPadLower) {
                                      {1, 1},        // dilations
                                      "SAME_LOWER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests Conv1d auto_pad value "SAME_LOWER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConv1DU8U8S32_AutoPadLower) {
+TEST_F(QnnHTPBackendTests, Conv1DU8U8S32_AutoPadLower) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({1, 2, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -593,12 +558,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConv1DU8U8S32_AutoPadLower) {
                                      {1},           // dilations
                                      "SAME_LOWER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // Tests ConvTranspose 1d auto_pad value "SAME_LOWER" on HTP backend (compares to CPU EP).
-TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8U8S32_AutoPadLower) {
+TEST_F(QnnHTPBackendTests, ConvTranspose1DU8U8S32_AutoPadLower) {
   RunHTPConvOpTest<uint8_t, uint8_t>("ConvTranspose",
                                      TestInputDef<float>({1, 2, 4}, false, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}),  // Dynamic input
                                      TestInputDef<float>({2, 1, 2}, true, {1.f, 2.f, 3.f, 4.f}),                       // Static weight
@@ -608,12 +572,11 @@ TEST_F(QnnHTPBackendTests, TestQDQConvTranspose1DU8U8S32_AutoPadLower) {
                                      {1},           // dilations
                                      "SAME_LOWER",  // auto_pad
                                      ExpectedEPNodeAssignment::All,
-                                     13,
-                                     1e-4f);
+                                     13);
 }

 // TODO: re-enable tests once HTP issues are resolved
-TEST_F(QnnHTPBackendTests, DISABLED_TestQDQConvU8U8S32_large_input1_padding_bias_initializer) {
+TEST_F(QnnHTPBackendTests, DISABLED_ConvU8U8S32_large_input1_padding_bias_initializer) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 3, 60, 452}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({16, 3, 3, 3}, true, -1.f, 1.f),     // Static weights
@@ -625,7 +588,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_TestQDQConvU8U8S32_large_input1_padding_bias
                                      ExpectedEPNodeAssignment::All);
 }

-TEST_F(QnnHTPBackendTests, DISABLED_TestQDQConvU8S32_large_input2_bias_initializer) {
+TEST_F(QnnHTPBackendTests, DISABLED_ConvU8S32_large_input2_bias_initializer) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 128, 8, 56}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({32, 128, 1, 1}, true, -1.f, 1.f),   // Random static weights
@@ -638,7 +601,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_TestQDQConvU8S32_large_input2_bias_initializ
 }

 // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC).
-TEST_F(QnnHTPBackendTests, DISABLED_TestQDQConvU8U8S32_LargeInput_Dilations_Pads) {
+TEST_F(QnnHTPBackendTests, DISABLED_ConvU8U8S32_LargeInput_Dilations_Pads) {
   RunHTPConvOpTest<uint8_t, uint8_t>("Conv",
                                      TestInputDef<float>({1, 3, 768, 1152}, false, 0.f, 10.f),  // Dynamic input
                                      TestInputDef<float>({64, 3, 7, 7}, true, -1.f, 1.f),       // Random static weights
diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
index 3571cdff9b6cc..d2ca9d8ff71e0 100644
--- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
@@ -6,7 +6,6 @@
 #include <string>

 #include "core/graph/graph.h"
-#include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"

 #include "gtest/gtest.h"
@@ -15,17 +14,47 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)

-/**
- * Runs a Gather op model on the QNN HTP backend. Checks the graph node assignment, and that inference
- * outputs for QNN and CPU match.
- *
- * \param opset The opset version.
- * \param scalar_indices whether the incidices input is scalar or not.
- * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None)
- */
+// Function that builds a float model with a Gather op.
+template <typename IndicesType>
+static GetTestModelFn BuildGatherOpTestCase(const TestInputDef<float>& input_def,
+                                            const TestInputDef<IndicesType>& indices_def,
+                                            int64_t axis = 0) {
+  return [input_def, indices_def, axis](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+    NodeArg* indices = MakeTestInput(builder, indices_def);
+    NodeArg* output = builder.MakeOutput();
+
+    Node& gather_node = builder.AddNode("Gather", {input, indices}, {output});
+    gather_node.AddAttribute("axis", axis);
+  };
+}
+
+// Function that builds a QDQ model with a Gather op.
+template <typename QuantType, typename IndicesType>
+static GetTestQDQModelFn<QuantType> BuildQDQGatherOpTestCase(const TestInputDef<float>& input_def,
+                                                             const TestInputDef<IndicesType>& indices_def,
+                                                             int64_t axis = 0) {
+  return [input_def, indices_def, axis](ModelTestBuilder& builder,
+                                        std::vector<QuantParams<QuantType>>& output_qparams) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
+
+    NodeArg* indices = MakeTestInput(builder, indices_def);
+
+    NodeArg* gather_output = builder.MakeIntermediate();
+    Node& gather_node = builder.AddNode("Gather", {input_qdq, indices}, {gather_output});
+    gather_node.AddAttribute("axis", axis);
+
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, gather_output, output_qparams[0].scale, output_qparams[0].zero_point);
+  };
+}
+
+// Tests the accuracy of a QDQ Gather model on QNN EP. Checks that the QDQ model on QNN EP is as accurate as the QDQ
+// model on CPU EP (compared to the float32 model).
 template <typename QuantType, typename IndicesType>
-static void RunGatherOpQDQTest(int opset, bool scalar_indices = false,
-                               ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All) {
+static void RunQDQGatherOpTest(const TestInputDef<float>& input_def, const TestInputDef<IndicesType>& indices_def,
+                               int64_t axis, int opset, ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -33,54 +62,69 @@ static void RunGatherOpQDQTest(int opset, bool scalar_indices = false,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif

-  if (scalar_indices) {
-    RunQnnModelTest(BuildQDQGatherOpScalarIndicesTestCase<QuantType, IndicesType>({2, 3, 4},  // input shape
-                                                                                  1,          // indices
-                                                                                  1),         // axis
-                    provider_options,
-                    opset,
-                    expected_ep_assignment);
-  } else {
-    RunQnnModelTest(BuildQDQGatherOpTestCase<QuantType, IndicesType>({2, 3, 4},                     // input shape
-                                                                     std::vector<IndicesType>{1},   // indices
-                                                                     {1},                           // indices_shape
-                                                                     1),                            // axis
-                    provider_options,
-                    opset,
-                    expected_ep_assignment);
-  }
+  TestQDQModelAccuracy(BuildGatherOpTestCase<IndicesType>(input_def, indices_def, axis),
+                       BuildQDQGatherOpTestCase<QuantType, IndicesType>(input_def, indices_def, axis),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       1e-5f);
 }

 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
-// nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
+// nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP.
 //
-// - Uses uint8 as the quantization type.
-TEST_F(QnnHTPBackendTests, TestQDQGatherOpU8) {
-  RunGatherOpQDQTest<uint8_t, int32_t>(11);
+// Static int64 indices with default axis.
+TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt64_Axis0) {
+  RunQDQGatherOpTest<uint8_t, int64_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
+                                       TestInputDef<int64_t>({2, 2}, true, {0, 1, 1, 2}),
+                                       0,
+                                       13,
+                                       ExpectedEPNodeAssignment::All);
+}
+
+// Tests that dynamic int64 indices are not supported on HTP backend.
+TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt64_Axis0) {
+  RunQDQGatherOpTest<uint8_t, int64_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
+                                       TestInputDef<int64_t>({2, 2}, false, {0, 1, 1, 2}),
+                                       0,
+                                       13,
+                                       ExpectedEPNodeAssignment::None);
 }

 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
-// nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
+// nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP.
 //
-// - Uses int8 as the quantization type.
-TEST_F(QnnHTPBackendTests, TestQDQGatherOpI8) { - RunGatherOpQDQTest(11); +// Static int32 indices with default axis. +TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis0) { + RunQDQGatherOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}), + TestInputDef({2, 2}, true, {0, 1, 1, 2}), + 0, + 13, + ExpectedEPNodeAssignment::All); } // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP. // -// - Uses uint8 as the quantization type. -TEST_F(QnnHTPBackendTests, TestQDQGatherOpScalarIndicesU8) { - RunGatherOpQDQTest(11, true); +// Dynamic int32 indices with default axis. +TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt32_Axis0) { + RunQDQGatherOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}), + TestInputDef({2, 2}, false, {0, 1, 1, 2}), + 0, + 13, + ExpectedEPNodeAssignment::All); } // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP. // -// - Uses int8 as the quantization type. -TEST_F(QnnHTPBackendTests, TestQDQGatherOpScalarIndicesI8) { - RunGatherOpQDQTest(11, true); +// Static int32 indices with axis = 1 +TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis1) { + RunQDQGatherOpTest(TestInputDef({3, 3}, false, {1.0f, 1.2f, 1.9f, 2.3f, 3.4f, 3.9f, 4.5f, 5.7f, 5.9f}), + TestInputDef({1, 2}, true, {0, 2}), + 1, + 13, + ExpectedEPNodeAssignment::All); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc index 3846a2868a895..683c4d49fa99d 100644 --- a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc @@ -16,47 +16,56 @@ namespace onnxruntime { namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) -// Creates the graph: -// _______________________ -// input_u8 -> DQ -> | | -> Q -> output_u8 -// scale_u8 (initializer) -> DQ -> | InstanceNormalization | -// bias_u8 (initializer) -> DQ -> |_______________________| -// -// Currently used to test QNN EP. -template -GetQDQTestCaseFn BuildQDQInstanceNormTestCase(const TestInputDef& input_def, - const TestInputDef& scale_def, - const TestInputDef& bias_def, - const std::vector& attrs) { +// Function that builds a float32 model with an InstanceNormalization operator. 
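+// Per the ONNX spec, InstanceNormalization computes, independently for each channel of each
+// batch item: y = scale * (x - mean) / sqrt(variance + epsilon) + bias.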
+GetTestModelFn BuildInstanceNormTestCase(const TestInputDef& input_def, + const TestInputDef& scale_def, + const TestInputDef& bias_def, + const std::vector& attrs) { return [input_def, scale_def, bias_def, attrs](ModelTestBuilder& builder) { - const QuantType quant_zero_point = 0; - const float quant_scale = 1.0f; - - auto* dq_scale_output = builder.MakeIntermediate(); - auto* scale = MakeTestInput(builder, scale_def); - builder.AddDequantizeLinearNode(scale, quant_scale, quant_zero_point, dq_scale_output); + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* scale = MakeTestInput(builder, scale_def); + NodeArg* bias = MakeTestInput(builder, bias_def); - // Add bias (initializer) -> DQ -> - auto* dq_bias_output = builder.MakeIntermediate(); - auto* bias = MakeTestInput(builder, bias_def); - builder.AddDequantizeLinearNode(bias, 1.0f, 0, dq_bias_output); + NodeArg* output = builder.MakeOutput(); + Node& op_node = builder.AddNode("InstanceNormalization", {input, scale, bias}, {output}); - // Add input_u8 -> DQ -> - auto* input_u8 = MakeTestInput(builder, input_def); - auto* dq_input_output = builder.MakeIntermediate(); - builder.AddDequantizeLinearNode(input_u8, quant_scale, quant_zero_point, dq_input_output); + for (const auto& attr : attrs) { + op_node.AddAttributeProto(attr); + } + }; +} - // Add dq_input_output -> InstanceNormalization -> +// Function that builds a QDQ model with an InstanceNormalization operator. +template +static GetTestQDQModelFn BuildQDQInstanceNormTestCase(const TestInputDef& input_def, + const TestInputDef& scale_def, + const TestInputDef& bias_def, + const std::vector& attrs) { + return [input_def, scale_def, bias_def, attrs](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input => Q => DQ => + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); + + // scale => Q => DQ => + NodeArg* scale = MakeTestInput(builder, scale_def); + QuantParams scale_qparams = GetTestInputQuantParams(scale_def); + NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); + + // bias (as int32) => DQ => + NodeArg* bias_qdq = MakeTestQDQBiasInput(builder, bias_def, input_qparams.scale * scale_qparams.scale); + + // InstanceNormalization operator. auto* instance_norm_output = builder.MakeIntermediate(); - Node& inst_norm_node = builder.AddNode("InstanceNormalization", {dq_input_output, dq_scale_output, dq_bias_output}, + Node& inst_norm_node = builder.AddNode("InstanceNormalization", {input_qdq, scale_qdq, bias_qdq}, {instance_norm_output}); for (const auto& attr : attrs) { inst_norm_node.AddAttributeProto(attr); } // Add instance_norm_output -> Q -> output_u8 - auto* output_u8 = builder.MakeOutput(); - builder.AddQuantizeLinearNode(instance_norm_output, quant_scale, quant_zero_point, output_u8); + AddQDQNodePairWithOutputAsGraphOutput(builder, instance_norm_output, output_qparams[0].scale, output_qparams[0].zero_point); }; } @@ -71,9 +80,9 @@ GetQDQTestCaseFn BuildQDQInstanceNormTestCase(const TestInputDef& inp * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None). 
*/ template -static void RunInstanceNormQDQTest(const TestInputDef& input_def, - const TestInputDef& scale_def, - const TestInputDef& bias_def, +static void RunInstanceNormQDQTest(const TestInputDef& input_def, + const TestInputDef& scale_def, + const TestInputDef& bias_def, const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment) { ProviderOptions provider_options; @@ -84,50 +93,39 @@ static void RunInstanceNormQDQTest(const TestInputDef& input_def, #endif // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. - RunQnnModelTest(BuildQDQInstanceNormTestCase(input_def, scale_def, bias_def, attrs), - provider_options, - 18, - expected_ep_assignment); + TestQDQModelAccuracy(BuildInstanceNormTestCase(input_def, scale_def, bias_def, attrs), + BuildQDQInstanceNormTestCase(input_def, scale_def, bias_def, attrs), + provider_options, + 18, + expected_ep_assignment, + 1e-5f); } // Check that QNN compiles DQ -> InstanceNormalization -> Q as a single unit. // Use an input of rank 4. -TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8) { - RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3}, false, 0, 255), - TestInputDef({2}, true, 0, 127), - TestInputDef({2}, true, 0, 10), +TEST_F(QnnHTPBackendTests, InstanceNormU8) { + RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), + TestInputDef({2}, true, -2.0f, 2.0f), + TestInputDef({2}, true, -3.0f, 3.0f), {}, ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> InstanceNormalization -> Q as a single unit. // Use an input of rank 3. -TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8Rank3) { - RunInstanceNormQDQTest(TestInputDef({1, 2, 3}, false, {6, 4, 2, 6, 8, 2}), - TestInputDef({2}, true, {1, 2}), - TestInputDef({2}, true, {1, 3}), - {}, - ExpectedEPNodeAssignment::All); -} - -// TODO: This test now fails in QNN SDK version 2.12.0 (windows arm64 and linux x86_64). -// This worked in QNN SDK version 2.10.0. Need to determine the severity of this inaccuracy. -// -// Exepcted output: 2 6 2 42 42 0 -// Actual output: 2 6 2 43 43 0 -TEST_F(QnnHTPBackendTests, DISABLED_TestQDQInstanceNormU8Rank3_QnnSdk_2_12_Regression) { - RunInstanceNormQDQTest(TestInputDef({1, 2, 3}, false, {3, 4, 3, 9, 9, 8}), - TestInputDef({2}, true, {2, 57}), - TestInputDef({2}, true, {3, 2}), +TEST_F(QnnHTPBackendTests, InstanceNormU8Rank3) { + RunInstanceNormQDQTest(TestInputDef({1, 2, 3}, false, {6.0f, 4.0f, 2.0f, 6.0f, 8.0f, 2.0f}), + TestInputDef({2}, true, {1.0f, 2.0f}), + TestInputDef({2}, true, {1.0f, 3.0f}), {}, ExpectedEPNodeAssignment::All); } // Check that QNN InstanceNorm operator does not handle inputs with rank > 4. 
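+// The rank-5 input below is expected to be rejected, leaving no nodes assigned to QNN EP.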
-TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8Rank5) { - RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3, 3}, false, 0, 255), - TestInputDef({2}, true, 0, 127), - TestInputDef({2}, true, 0, 10), +TEST_F(QnnHTPBackendTests, InstanceNormU8Rank5) { + RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3, 3}, false, -10.0f, 10.0f), + TestInputDef({2}, true, -2.0f, 2.0f), + TestInputDef({2}, true, -3.0f, 3.0f), {}, ExpectedEPNodeAssignment::None); } diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index d9512d16a1f28..3b73a6bf800a3 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -113,6 +113,7 @@ static void RunLayerNormQDQTest(const std::vector& input_shape, #endif // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. + // TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) RunQnnModelTest(BuildQDQLayerNormTestCase(input_shape, scale_shape, axis_value), provider_options, 11, @@ -122,11 +123,14 @@ static void RunLayerNormQDQTest(const std::vector& input_shape, // Check that QNN compiles DQ -> LayerNormalization -> Q as a single unit. // Use an input of rank 3. // Failed QNN op validation: QnnDsp Param[0] has incorrect Value 3 +// TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) TEST_F(QnnHTPBackendTests, TestQDQLayerNorm1DAxis0) { RunLayerNormQDQTest({1, 2, 3}, {1, 2, 3}, ExpectedEPNodeAssignment::None); } // Failed QNN FinalizeGraphs: QnnDsp Failed to finalize graph (id: 1) with err 1002 +// +// TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) TEST_F(QnnHTPBackendTests, DISABLED_TestQDQLayerNorm1DAxis2) { RunLayerNormQDQTest({1, 2, 3}, {3}, ExpectedEPNodeAssignment::All, -1); } @@ -136,4 +140,4 @@ TEST_F(QnnHTPBackendTests, DISABLED_TestQDQLayerNorm1DAxis2) { } // namespace test } // namespace onnxruntime -#endif \ No newline at end of file +#endif diff --git a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc index 489ac1924eb8e..772476cb0d245 100644 --- a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc @@ -15,17 +15,44 @@ namespace onnxruntime { namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) -/** - * Runs a LeakyRelu op model on the QNN HTP backend. Checks the graph node assignment, and that inference - * outputs for QNN and CPU match. - * - * \param op_type The LeakyRelu op type (e.g., ReduceSum). - * \param opset The opset version. - * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None) - */ +// Creates a function that builds a model with a LeakyRelu operator. +static GetTestModelFn BuildLeakyReluOpTestCase(const TestInputDef& input_def, float alpha) { + return [input_def, alpha](ModelTestBuilder& builder) { + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* output = builder.MakeOutput(); + Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input}, {output}); + leakyrelu_node.AddAttribute("alpha", alpha); + }; +} + +// Creates a function that builds a QDQ model with a LeakyRelu operator. 
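+// LeakyRelu computes f(x) = x for x >= 0 and f(x) = alpha * x for x < 0.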
+template
+static GetTestQDQModelFn BuildQDQLeakyReluOpTestCase(const TestInputDef& input_def,
+                                                                 float alpha) {
+  return [input_def, alpha](ModelTestBuilder& builder,
+                            std::vector>& output_qparams) {
+    // input => Q => DQ =>
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams input_qparams = GetTestInputQuantParams(input_def);
+    NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point);
+
+    // LeakyRelu
+    auto* leakyrelu_output = builder.MakeIntermediate();
+    Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input_qdq}, {leakyrelu_output});
+    leakyrelu_node.AddAttribute("alpha", alpha);
+
+    // => Q => DQ => final output
+    AddQDQNodePairWithOutputAsGraphOutput(builder, leakyrelu_output, output_qparams[0].scale,
+                                          output_qparams[0].zero_point);
+  };
+}
+
+// Checks the accuracy of a QDQ LeakyRelu model by comparing to ORT CPU EP.
 template
-static void RunLeakyReluOpQDQTest(int opset,
-                                  ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All) {
+static void RunLeakyReluOpQDQTest(const TestInputDef& input_def,
+                                  float alpha,
+                                  int opset,
+                                  ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -33,26 +60,34 @@ static void RunLeakyReluOpQDQTest(int opset,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  RunQnnModelTest(BuildQDQLeakyReluOpTestCase({2, 3, 4}),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment);
+  TestQDQModelAccuracy(BuildLeakyReluOpTestCase(input_def, alpha),
+                       BuildQDQLeakyReluOpTestCase(input_def, alpha),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       1e-5f);
 }
 
 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
-TEST_F(QnnHTPBackendTests, TestQDQLeakyReluOpSet15) {
-  RunLeakyReluOpQDQTest(15);
+TEST_F(QnnHTPBackendTests, LeakyReluOpSet15) {
+  RunLeakyReluOpQDQTest(TestInputDef({1, 2, 3}, false, {-40.0f, -20.0f, 0.0f, 10.0f, 30.0f, 40.0f}),
+                        0.2f,
+                        15,
+                        ExpectedEPNodeAssignment::All);
 }
 
 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
-TEST_F(QnnHTPBackendTests, TestQDQLeakyReluOpSet16) {
-  RunLeakyReluOpQDQTest(16);
+TEST_F(QnnHTPBackendTests, LeakyReluOpSet16) {
+  RunLeakyReluOpQDQTest(TestInputDef({1, 2, 3}, false, {-40.0f, -20.0f, 0.0f, 10.0f, 30.0f, 40.0f}),
+                        0.2f,
+                        16,
+                        ExpectedEPNodeAssignment::All);
 }
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc
index 3b28678bcb0a7..82f7b246aa5e4 100644
--- a/onnxruntime/test/providers/qnn/lrn_op_test.cc
+++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc
@@ -17,10 +17,10 @@ namespace onnxruntime {
 namespace test {
 
 // Creates a graph with a single LRN operator. Used for testing CPU backend.
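+// Per the ONNX spec, LRN normalizes across channels: y = x / (bias + (alpha / size) * sum_sq)^beta,
+// where sum_sq is the sum of squares over a window of `size` adjacent channels.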
-static GetTestModelFn BuildLRNTestCase(const std::vector& shape, int64_t size, +static GetTestModelFn BuildLRNTestCase(const TestInputDef& input_def, int64_t size, float alpha = 0.0001f, float beta = 0.75f, float bias = 1.0f) { - return [shape, size, alpha, beta, bias](ModelTestBuilder& builder) { - auto* input = builder.MakeInput(shape, 0.0f, 20.0f); + return [input_def, size, alpha, beta, bias](ModelTestBuilder& builder) { + auto* input = MakeTestInput(builder, input_def); auto* output = builder.MakeOutput(); Node& lrn_node = builder.AddNode("LRN", {input}, {output}); @@ -31,40 +31,34 @@ static GetTestModelFn BuildLRNTestCase(const std::vector& shape, int64_ }; } -// Q/DQ scaled used to build Q/DQ test model. This is a global constant -// because results from HTP backend are off by exactly this amount. -static constexpr float qdq_scale = 0.0038f; - // Creates a graph with a single Q/DQ LRN operator. Used for testing HTP backend. template -static GetTestModelFn BuildQDQLRNTestCase(const std::vector& shape, int64_t size, - float alpha = 0.0001f, float beta = 0.75f, float bias = 1.0f) { - return [shape, size, alpha, beta, bias](ModelTestBuilder& builder) { - const InputQType zero_point = std::numeric_limits::max() / 2; - - auto* input = builder.MakeInput(shape, -1.0f, 1.0f); - auto* output = builder.MakeOutput(); - - // input -> Q -> DQ -> LRN - auto* qdq_output = AddQDQNodePair(builder, input, qdq_scale, zero_point); - auto* lrn_output = builder.MakeIntermediate(); - - Node& lrn_node = builder.AddNode("LRN", {qdq_output}, {lrn_output}); +static GetTestQDQModelFn BuildQDQLRNTestCase(const TestInputDef& input_def, int64_t size, + float alpha = 0.0001f, float beta = 0.75f, float bias = 1.0f) { + return [input_def, size, alpha, beta, bias](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input -> Q -> DQ -> + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); + + // LRN + NodeArg* lrn_output = builder.MakeIntermediate(); + Node& lrn_node = builder.AddNode("LRN", {input_qdq}, {lrn_output}); lrn_node.AddAttribute("size", size); lrn_node.AddAttribute("alpha", alpha); lrn_node.AddAttribute("beta", beta); lrn_node.AddAttribute("bias", bias); - // -> Q -> DQ -> output - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(lrn_output, qdq_scale, zero_point, q_output); - builder.AddDequantizeLinearNode(q_output, qdq_scale, zero_point, output); + // LRN output -> Q -> DQ -> final output + AddQDQNodePairWithOutputAsGraphOutput(builder, lrn_output, output_qparams[0].scale, + output_qparams[0].zero_point); }; } // Runs an LRN model on the QNN CPU backend. Checks the graph node assignment, and that inference // outputs for QNN EP and CPU EP match. -static void RunCPULRNOpTest(const std::vector& shape, int64_t size, +static void RunCPULRNOpTest(const TestInputDef& input_def, int64_t size, ExpectedEPNodeAssignment expected_ep_assignment, float alpha = 0.0001f, float beta = 0.75f, float bias = 1.0f, int opset = 13) { ProviderOptions provider_options; @@ -77,7 +71,7 @@ static void RunCPULRNOpTest(const std::vector& shape, int64_t size, fp32_abs_err = 1.5e-5f; // On linux we need slightly larger tolerance. 
#endif - RunQnnModelTest(BuildLRNTestCase(shape, size, alpha, beta, bias), + RunQnnModelTest(BuildLRNTestCase(input_def, size, alpha, beta, bias), provider_options, opset, expected_ep_assignment, @@ -87,10 +81,10 @@ static void RunCPULRNOpTest(const std::vector& shape, int64_t size, // Runs an LRN model on the QNN HTP backend. Checks the graph node assignment, and that inference // outputs for QNN EP and CPU EP match. template -static void RunQDQLRNOpTest(const std::vector& shape, int64_t size, +static void RunQDQLRNOpTest(const TestInputDef& input_def, int64_t size, ExpectedEPNodeAssignment expected_ep_assignment, float alpha = 0.0001f, float beta = 0.75f, float bias = 1.0f, - int opset = 13, float fp32_abs_err = qdq_scale) { + int opset = 13) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -98,27 +92,34 @@ static void RunQDQLRNOpTest(const std::vector& shape, int64_t size, provider_options["backend_path"] = "libQnnHtp.so"; #endif - RunQnnModelTest(BuildQDQLRNTestCase(shape, size, alpha, beta, bias), - provider_options, - opset, - expected_ep_assignment, - fp32_abs_err + 0.0001f); + TestQDQModelAccuracy(BuildLRNTestCase(input_def, size, alpha, beta, bias), + BuildQDQLRNTestCase(input_def, size, alpha, beta, bias), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); } // // CPU tests: // -TEST_F(QnnCPUBackendTests, TestCPULRNSize3) { - RunCPULRNOpTest({1, 128, 4, 5}, 3, ExpectedEPNodeAssignment::All); +TEST_F(QnnCPUBackendTests, LRNSize3) { + RunCPULRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 3, // Size + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestCPULRNSize5) { - RunCPULRNOpTest({1, 128, 4, 5}, 5, ExpectedEPNodeAssignment::All); +TEST_F(QnnCPUBackendTests, LRNSize5) { + RunCPULRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 5, // Size + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestCPULRN_size_larger_than_channel) { - RunCPULRNOpTest({1, 128, 4, 5}, 255, ExpectedEPNodeAssignment::All); +TEST_F(QnnCPUBackendTests, LRN_size_larger_than_channel) { + RunCPULRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 255, // Size + ExpectedEPNodeAssignment::All); } #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) @@ -126,16 +127,22 @@ TEST_F(QnnCPUBackendTests, TestCPULRN_size_larger_than_channel) { // HTP tests: // -TEST_F(QnnHTPBackendTests, TestHTPLRNSize3) { - RunQDQLRNOpTest({1, 128, 4, 5}, 3, ExpectedEPNodeAssignment::All); +TEST_F(QnnHTPBackendTests, LRNSize3) { + RunQDQLRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 3, // Size + ExpectedEPNodeAssignment::All); } -TEST_F(QnnHTPBackendTests, TestHTPLRNSize5) { - RunQDQLRNOpTest({1, 128, 4, 5}, 5, ExpectedEPNodeAssignment::All); +TEST_F(QnnHTPBackendTests, LRNSize5) { + RunQDQLRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 5, // Size + ExpectedEPNodeAssignment::All); } -TEST_F(QnnHTPBackendTests, TestHTPLRN_size_larger_than_channel) { - RunQDQLRNOpTest({1, 128, 4, 5}, 255, ExpectedEPNodeAssignment::All); +TEST_F(QnnHTPBackendTests, LRN_size_larger_than_channel) { + RunQDQLRNOpTest(TestInputDef({1, 128, 4, 5}, false, -10.0f, 10.0f), + 255, // Size + ExpectedEPNodeAssignment::All); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index 5c7a08ae06080..421bdfdaf1bb6 100644 --- 
a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -6,7 +6,6 @@ #include #include -#include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" #include "onnx/onnx_pb.h" @@ -17,74 +16,46 @@ namespace onnxruntime { namespace test { // Returns a function that creates a graph with MatMul operator. -static GetTestModelFn BuildMatMulOpTestCase(const std::vector& input1_shape, - const std::vector& input2_shape) { - return [input1_shape, input2_shape](ModelTestBuilder& builder) { - // Random input data - auto input1 = builder.MakeInput(input1_shape, 0.0f, 10.0f); - auto input2 = builder.MakeInput(input2_shape, 0.0f, 10.0f); - - auto* output = builder.MakeOutput(); +static GetTestModelFn BuildMatMulOpTestCase(const TestInputDef& input1_def, + const TestInputDef& input2_def) { + return [input1_def, input2_def](ModelTestBuilder& builder) { + NodeArg* input1 = MakeTestInput(builder, input1_def); + NodeArg* input2 = MakeTestInput(builder, input2_def); + NodeArg* output = builder.MakeOutput(); builder.AddNode("MatMul", {input1, input2}, {output}); }; } -// Returns a function that creates a graph with a QDQ AveragePool operator. +// Returns a function that creates a graph with a QDQ MatMul operator. template -GetQDQTestCaseFn BuildMatMulOpQDQTestCase(const std::vector& input1_shape, - const std::vector& input2_shape) { - return [input1_shape, input2_shape](ModelTestBuilder& builder) { - float pool_output_scale = 0.0038f; - float q_scale = 0.0038f; - QuantType pool_output_zp = std::numeric_limits::max() / 2; - QuantType q_zp = std::numeric_limits::max() / 2; - - auto* input_arg = builder.MakeInput(input1_shape, -1.f, 1.f); - auto* output_arg = builder.MakeOutput(); - - using InputLimits = std::numeric_limits; - - // add QDQ input - auto* q1_output = builder.MakeIntermediate(); - auto* dq1_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(input_arg, - pool_output_scale, - pool_output_zp, - q1_output); - builder.AddDequantizeLinearNode(q1_output, - q_scale, - q_zp, - dq1_output); - - // add input b initializer (NNAPI only supports case of MatMul A*B - B is an initializer) - auto* dq_2_output = builder.MakeIntermediate(); - auto* input_b = builder.MakeInitializer(input2_shape, InputLimits::min(), InputLimits::max()); - builder.AddDequantizeLinearNode(input_b, - q_scale, - q_zp, - dq_2_output); - - // add MatMul operator - auto* matmul_op_output = builder.MakeIntermediate(); - builder.AddNode("MatMul", {dq1_output, dq_2_output}, {matmul_op_output}); - - // add QDQ output - auto* q3_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(matmul_op_output, - pool_output_scale, - pool_output_zp, - q3_output); - builder.AddDequantizeLinearNode(q3_output, - q_scale, - q_zp, - output_arg); +static GetTestQDQModelFn BuildMatMulOpQDQTestCase(const TestInputDef& input1_def, + const TestInputDef& input2_def) { + return [input1_def, input2_def](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input1 -> Q -> DQ -> + NodeArg* input1 = MakeTestInput(builder, input1_def); + QuantParams input1_qparams = GetTestInputQuantParams(input1_def); + auto* input1_qdq = AddQDQNodePair(builder, input1, input1_qparams.scale, input1_qparams.zero_point); + + // input2 -> Q -> DQ -> + NodeArg* input2 = MakeTestInput(builder, input2_def); + QuantParams input2_qparams = GetTestInputQuantParams(input2_def); + auto* input2_qdq = AddQDQNodePair(builder, input2, input2_qparams.scale, input2_qparams.zero_point); + + 
// MatMul + auto* op_output = builder.MakeIntermediate(); + builder.AddNode("MatMul", {input1_qdq, input2_qdq}, {op_output}); + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, + output_qparams[0].zero_point); }; } -// Runs an AveragePool model on the QNN CPU backend. Checks the graph node assignment, and that inference +// Runs an MatMul model on the QNN CPU backend. Checks the graph node assignment, and that inference // outputs for QNN and CPU match. -static void RunMatMulOpOpTest(const std::vector& input1_shape, - const std::vector& input2_shape, +static void RunMatMulOpOpTest(const TestInputDef& input1_def, + const TestInputDef& input2_def, ExpectedEPNodeAssignment expected_ep_assignment, int opset = 13) { ProviderOptions provider_options; @@ -94,19 +65,20 @@ static void RunMatMulOpOpTest(const std::vector& input1_shape, provider_options["backend_path"] = "libQnnCpu.so"; #endif - RunQnnModelTest(BuildMatMulOpTestCase(input1_shape, input2_shape), + RunQnnModelTest(BuildMatMulOpTestCase(input1_def, input2_def), provider_options, opset, - expected_ep_assignment); + expected_ep_assignment, + 2e-4f); } -// Runs a QDQ AveragePool model on the QNN HTP backend. Checks the graph node assignment, and that inference -// outputs for QNN and CPU match. +// Runs a QDQ MatMul model on the QNN HTP backend. Checks the graph node assignment, and that the +// QDQ model is accurate on QNN EP (compared to CPU EP). template -static void RunQDQMatMulOpOpTest(const std::vector& input1_shape, - const std::vector& input2_shape, +static void RunQDQMatMulOpOpTest(const TestInputDef& input1_def, + const TestInputDef& input2_def, ExpectedEPNodeAssignment expected_ep_assignment, - int opset = 18, float fp32_abs_err = 1e-5f) { + int opset = 18) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -114,27 +86,28 @@ static void RunQDQMatMulOpOpTest(const std::vector& input1_shape, provider_options["backend_path"] = "libQnnHtp.so"; #endif - RunQnnModelTest(BuildMatMulOpQDQTestCase(input1_shape, input2_shape), - provider_options, - opset, - expected_ep_assignment, - fp32_abs_err); + TestQDQModelAccuracy(BuildMatMulOpTestCase(input1_def, input2_def), + BuildMatMulOpQDQTestCase(input1_def, input2_def), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); } // // CPU tests: // -TEST_F(QnnCPUBackendTests, TestMatMulOp) { - RunMatMulOpOpTest({2, 2} /* input_shape1 */, - {2, 2} /* input_shape2 */, +TEST_F(QnnCPUBackendTests, MatMulOp) { + RunMatMulOpOpTest(TestInputDef({2, 3}, false, {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f}), + TestInputDef({3, 2}, false, {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f}), ExpectedEPNodeAssignment::All, 18); } -// QNN broadcast issue -TEST_F(QnnCPUBackendTests, DISABLED_TestMatMulOp2) { - RunMatMulOpOpTest({28, 1, 64} /* input_shape1 */, - {64, 32} /* input_shape2 */, +// Test MatMul broadcasting +TEST_F(QnnCPUBackendTests, MatMulOp_Broadcast) { + RunMatMulOpOpTest(TestInputDef({28, 1, 64}, false, -10.0f, 10.0f), + TestInputDef({64, 32}, false, -10.0f, 10.0f), ExpectedEPNodeAssignment::All, 18); } @@ -143,27 +116,17 @@ TEST_F(QnnCPUBackendTests, DISABLED_TestMatMulOp2) { // HTP tests: // -TEST_F(QnnHTPBackendTests, TestMatMulOp_HTP_u8) { - RunQDQMatMulOpOpTest({2, 2} /* input_shape1 */, - {2, 2} /* input_shape2 */, - ExpectedEPNodeAssignment::All, - 18, 0.00381f); -} - -// QNN broadcast issue -TEST_F(QnnHTPBackendTests, DISABLED_TestMatMulOp2_HTP_u8) { - 
RunQDQMatMulOpOpTest({28, 1, 64} /* input_shape1 */, - {64, 32} /* input_shape2 */, - ExpectedEPNodeAssignment::All, - 18, 0.00381f); +TEST_F(QnnHTPBackendTests, MatMulOp_HTP_u8) { + RunQDQMatMulOpOpTest(TestInputDef({2, 3}, false, {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f}), + TestInputDef({3, 2}, false, {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f}), + ExpectedEPNodeAssignment::All, 18); } -// QNN broadcast issue -TEST_F(QnnHTPBackendTests, DISABLED_TestMatMulOp3_HTP_u8) { - RunQDQMatMulOpOpTest({28, 1, 32} /* input_shape1 */, - {32, 2} /* input_shape2 */, - ExpectedEPNodeAssignment::All, - 18, 0.00381f); +// Test MatMul broadcasting +TEST_F(QnnHTPBackendTests, MatMulOp_Broadcast) { + RunQDQMatMulOpOpTest(TestInputDef({28, 1, 64}, false, -10.0f, 10.0f), + TestInputDef({64, 32}, false, -10.0f, 10.0f), + ExpectedEPNodeAssignment::All, 18); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/max_pool_test.cpp b/onnxruntime/test/providers/qnn/max_pool_test.cpp index 1beac1d326ccd..f574948f02c17 100644 --- a/onnxruntime/test/providers/qnn/max_pool_test.cpp +++ b/onnxruntime/test/providers/qnn/max_pool_test.cpp @@ -6,6 +6,7 @@ #include #include +#include "core/graph/node_attr_utils.h" #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -17,122 +18,50 @@ namespace onnxruntime { namespace test { // Returns a function that creates a graph with a single MaxPool operator. -static GetTestModelFn BuildMaxPoolTestCase(const std::vector& shape, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - const std::vector& dilations, - int64_t ceil_mode, - int64_t storage_order, - const std::string& auto_pad = "NOTSET") { - return [shape, kernel_shape, strides, pads, dilations, - ceil_mode, storage_order, auto_pad](ModelTestBuilder& builder) { - // Random input data - auto input = builder.MakeInput(shape, 0.0f, 10.0f); - - auto* output = builder.MakeOutput(); +static GetTestModelFn BuildMaxPoolTestCase(const TestInputDef& input_def, + const std::vector& attrs) { + return [input_def, attrs](ModelTestBuilder& builder) { + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* output = builder.MakeOutput(); Node& pool_node = builder.AddNode("MaxPool", {input}, {output}); - pool_node.AddAttribute("kernel_shape", kernel_shape); - - if (!strides.empty()) { - pool_node.AddAttribute("strides", strides); - } - - if (!dilations.empty()) { - pool_node.AddAttribute("dilations", dilations); - } - - pool_node.AddAttribute("auto_pad", auto_pad); - - if (!pads.empty() && auto_pad == "NOTSET") { - pool_node.AddAttribute("pads", pads); - } - - if (ceil_mode > 0) { - pool_node.AddAttribute("ceil_mode", ceil_mode); - } - - if (storage_order > 0) { - pool_node.AddAttribute("storage_order", storage_order); + for (const auto& attr : attrs) { + pool_node.AddAttributeProto(attr); } }; } // Returns a function that creates a graph with a QDQ MaxPool operator. 
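+// Note: MaxPool only selects existing input values, so the output Q/DQ pair created below
+// must reuse the input's quantization parameters (see the output_qparams overwrite).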
template -GetQDQTestCaseFn BuildMaxPoolQDQTestCase(const std::vector& shape, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - const std::vector& dilations, - int64_t ceil_mode, - int64_t storage_order, - const std::string& auto_pad = "NOTSET") { - return [shape, kernel_shape, strides, pads, dilations, - ceil_mode, storage_order, auto_pad](ModelTestBuilder& builder) { - float dq_scale = 0.0038f; - float pool_output_scale = 0.0038f; - float q_scale = 0.0038f; - QuantType dq_zp = std::numeric_limits::max() / 2; - QuantType pool_output_zp = std::numeric_limits::max() / 2; - QuantType q_zp = std::numeric_limits::max() / 2; - - auto* input_arg = builder.MakeInput(shape, -1.0f, 1.0f); - auto* output_arg = builder.MakeOutput(); - - // add QDQ + MaxPool - auto* dq_output = AddQDQNodePair(builder, input_arg, dq_scale, dq_zp); - auto* MaxPool_output = builder.MakeIntermediate(); - Node& pool_node = builder.AddNode("MaxPool", {dq_output}, {MaxPool_output}); - - pool_node.AddAttribute("kernel_shape", kernel_shape); - - if (!strides.empty()) { - pool_node.AddAttribute("strides", strides); - } - - if (!dilations.empty()) { - pool_node.AddAttribute("dilations", dilations); +GetTestQDQModelFn BuildMaxPoolQDQTestCase(const TestInputDef& input_def, + const std::vector& attrs) { + return [input_def, attrs](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input -> Q -> DQ -> + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); + + // MaxPool + NodeArg* pool_output = builder.MakeIntermediate(); + Node& pool_node = builder.AddNode("MaxPool", {input_qdq}, {pool_output}); + + for (const auto& attr : attrs) { + pool_node.AddAttributeProto(attr); } - pool_node.AddAttribute("auto_pad", auto_pad); - - if (!pads.empty() && auto_pad == "NOTSET") { - pool_node.AddAttribute("pads", pads); - } - - if (ceil_mode > 0) { - pool_node.AddAttribute("ceil_mode", ceil_mode); - } - - if (storage_order > 0) { - pool_node.AddAttribute("storage_order", storage_order); - } - - // add QDQ output - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(MaxPool_output, - pool_output_scale, - pool_output_zp, - q_output); - builder.AddDequantizeLinearNode(q_output, - q_scale, - q_zp, - output_arg); + // op_output -> Q -> DQ -> output + // NOTE: Input and output quantization parameters must be equal for MaxPool. + output_qparams[0] = input_qparams; // Overwrite! + AddQDQNodePairWithOutputAsGraphOutput(builder, pool_output, input_qparams.scale, + input_qparams.zero_point); }; } // Runs an MaxPool model on the QNN CPU backend. Checks the graph node assignment, and that inference // outputs for QNN and CPU match. 
-static void RunMaxPoolOpTest(const std::vector& shape, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - const std::vector& dilations, - int64_t ceil_mode, - int64_t storage_order, - const std::string& auto_pad, +static void RunMaxPoolOpTest(const TestInputDef& input_def, + const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, int opset = 18) { ProviderOptions provider_options; @@ -142,7 +71,7 @@ static void RunMaxPoolOpTest(const std::vector& shape, provider_options["backend_path"] = "libQnnCpu.so"; #endif - RunQnnModelTest(BuildMaxPoolTestCase(shape, kernel_shape, strides, pads, dilations, ceil_mode, storage_order, auto_pad), + RunQnnModelTest(BuildMaxPoolTestCase(input_def, attrs), provider_options, opset, expected_ep_assignment); @@ -151,16 +80,10 @@ static void RunMaxPoolOpTest(const std::vector& shape, // Runs a QDQ MaxPool model on the QNN HTP backend. Checks the graph node assignment, and that inference // outputs for QNN and CPU match. template -static void RunQDQMaxPoolOpTest(const std::vector& shape, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - const std::vector& dilations, - int64_t ceil_mode, - int64_t storage_order, - const std::string& auto_pad, +static void RunQDQMaxPoolOpTest(const TestInputDef& input_def, + const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, - int opset = 18, float fp32_abs_err = 1e-5f) { + int opset = 18) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -168,11 +91,12 @@ static void RunQDQMaxPoolOpTest(const std::vector& shape, provider_options["backend_path"] = "libQnnHtp.so"; #endif - RunQnnModelTest(BuildMaxPoolQDQTestCase(shape, kernel_shape, strides, pads, dilations, ceil_mode, storage_order, auto_pad), - provider_options, - opset, - expected_ep_assignment, - fp32_abs_err); + TestQDQModelAccuracy(BuildMaxPoolTestCase(input_def, attrs), + BuildMaxPoolQDQTestCase(input_def, attrs), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); } // @@ -180,65 +104,53 @@ static void RunQDQMaxPoolOpTest(const std::vector& shape, // // MaxPool with kernel size equal to the spatial dimension of input tensor. 
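+// (This is equivalent to a global max-pool over each channel.)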
-TEST_F(QnnCPUBackendTests, TestMaxPool_Global) { - RunMaxPoolOpTest({1, 2, 3, 3}, // shape - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad - ExpectedEPNodeAssignment::All); -} - -TEST_F(QnnCPUBackendTests, TestMaxPool_Large_Input) { - RunMaxPoolOpTest({1, 125, 8, 56}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnCPUBackendTests, MaxPool_Global) { + RunMaxPoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(0)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestMaxPool_Large_Input2) { - RunMaxPoolOpTest({1, 128, 16, 113}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnCPUBackendTests, MaxPool_Large_Input) { + RunMaxPoolOpTest(TestInputDef({1, 125, 8, 56}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{2, 2}), + utils::MakeAttribute("strides", std::vector{2, 2}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(0)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). -TEST_F(QnnCPUBackendTests, DISABLED_TestMaxPool_Ceil) { - RunMaxPoolOpTest({1, 2, 3, 3}, // shape - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 1, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnCPUBackendTests, DISABLED_MaxPool_Ceil) { + RunMaxPoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(1)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). 
-TEST_F(QnnCPUBackendTests, DISABLED_TestMaxPool_Large_Input2_Ceil) { - RunMaxPoolOpTest({1, 128, 16, 113}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 1, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnCPUBackendTests, DISABLED_MaxPool_Large_Input2_Ceil) { + RunMaxPoolOpTest(TestInputDef({1, 128, 16, 113}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{2, 2}), + utils::MakeAttribute("strides", std::vector{2, 2}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(1)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } @@ -247,79 +159,66 @@ TEST_F(QnnCPUBackendTests, DISABLED_TestMaxPool_Large_Input2_Ceil) { // HTP tests: // // QDQ MaxPool with kernel size equal to the spatial dimension of input tensor. -TEST_F(QnnHTPBackendTests, TestMaxPool_Global_HTP_u8) { - RunQDQMaxPoolOpTest({1, 2, 3, 3}, // shape - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad - ExpectedEPNodeAssignment::All); -} - -// TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). -TEST_F(QnnHTPBackendTests, DISABLED_TestMaxPool_Large_Input_HTP_u8) { - RunQDQMaxPoolOpTest({1, 125, 8, 56}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnHTPBackendTests, MaxPool_Global_HTP_u8) { + RunQDQMaxPoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(0)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). 
-TEST_F(QnnHTPBackendTests, DISABLED_TestMaxPool_Large_Input2_HTP_u8) { - RunQDQMaxPoolOpTest({1, 128, 16, 113}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnHTPBackendTests, DISABLED_MaxPool_Large_Input_HTP_u8) { + RunQDQMaxPoolOpTest(TestInputDef({1, 125, 8, 56}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{2, 2}), + utils::MakeAttribute("strides", std::vector{2, 2}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(0)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } -TEST_F(QnnHTPBackendTests, TestMaxPool_Ceil_HTP_u8) { - RunQDQMaxPoolOpTest({1, 2, 3, 3}, // shape - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 1, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnHTPBackendTests, MaxPool_Ceil_HTP_u8) { + RunQDQMaxPoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(1)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). -TEST_F(QnnHTPBackendTests, DISABLED_TestMaxPool_Large_Input2_Ceil_HTP_u8) { - RunQDQMaxPoolOpTest({1, 128, 16, 113}, // shape - {2, 2}, // kernel_shape - {2, 2}, // strides - {0, 0, 0, 0}, // pads - {1, 1}, // dialations - 1, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnHTPBackendTests, DISABLED_MaxPool_Large_Input2_Ceil_HTP_u8) { + RunQDQMaxPoolOpTest(TestInputDef({1, 128, 16, 113}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{2, 2}), + utils::MakeAttribute("strides", std::vector{2, 2}), + utils::MakeAttribute("pads", std::vector{0, 0, 0, 0}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(1)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). 
-TEST_F(QnnHTPBackendTests, DISABLED_TestMaxPool_LargeInput_1Pads) { - RunQDQMaxPoolOpTest({1, 64, 384, 576}, // shape - {3, 3}, // kernel_shape - {2, 2}, // strides - {1, 1, 1, 1}, // pads - {1, 1}, // dialations - 0, // ceil_mode - 0, // storage_order - "NOTSET", // auto_pad +TEST_F(QnnHTPBackendTests, DISABLED_MaxPool_LargeInput_1Pads) { + RunQDQMaxPoolOpTest(TestInputDef({1, 64, 384, 576}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{2, 2}), + utils::MakeAttribute("pads", std::vector{1, 1, 1, 1}), + utils::MakeAttribute("dilations", std::vector{1, 1}), + utils::MakeAttribute("ceil_mode", static_cast(0)), + utils::MakeAttribute("storage_order", static_cast(0)), + utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All); } diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 14f2a351d414c..6a6dc6d84af2f 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -43,6 +43,84 @@ void RunQnnModelTest(const GetTestModelFn& build_test_case, const ProviderOption helper.feeds_, verification_params); } +void InferenceModel(const std::string& model_data, const char* log_id, + std::unique_ptr execution_provider, + ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds, + std::vector& output_names, std::vector& output_vals) { + SessionOptions so; + so.session_logid = log_id; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + std::string provider_type = kCpuExecutionProvider; + if (execution_provider) { + provider_type = execution_provider->Type(); + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(execution_provider))); + } + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + ASSERT_STATUS_OK(session_object.Initialize()); + + const auto& graph = session_object.GetGraph(); + + auto ep_nodes = CountAssignedNodes(graph, provider_type); + if (expected_ep_assignment == ExpectedEPNodeAssignment::All) { + // Verify the entire graph is assigned to the EP + ASSERT_EQ(ep_nodes, graph.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type; + } else if (expected_ep_assignment == ExpectedEPNodeAssignment::None) { + ASSERT_EQ(ep_nodes, 0) << "No nodes are supposed to be assigned to " << provider_type; + } else { + ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; + } + + const auto& outputs = graph.GetOutputs(); + + // fetch all outputs if necessary. + if (output_names.empty()) { + output_names.reserve(outputs.size()); + for (const auto* node_arg : outputs) { + if (node_arg->Exists()) { + output_names.push_back(node_arg->Name()); + } + } + } + + ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &output_vals)); +} + +NodeArg* MakeTestQDQBiasInput(ModelTestBuilder& builder, const TestInputDef& bias_def, float bias_scale) { + NodeArg* bias_int32 = nullptr; + + // Bias must be int32 to be detected as a QDQ node unit. + // We must quantize the data. 
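+  // Example (hypothetical values): with bias_scale = 0.5f, a float bias element of 3.0f is
+  // stored as static_cast<int32_t>(3.0f / 0.5f) = 6 in the int32 initializer.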
+ if (bias_def.IsRandomData()) { + // Create random initializer def that is quantized to int32 + const auto& rand_info = bias_def.GetRandomDataInfo(); + TestInputDef bias_int32_def(bias_def.GetShape(), bias_def.IsInitializer(), static_cast(rand_info.min / bias_scale), + static_cast(rand_info.max / bias_scale)); + bias_int32 = MakeTestInput(builder, bias_int32_def); + } else { + assert(bias_def.IsRawData()); + // Create raw data initializer def that is quantized to int32 + const auto& bias_f32_raw = bias_def.GetRawData(); + const size_t num_elems = bias_f32_raw.size(); + + std::vector bias_int32_raw(num_elems); + for (size_t i = 0; i < num_elems; i++) { + bias_int32_raw[i] = static_cast(bias_f32_raw[i] / bias_scale); + } + + TestInputDef bias_int32_def(bias_def.GetShape(), bias_def.IsInitializer(), bias_int32_raw); + bias_int32 = MakeTestInput(builder, bias_int32_def); + } + + auto* bias = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(bias_int32, bias_scale, 0, bias); + + return bias; +} + // Mock IKernelLookup class passed to QNN EP's GetCapability() function in order to // determine if the HTP backend is supported on specific platforms (e.g., Windows ARM64). // TODO: Remove once HTP can be emulated on Windows ARM64. diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index 21d34136c7c85..b091177b24ee2 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -5,19 +5,76 @@ #if !defined(ORT_MINIMAL_BUILD) #include +#include #include #include "core/framework/provider_options.h" #include "test/optimizer/qdq_test_utils.h" #include "test/util/include/test_utils.h" +#include "test/util/include/test/test_environment.h" +#include "test/util/include/default_providers.h" #include "gtest/gtest.h" namespace onnxruntime { namespace test { +// Signature for function that builds a float32 model. using GetTestModelFn = std::function; +// Class that stores quantization params (scale, zero point). +// Has a static function that computes quantization parameters from a floating-point range. +template +struct QuantParams { + float scale; + QType zero_point; + + static QuantParams Compute(float rmin, float rmax) { + if (rmin == 0.0f && rmax == 0.0f) { // Quantizing a single zero. + return QuantParams{1.0f, 0}; + } + + if (rmin == rmax) { // One data-point (x) to quantize. + if (rmin < 0) { // new range is [-x , 0.0f] + rmax = 0.0f; + } else { // new range is [0.0f, x] + rmin = 0.0f; + } + } + + constexpr float qmin = static_cast(std::numeric_limits::min()); + constexpr float qmax = static_cast(std::numeric_limits::max()); + + const float scale = (rmax - rmin) / (qmax - qmin); + const QType zero_point = static_cast(std::roundf((qmin - rmin) / scale)); + + return QuantParams{scale, zero_point}; + } +}; + +// Signature for function that builds a QDQ model. +// The parameter `output_qparams` contains quantization parameters that *can* be used for the QDQ model output. +// These output quantization parameters are computed by first running the float32 model and determining the +// range of output values. Note that the function is able to overwrite the output_qparams parameter if necessary +// (Example: MaxPool must have identical input and output quantization params). +template +using GetTestQDQModelFn = std::function>& output_qparams)>; + +// Computes quantization parameters for an array of floating-point values. 
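+// Example: for uint8 data spanning [-2.0f, 6.0f], Compute() yields scale = 8 / 255 (about 0.0314)
+// and zero_point = roundf(2 / 0.0314) = 64.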
+template +inline QuantParams GetDataQuantParams(gsl::span data) { + // Get min/max of raw data. + float min_val = std::numeric_limits::max(); + float max_val = std::numeric_limits::min(); + + for (auto val : data) { + min_val = std::min(min_val, val); + max_val = std::max(max_val, val); + } + + return QuantParams::Compute(min_val, max_val); +} + // Class that defines an input that can be created with ModelTestBuilder. // Defines whether the input is an initializer and if the data should be randomized or if // set to an explicit value. @@ -39,14 +96,18 @@ struct TestInputDef { TestInputDef(std::vector shape, bool is_initializer, T rand_min, T rand_max) : shape_(std::move(shape)), data_info_(RandomData{rand_min, rand_max}), - is_initializer_(is_initializer) {} + is_initializer_(is_initializer), + has_range_override_(false), + range_override_() {} // Create an input definition with explicit data. Specify its shape, whether it's an initializer, // and the raw data. TestInputDef(std::vector shape, bool is_initializer, std::vector data) : shape_(std::move(shape)), data_info_(RawData{std::move(data)}), - is_initializer_(is_initializer) {} + is_initializer_(is_initializer), + has_range_override_(false), + range_override_() {} TestInputDef(TestInputDef&& other) = default; TestInputDef(const TestInputDef& other) = default; @@ -54,6 +115,18 @@ struct TestInputDef { TestInputDef& operator=(const TestInputDef& other) = default; TestInputDef& operator=(TestInputDef&& other) = default; + // Overrides the range of input values reported by TestInputDef::GetRange(). + // This is useful when you want to quantize over a range that is larger or smaller + // than the actual range of the data. + // + // Returns a reference to this object to allow chaining. + TestInputDef& OverrideValueRange(T range_min, T range_max) { + range_override_.first = range_min; + range_override_.second = range_max; + has_range_override_ = true; + return *this; + } + const std::vector& GetShape() const { return shape_; } @@ -78,7 +151,15 @@ struct TestInputDef { return std::get(data_info_).data; } + // Get the range of values represented by this input, which is necessary for computing quantization parameters. + // For raw data, we return [min, max] of the elements. + // For random data, we return [rand_min, rand_max]. + // Optionally, the user can override this range by using OverrideValueRange(). std::pair GetRange() const { + if (has_range_override_) { + return range_override_; + } + auto which_type = data_info_.index(); std::pair range; @@ -105,28 +186,169 @@ struct TestInputDef { std::vector shape_; std::variant data_info_; bool is_initializer_; + bool has_range_override_; + std::pair range_override_; }; template -struct QuantParams { - float scale; - QType zero_point; +inline QuantParams GetTestInputQuantParams(const TestInputDef& input_def) { + const std::pair frange = input_def.GetRange(); + return QuantParams::Compute(frange.first, frange.second); +} - static QuantParams Compute(float rmin, float rmax) { - constexpr float qmin = static_cast(std::numeric_limits::min()); - constexpr float qmax = static_cast(std::numeric_limits::max()); +/** + * Inferences a given serialized model. Returns output values via an out-param. + * + * \param model_data The serialized ONNX model to inference. + * \param log_id The logger ID. + * \param execution_provider The EP on which to run the model. Set to nullptr for CPU EP. + * \param expected_ep_assignment Describes "which nodes" should be assigned to the EP. + * \param feeds The input feeds. 
+ * \param output_names If empty, the function will write the output names.
+ * \param output_vals Initialized to the inference results.
+ */
+void InferenceModel(const std::string& model_data, const char* log_id,
+                    std::unique_ptr execution_provider,
+                    ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
+                    std::vector& output_names, std::vector& output_vals);
+
+/**
+ * Tests the accuracy of a QDQ model on QNN EP by running 3 inferences:
+ *
+ * 1. float model on CPU EP (baseline)
+ * 2. QDQ model on CPU EP
+ * 3. QDQ model on QNN EP
+ *
+ * This function checks that running the QDQ model on QNN EP (#3) is at least as accurate (+- small tolerance)
+ * as running the QDQ model on CPU EP (#2). We primarily measure accuracy by comparing to the baseline (#1).
+ *
+ * \param f32_model_fn Function that builds the float model (baseline for comparison).
+ * \param qdq_model_fn Function that builds the QDQ model (run by CPU EP and QNN EP).
+ * \param qnn_options QNN EP provider options.
+ * \param opset_version The opset version.
+ * \param expected_ep_assignment Describes "which nodes" should be assigned to the EP.
+ * \param fp32_abs_err Small tolerance used for floating-point comparisons.
+ * \param log_severity The logger's severity setting.
+ */
+template
+inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTestQDQModelFn& qdq_model_fn,
+                                 const ProviderOptions& qnn_options, int opset_version,
+                                 ExpectedEPNodeAssignment expected_ep_assignment, float fp32_abs_err,
+                                 logging::Severity log_severity = logging::Severity::kERROR) {
+  // Add kMSDomain to cover contrib ops like Gelu.
+  const std::unordered_map domain_to_version = {{"", opset_version}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(log_severity);
+
+  // Create float model and serialize it to a string.
+  onnxruntime::Model f32_model("f32_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder f32_helper(f32_model.MainGraph());
+  std::string f32_model_data;
+  f32_model_fn(f32_helper);
+  f32_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(f32_model.MainGraph().Resolve());
+  f32_model.ToProto().SerializeToString(&f32_model_data);
+
+  // Run f32 model on CPU EP and collect outputs.
+  std::vector cpu_f32_outputs;
+  std::vector output_names;
+  InferenceModel(f32_model_data, "f32_model_logger", nullptr, ExpectedEPNodeAssignment::All,
+                 f32_helper.feeds_, output_names, cpu_f32_outputs);
+  const size_t num_outputs = cpu_f32_outputs.size();
+
+  // Compute output range(s) and quantization params.
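+  // The raw float outputs provide the [min, max] range from which the QDQ model's output
+  // quantization parameters are derived (via GetDataQuantParams).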
+  std::vector<QuantParams<QType>> output_qparams;
+  std::vector<gsl::span<const float>> output_vals;
+  std::vector<int32_t> output_types;
+  output_qparams.resize(num_outputs);
+  output_vals.resize(num_outputs);
+  output_types.resize(num_outputs);
+
+  for (size_t i = 0; i < num_outputs; i++) {
+    auto& tensor = cpu_f32_outputs[i].Get<Tensor>();
+    int32_t elem_type = tensor.GetElementType();
+
+    if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+      output_vals[i] = tensor.DataAsSpan<float>();
+      output_qparams[i] = GetDataQuantParams<QType>(output_vals[i]);
+    }
 
-    return QuantParams{scale, zero_point};
+    output_types[i] = elem_type;
   }
-};
 
-template <typename QType>
-inline QuantParams<QType> GetTestInputQuantParams(const TestInputDef<float>& input_def) {
-  const std::pair<float, float> frange = input_def.GetRange();
-  return QuantParams<QType>::Compute(frange.first, frange.second);
+  // Create QDQ model and serialize it to a string.
+  onnxruntime::Model qdq_model("qdq_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder qdq_helper(qdq_model.MainGraph());
+  std::string qdq_model_data;
+  qdq_model_fn(qdq_helper, output_qparams);
+  qdq_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(qdq_model.MainGraph().Resolve());
+  qdq_model.ToProto().SerializeToString(&qdq_model_data);
+
+  // Run QDQ model on QNN EP and collect outputs.
+  std::vector<OrtValue> qnn_qdq_outputs;
+  InferenceModel(qdq_model_data, "qdq_model_logger", QnnExecutionProviderWithOptions(qnn_options),
+                 expected_ep_assignment, qdq_helper.feeds_, output_names, qnn_qdq_outputs);
+
+  if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
+    // Run QDQ model on CPU EP and collect outputs.
+    std::vector<OrtValue> cpu_qdq_outputs;
+    InferenceModel(qdq_model_data, "qdq_model_logger", nullptr, ExpectedEPNodeAssignment::All,
+                   qdq_helper.feeds_, output_names, cpu_qdq_outputs);
+    ASSERT_EQ(cpu_qdq_outputs.size(), num_outputs);
+    ASSERT_EQ(qnn_qdq_outputs.size(), num_outputs);
+
+    // Compare accuracy of QDQ results with float model.
+    // QNN EP must be at least as accurate as CPU EP when running the QDQ model.
+    for (size_t i = 0; i < num_outputs; i++) {
+      auto& cpu_qdq_tensor = cpu_qdq_outputs[i].Get<Tensor>();
+      auto& qnn_qdq_tensor = qnn_qdq_outputs[i].Get<Tensor>();
+
+      ASSERT_EQ(cpu_qdq_tensor.GetElementType(), output_types[i]);
+      ASSERT_EQ(qnn_qdq_tensor.GetElementType(), output_types[i]);
+
+      if (output_types[i] == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+        const size_t num_vals = output_vals[i].size();
+        gsl::span<const float> cpu_f32_vals = output_vals[i];
+        gsl::span<const float> cpu_qdq_vals = cpu_qdq_tensor.DataAsSpan<float>();
+        gsl::span<const float> qnn_qdq_vals = qnn_qdq_tensor.DataAsSpan<float>();
+
+        ASSERT_EQ(num_vals, cpu_qdq_vals.size());
+        ASSERT_EQ(num_vals, qnn_qdq_vals.size());
+
+        for (size_t j = 0; j < num_vals; j++) {
+          const float expected_val = cpu_f32_vals[j];  // "ground-truth"
+          const float qnn_qdq_val = qnn_qdq_vals[j];
+          const float cpu_qdq_val = cpu_qdq_vals[j];
+          const float cpu_err = std::fabs(expected_val - cpu_qdq_val);
+          const float qnn_err = std::fabs(expected_val - qnn_qdq_val);
+
+          // Case 1 (qnn_err <= cpu_err): QNN EP is *more* accurate, which makes (qnn_err - cpu_err) zero or
+          // a negative value.
+          // Case 2 (qnn_err > cpu_err): QNN EP is less accurate, but the error difference is within 1
+          // quantization unit (i.e., scale). This can occur due to rounding differences.
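+          // In both cases the assertion below holds; it fails only when QNN EP's error exceeds
+          // CPU EP's error by more than one quantization unit plus the fp32_abs_err tolerance.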
+          const bool is_as_accurate_as_cpu_qdq = (qnn_err - cpu_err) <= (output_qparams[i].scale + fp32_abs_err);
+
+          EXPECT_TRUE(is_as_accurate_as_cpu_qdq)
+              << "Inaccuracy detected for output '"
+              << output_names[i]
+              << "', element " << j
+              << ".\nOutput quant params: scale=" << output_qparams[i].scale
+              << ", zero_point=" << static_cast<int32_t>(output_qparams[i].zero_point)
+              << ".\nExpected val: " << expected_val << "\n"
+              << "QNN QDQ val: " << qnn_qdq_val << " (err " << qnn_err << ")\n"
+              << "CPU QDQ val: " << cpu_qdq_val << " (err " << cpu_err << ")";
+        }
+      } else {
+        VerifyOutput(output_names[i], cpu_f32_outputs[i].Get<Tensor>(), qnn_qdq_tensor, fp32_abs_err);
+      }
+    }
+  }
+}
 
 /**
@@ -164,6 +386,38 @@ inline NodeArg* MakeTestInput(ModelTestBuilder& builder, const TestInputDef&
   return input;
 }
 
+template <>
+inline NodeArg* MakeTestInput(ModelTestBuilder& builder, const TestInputDef<bool>& input_def) {
+  NodeArg* input = nullptr;
+  const auto& shape = input_def.GetShape();
+  const bool is_initializer = input_def.IsInitializer();
+
+  if (input_def.IsRawData()) {  // Raw data.
+    const std::vector<bool>& raw_data = input_def.GetRawData();
+
+    if (is_initializer) {
+      input = builder.MakeInitializerBool(shape, raw_data);
+    } else {
+      input = builder.MakeInput<bool>(shape, raw_data);
+    }
+  } else {  // Random data.
+    if (is_initializer) {
+      input = builder.MakeRandInitializerBool(shape);
+    } else {
+      input = builder.MakeInputBool(shape);
+    }
+  }
+
+  return input;
+}
+
+// The ONNX spec does not allow quantizing float to int32. However, this function creates an int32 input
+// (the bias data divided by the scale) and then returns the output of DequantizeLinear. Note that bias_scale
+// should generally be equal to input_scale * weights_scale. See quantization tool: onnx_quantizer.py::quantize_bias_static()
+//
+// i.e., initial bias => manual quantization (int32) => DQ => final float bias
+NodeArg* MakeTestQDQBiasInput(ModelTestBuilder& builder, const TestInputDef<float>& bias_def, float bias_scale);
+
 /**
  * Runs a test model on the QNN EP. Checks the graph node assignment, and that inference
  * outputs for QNN and CPU match.
diff --git a/onnxruntime/test/providers/qnn/reduce_op_cpu_test.cc b/onnxruntime/test/providers/qnn/reduce_op_cpu_test.cc
deleted file mode 100644
index c854d2e5dc5e7..0000000000000
--- a/onnxruntime/test/providers/qnn/reduce_op_cpu_test.cc
+++ /dev/null
@@ -1,225 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#if !defined(ORT_MINIMAL_BUILD)
-
-#include 
-#include 
-
-#include "test/optimizer/qdq_test_utils.h"
-#include "test/providers/qnn/qnn_test_utils.h"
-
-#include "gtest/gtest.h"
-
-namespace onnxruntime {
-namespace test {
-
-/**
- * Creates a graph with a single reduce operator (e.g., ReduceSum, ReduceMin, etc.). Reduce operators take the
- * axes of reduction as either a node attribute or an optional input (depending on opset).
- *
- * \param reduce_op_type The string denoting the reduce operator's type (e.g., "ReduceSum").
- * \param input_shape The shape of the input. Input data is randomly generated with this shape.
- * \param axes_as_input True if the "axes" are specified as a node input.
- * \param axes The axes of reduction.
- * \param keepdims True if the output's rank should match the input. This is a node attribute that defaults to true.
- * \param noop_with_empty_axes True if empty axes should force the node to act as a NoOp (no operation).
- *                             This is a node attribute that defaults to false.
- * \param domain The domain to assign to the graph node.
- * - * \return A function that builds the graph with the provided builder. - */ -template -static GetTestModelFn BuildReduceOpTestCase(const std::string& reduce_op_type, - const std::vector& input_shape, - bool axes_as_input, std::vector axes, bool keepdims, - bool noop_with_empty_axes) { - return [reduce_op_type, input_shape, axes_as_input, axes, keepdims, - noop_with_empty_axes](ModelTestBuilder& builder) { - std::vector input_args; - - // Input data arg - input_args.push_back(builder.MakeInput(input_shape, static_cast(0), - static_cast(20))); - - // Axes input (initializer) for newer opsets. - if (axes_as_input) { - input_args.push_back(builder.MakeInitializer({static_cast(axes.size())}, axes)); - } - - auto* reduce_sum_output = builder.MakeOutput(); - Node& reduce_sum_node = builder.AddNode(reduce_op_type, input_args, {reduce_sum_output}); - reduce_sum_node.AddAttribute("keepdims", static_cast(keepdims)); - - // Older opsets have "axes" as a node attribute. - if (!axes_as_input) { - reduce_sum_node.AddAttribute("axes", axes); - } else { - reduce_sum_node.AddAttribute("noop_with_empty_axes", static_cast(noop_with_empty_axes)); - } - }; -} - -/** - * Runs a ReduceOp model on the QNN CPU backend. Checks the graph node assignment, and that inference - * outputs for QNN and CPU match. - * - * \param op_type The ReduceOp type (e.g., ReduceSum). - * \param opset The opset version. Some opset versions have "axes" as an attribute or input. - * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None) - * \param keepdims Common attribute for all reduce operations. - */ -template -static void RunReduceOpCpuTest(const std::string& op_type, int opset, - ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All, - bool keepdims = true) { - ProviderOptions provider_options; -#if defined(_WIN32) - provider_options["backend_path"] = "QnnCpu.dll"; -#else - provider_options["backend_path"] = "libQnnCpu.so"; -#endif - - RunQnnModelTest(BuildReduceOpTestCase(op_type, - {2, 2}, // input shape - ReduceOpHasAxesInput(op_type, opset), - {0, 1}, // axes - keepdims, - false), // noop_with_empty_axes - provider_options, - opset, - expected_ep_assignment); -} - -// -// ReduceSum -// - -// Test creates a graph with a ReduceSum node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is int32. -// - Uses opset 13, which has "axes" as an input. -TEST_F(QnnCPUBackendTests, TestInt32ReduceSumOpset13) { - RunReduceOpCpuTest("ReduceSum", 13); -} - -// Test creates a graph with a ReduceSum node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is int32. -// - Uses opset 11, which has "axes" as an attribute. -TEST_F(QnnCPUBackendTests, TestInt32ReduceSumOpset11) { - RunReduceOpCpuTest("ReduceSum", 11); -} - -// Test creates a graph with a ReduceSum node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 13, which has "axes" as an input. 
-TEST_F(QnnCPUBackendTests, TestFloatReduceSumOpset13) { - RunReduceOpCpuTest("ReduceSum", 13); -} - -// Test creates a graph with a ReduceSum node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 11, which has "axes" as an attribute. -TEST_F(QnnCPUBackendTests, TestFloatReduceSumOpset11) { - RunReduceOpCpuTest("ReduceSum", 11); -} - -// -// ReduceProd -// - -// Test creates a graph with a ReduceProd node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnCPUBackendTests, TestReduceProdOpset18) { - RunReduceOpCpuTest("ReduceProd", 18); -} - -// Test creates a graph with a ReduceProd node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnCPUBackendTests, TestReduceProdOpset13) { - RunReduceOpCpuTest("ReduceProd", 13); -} - -// -// ReduceMax -// - -// Test creates a graph with a ReduceMax node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnCPUBackendTests, TestReduceMaxOpset18) { - RunReduceOpCpuTest("ReduceMax", 18); -} - -// Test creates a graph with a ReduceMax node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnCPUBackendTests, TestReduceMaxOpset13) { - RunReduceOpCpuTest("ReduceMax", 13); -} - -// -// ReduceMin -// - -// Test creates a graph with a ReduceMin node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnCPUBackendTests, TestReduceMinOpset18) { - RunReduceOpCpuTest("ReduceMin", 18); -} - -// Test creates a graph with a ReduceMin node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnCPUBackendTests, TestReduceMinOpset13) { - RunReduceOpCpuTest("ReduceMin", 13); -} - -// -// ReduceMean -// - -// Test creates a graph with a ReduceMean node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnCPUBackendTests, TestReduceMeanOpset18) { - RunReduceOpCpuTest("ReduceMean", 18); -} - -// Test creates a graph with a ReduceMean node, and checks that all -// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. -// -// - The input and output data type is float. -// - Uses opset 13, which has "axes" as an attribute. 
-TEST_F(QnnCPUBackendTests, TestReduceMeanOpset13) { - RunReduceOpCpuTest("ReduceMean", 13); -} - -} // namespace test -} // namespace onnxruntime - -#endif // !defined(ORT_MINIMAL_BUILD) \ No newline at end of file diff --git a/onnxruntime/test/providers/qnn/reduce_op_htp_test.cc b/onnxruntime/test/providers/qnn/reduce_op_htp_test.cc deleted file mode 100644 index 86b319eea0b14..0000000000000 --- a/onnxruntime/test/providers/qnn/reduce_op_htp_test.cc +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#if !defined(ORT_MINIMAL_BUILD) - -#include -#include "core/graph/graph.h" - -#include "test/optimizer/qdq_test_utils.h" -#include "test/providers/qnn/qnn_test_utils.h" - -#include "gtest/gtest.h" - -namespace onnxruntime { -namespace test { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) - -// Creates the following graph if axes is an input (newer opsets): -// _______________________ -// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) -// axes (int32, initializer) -> | Reduce___ | -// |_______________________| -// -// Creates the following graph if axes is an attribute (older opsets): -// _______________________ -// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) -// | Reduce___ | -// |_______________________| -// -template -GetTestModelFn BuildQDQReduceOpTestCase(const std::string& reduce_op_type, const std::vector& input_shape, - bool axes_as_input, const std::vector& axes, bool keepdims, - bool noop_with_empty_axes) { - return [reduce_op_type, input_shape, axes_as_input, axes, keepdims, - noop_with_empty_axes](ModelTestBuilder& builder) { - using QuantTypeLimits = std::numeric_limits; - QuantType input_quant_min_value = QuantTypeLimits::min(); - QuantType input_quant_max_value = QuantTypeLimits::max(); - - auto* input_data = builder.MakeInput(input_shape, -100.0f, 100.0f); - auto* final_output = builder.MakeOutput(); - - // input_data -> Q/DQ -> - auto* input_qdq_output = AddQDQNodePair(builder, input_data, .04f, - (input_quant_min_value + input_quant_max_value) / 2 + 1); - - // -> ReduceOp (e.g., ReduceSum) -> - std::vector reduce_op_inputs; - reduce_op_inputs.push_back(input_qdq_output); - - if (axes_as_input) { - reduce_op_inputs.push_back(builder.MakeInitializer({static_cast(axes.size())}, axes)); - } - - auto* reduce_sum_output = builder.MakeIntermediate(); - Node& reduce_sum_node = builder.AddNode(reduce_op_type, reduce_op_inputs, {reduce_sum_output}); - reduce_sum_node.AddAttribute("keepdims", static_cast(keepdims)); - - if (axes_as_input) { - reduce_sum_node.AddAttribute("noop_with_empty_axes", static_cast(noop_with_empty_axes)); - } else { - reduce_sum_node.AddAttribute("axes", axes); - } - - // -> Q/DQ -> final_output - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(reduce_sum_output, .039f, - (QuantTypeLimits::min() + QuantTypeLimits::max()) / 2 + 1, - q_output); - - builder.AddDequantizeLinearNode(q_output, .039f, - (QuantTypeLimits::min() + QuantTypeLimits::max()) / 2 + 1, - final_output); - }; -} - -/** - * Runs a ReduceOp model on the QNN HTP backend. Checks the graph node assignment, and that inference - * outputs for QNN and CPU match. - * - * \param op_type The ReduceOp type (e.g., ReduceSum). - * \param opset The opset version. Some opset versions have "axes" as an attribute or input. 
- * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None) - * \param keepdims Common attribute for all reduce operations. - */ -template -static void RunReduceOpQDQTest(const std::string& op_type, int opset, const std::vector& input_shape, - const std::vector& axes, - ExpectedEPNodeAssignment expected_ep_assignment = ExpectedEPNodeAssignment::All, - bool keepdims = true) { - ProviderOptions provider_options; -#if defined(_WIN32) - provider_options["backend_path"] = "QnnHtp.dll"; -#else - provider_options["backend_path"] = "libQnnHtp.so"; -#endif - - // If QNN EP can support all ops, then we expect a single fused node in the graph. - // Otherwise, we'll get a graph with 5 individual nodes handled by CPU EP. - constexpr bool noop_with_empty_axes = false; - RunQnnModelTest(BuildQDQReduceOpTestCase(op_type, - input_shape, - ReduceOpHasAxesInput(op_type, opset), // New opset changed axes to input. - axes, - keepdims, - noop_with_empty_axes), - provider_options, - opset, - expected_ep_assignment); -} - -// -// ReduceSum -// - -// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 13, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceSumU8Opset13) { - RunReduceOpQDQTest("ReduceSum", 13, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 11, which has "axes" as an attribute. -TEST_F(QnnHTPBackendTests, TestQDQReduceSumU8Opset11) { - RunReduceOpQDQTest("ReduceSum", 11, {1, 3, 4, 4}, {0, 1, 2, 3}); -} - -// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses int8 as the quantization type. -// - Uses opset 13, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceSumS8Opset13) { - RunReduceOpQDQTest("ReduceSum", 13, {2, 2}, {0, 1}); -} - -// Tests that keepdims = false generates expected results. -TEST_F(QnnHTPBackendTests, TestQDQReduceSumS8Opset13_NoKeepDims) { - RunReduceOpQDQTest("ReduceSum", 13, {2, 2}, {1}, ExpectedEPNodeAssignment::All, false); -} - -// Test that we don't support rank 5 Reduce ops. -TEST_F(QnnHTPBackendTests, TestQDQReduceSumS8Opset13_Rank5Unsupported) { - RunReduceOpQDQTest("ReduceSum", 13, {1, 3, 4, 4, 2}, {0, 1, 2, 3, 4}, ExpectedEPNodeAssignment::None); -} - -// -// ReduceMax -// - -// ReduceMax on Linux's HTP emulator is always off by an amount equal to the final DQ.scale -// Works fine on windows arm64. -#if !defined(__linux__) -// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceMaxU8Opset18) { - RunReduceOpQDQTest("ReduceMax", 18, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. 
-// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnHTPBackendTests, TestQDQReduceMaxU8Opset13) { - RunReduceOpQDQTest("ReduceMax", 13, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses int8 as the quantization type. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceMaxS8Opset18) { - RunReduceOpQDQTest("ReduceMax", 18, {2, 2}, {0, 1}); -} -#endif // !defined(__linux__) - -// -// ReduceMin -// -// ReduceMin on Linux's HTP emulator is always off by an amount equal to the final DQ.scale -// Works fine on windows arm64. -#if !defined(__linux__) -// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceMinU8Opset18) { - RunReduceOpQDQTest("ReduceMin", 18, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnHTPBackendTests, TestQDQReduceMinU8Opset13) { - RunReduceOpQDQTest("ReduceMin", 13, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// Uses int8 as the quantization type. -TEST_F(QnnHTPBackendTests, TestQDQReduceMinS8Opset18) { - RunReduceOpQDQTest("ReduceMin", 18, {2, 2}, {0, 1}); -} -#endif // !defined(__linux__) - -// -// ReduceMean -// - -// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 18, which has "axes" as an input. -TEST_F(QnnHTPBackendTests, TestQDQReduceMeanU8Opset18) { - RunReduceOpQDQTest("ReduceMean", 18, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses uint8 as the quantization type. -// - Uses opset 13, which has "axes" as an attribute. -TEST_F(QnnHTPBackendTests, TestQDQReduceMeanU8Opset13) { - RunReduceOpQDQTest("ReduceMean", 13, {2, 2}, {0, 1}); -} - -// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all -// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. -// -// - Uses int8 as the quantization type. -// - Uses opset 18, which has "axes" as an input. 
-TEST_F(QnnHTPBackendTests, TestQDQReduceMeanS8Opset18) { - RunReduceOpQDQTest("ReduceMean", 18, {2, 2}, {0, 1}); -} - -#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) -} // namespace test -} // namespace onnxruntime - -#endif \ No newline at end of file diff --git a/onnxruntime/test/providers/qnn/reduce_op_test.cc b/onnxruntime/test/providers/qnn/reduce_op_test.cc new file mode 100644 index 0000000000000..e0357de3e52f1 --- /dev/null +++ b/onnxruntime/test/providers/qnn/reduce_op_test.cc @@ -0,0 +1,618 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include "core/graph/graph.h" + +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +/** + * Creates a graph with a single reduce operator (e.g., ReduceSum, ReduceMin, etc.). Reduce operators take the + * axes of reduction as either a node attribute or an optional input (depending on opset). + * + * \param reduce_op_type The string denoting the reduce operator's type (e.g., "ReduceSum"). + * \param input_def The input definition (shape, data, etc.) + * \param axes_as_input True if the "axes" are specified as a node input. + * \param axes The axes of reduction. + * \param keepdims True if the output's rank should match the input. This is a node attribute that defaults to true. + * \param noop_with_empty_axes True if empty axes should force the node to act as a NoOp (no operation). + * This is a node attribute that defaults to false. + * \param domain The domain to assign to the graph node. + * + * \return A function that builds the graph with the provided builder. + */ +template +static GetTestModelFn BuildReduceOpTestCase(const std::string& reduce_op_type, + const TestInputDef& input_def, + bool axes_as_input, std::vector axes, bool keepdims, + bool noop_with_empty_axes) { + return [reduce_op_type, input_def, axes_as_input, axes, keepdims, + noop_with_empty_axes](ModelTestBuilder& builder) { + std::vector input_args; + + // Input data arg + input_args.push_back(MakeTestInput(builder, input_def)); + + // Axes input (initializer) for newer opsets. + if (axes_as_input) { + input_args.push_back(builder.MakeInitializer({static_cast(axes.size())}, axes)); + } + + auto* reduce_sum_output = builder.MakeOutput(); + Node& reduce_sum_node = builder.AddNode(reduce_op_type, input_args, {reduce_sum_output}); + reduce_sum_node.AddAttribute("keepdims", static_cast(keepdims)); + + // Older opsets have "axes" as a node attribute. + if (!axes_as_input) { + reduce_sum_node.AddAttribute("axes", axes); + } else { + reduce_sum_node.AddAttribute("noop_with_empty_axes", static_cast(noop_with_empty_axes)); + } + }; +} + +/** + * Runs a ReduceOp model on the QNN CPU backend. Checks the graph node assignment, and that inference + * outputs for QNN and CPU match. + * + * \param op_type The ReduceOp type (e.g., ReduceSum). + * \param input_def The input definition (shape, data, etc.) + * \param axes The axes of reduction. + * \param opset The opset version. Some opset versions have "axes" as an attribute or input. + * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None) + * \param keepdims Common attribute for all reduce operations. 
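+ *
+ * Example (mirroring the ReduceSum tests below; the values are illustrative only):
+ *   RunReduceOpCpuTest("ReduceSum",
+ *                      TestInputDef({2, 2}, false, -10.0f, 10.0f),
+ *                      std::vector{0, 1},
+ *                      true, 13, ExpectedEPNodeAssignment::All);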
+ */
+template
+static void RunReduceOpCpuTest(const std::string& op_type,
+                               const TestInputDef& input_def,
+                               const std::vector& axes,
+                               bool keepdims,
+                               int opset,
+                               ExpectedEPNodeAssignment expected_ep_assignment) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildReduceOpTestCase(op_type,
+                                        input_def,
+                                        ReduceOpHasAxesInput(op_type, opset),
+                                        axes,
+                                        keepdims,
+                                        false),  // noop_with_empty_axes
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// ReduceSum
+//
+
+// Test creates a graph with a ReduceSum node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is int32.
+// - Uses opset 13, which has "axes" as an input.
+TEST_F(QnnCPUBackendTests, ReduceSumOpset13_Int32) {
+  RunReduceOpCpuTest("ReduceSum",
+                     TestInputDef({2, 2}, false, -10.0f, 10.0f),
+                     std::vector{0, 1},
+                     true,  // keepdims
+                     13,
+                     ExpectedEPNodeAssignment::All);
+}
+
+// Test creates a graph with a ReduceSum node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is int32.
+// - Uses opset 11, which has "axes" as an attribute.
+TEST_F(QnnCPUBackendTests, ReduceSumOpset11_Int32) {
+  RunReduceOpCpuTest("ReduceSum",
+                     TestInputDef({2, 2}, false, -10.0f, 10.0f),
+                     std::vector{0, 1},
+                     true,  // keepdims
+                     11,
+                     ExpectedEPNodeAssignment::All);
+}
+
+// Test creates a graph with a ReduceSum node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is float.
+// - Uses opset 13, which has "axes" as an input.
+TEST_F(QnnCPUBackendTests, ReduceSumOpset13_Float) {
+  RunReduceOpCpuTest("ReduceSum",
+                     TestInputDef({2, 2}, false, -10.0f, 10.0f),
+                     std::vector{0, 1},
+                     true,  // keepdims
+                     13,
+                     ExpectedEPNodeAssignment::All);
+}
+
+// Test creates a graph with a ReduceSum node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is float.
+// - Uses opset 11, which has "axes" as an attribute.
+TEST_F(QnnCPUBackendTests, ReduceSumOpset11_Float) {
+  RunReduceOpCpuTest("ReduceSum",
+                     TestInputDef({2, 2}, false, -10.0f, 10.0f),
+                     std::vector{0, 1},
+                     true,  // keepdims
+                     11,
+                     ExpectedEPNodeAssignment::All);
+}
+
+//
+// ReduceProd
+//
+
+// Test creates a graph with a ReduceProd node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is float.
+// - Uses opset 18, which has "axes" as an input.
+TEST_F(QnnCPUBackendTests, ReduceProdOpset18) {
+  RunReduceOpCpuTest("ReduceProd",
+                     TestInputDef({2, 2}, false, -10.0f, 10.0f),
+                     std::vector{0, 1},
+                     true,  // keepdims
+                     18,
+                     ExpectedEPNodeAssignment::All);
+}
+
+// Test creates a graph with a ReduceProd node, and checks that all
+// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results.
+//
+// - The input and output data type is float.
+// - Uses opset 13, which has "axes" as an attribute.
+TEST_F(QnnCPUBackendTests, ReduceProdOpset13) { + RunReduceOpCpuTest("ReduceProd", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 13, + ExpectedEPNodeAssignment::All); +} + +// +// ReduceMax +// + +// Test creates a graph with a ReduceMax node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnCPUBackendTests, ReduceMaxOpset18) { + RunReduceOpCpuTest("ReduceMax", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 18, + ExpectedEPNodeAssignment::All); +} + +// Test creates a graph with a ReduceMax node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 13, which has "axes" as an attribute. +TEST_F(QnnCPUBackendTests, ReduceMaxOpset13) { + RunReduceOpCpuTest("ReduceMax", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 13, + ExpectedEPNodeAssignment::All); +} + +// +// ReduceMin +// + +// Test creates a graph with a ReduceMin node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnCPUBackendTests, ReduceMinOpset18) { + RunReduceOpCpuTest("ReduceMin", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 18, + ExpectedEPNodeAssignment::All); +} + +// Test creates a graph with a ReduceMin node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 13, which has "axes" as an attribute. +TEST_F(QnnCPUBackendTests, ReduceMinOpset13) { + RunReduceOpCpuTest("ReduceMin", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 13, + ExpectedEPNodeAssignment::All); +} + +// +// ReduceMean +// + +// Test creates a graph with a ReduceMean node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnCPUBackendTests, ReduceMeanOpset18) { + RunReduceOpCpuTest("ReduceMean", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 18, + ExpectedEPNodeAssignment::All); +} + +// Test creates a graph with a ReduceMean node, and checks that all +// nodes are supported by the QNN EP (cpu backend), and that the inference results match the CPU EP results. +// +// - The input and output data type is float. +// - Uses opset 13, which has "axes" as an attribute. 
+TEST_F(QnnCPUBackendTests, ReduceMeanOpset13) { + RunReduceOpCpuTest("ReduceMean", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 13, + ExpectedEPNodeAssignment::All); +} + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +// Creates the following graph if axes is an input (newer opsets): +// _______________________ +// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) +// axes (int32, initializer) -> | Reduce___ | +// |_______________________| +// +// Creates the following graph if axes is an attribute (older opsets): +// _______________________ +// input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) +// | Reduce___ | +// |_______________________| +// +template +GetTestQDQModelFn BuildQDQReduceOpTestCase(const std::string& reduce_op_type, + const TestInputDef& input_def, + bool axes_as_input, const std::vector& axes, bool keepdims, + bool noop_with_empty_axes) { + return [reduce_op_type, input_def, axes_as_input, axes, keepdims, + noop_with_empty_axes](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input -> Q -> DQ -> + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + auto* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); + + // -> ReduceOp (e.g., ReduceSum) -> + std::vector reduce_op_inputs; + reduce_op_inputs.push_back(input_qdq); + + if (axes_as_input) { + reduce_op_inputs.push_back(builder.MakeInitializer({static_cast(axes.size())}, axes)); + } + + auto* op_output = builder.MakeIntermediate(); + Node& reduce_sum_node = builder.AddNode(reduce_op_type, reduce_op_inputs, {op_output}); + reduce_sum_node.AddAttribute("keepdims", static_cast(keepdims)); + + if (axes_as_input) { + reduce_sum_node.AddAttribute("noop_with_empty_axes", static_cast(noop_with_empty_axes)); + } else { + reduce_sum_node.AddAttribute("axes", axes); + } + + // -> Q -> DQ -> final output + AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, output_qparams[0].zero_point); + }; +} + +/** + * Runs a ReduceOp model on the QNN HTP backend. Checks the graph node assignment, and that inference + * outputs for QNN and CPU match. + * + * \param op_type The ReduceOp type (e.g., ReduceSum). + * \param input_def The input definition (shape, data, etc.). + * \param axes The axes input (or attribute). + * \param keepdims Common attribute for all reduce operations. + * \param opset The opset version. Some opset versions have "axes" as an attribute or input. + * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None) + */ +template +static void RunReduceOpQDQTest(const std::string& op_type, + const TestInputDef& input_def, + const std::vector& axes, + bool keepdims, + int opset, + ExpectedEPNodeAssignment expected_ep_assignment) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + constexpr bool noop_with_empty_axes = false; + const bool axes_as_input = ReduceOpHasAxesInput(op_type, opset); // Later opsets have "axes" as an input. 
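+  // TestQDQModelAccuracy (see qnn_test_utils.h) runs the float model on CPU EP as a baseline, runs the
+  // QDQ model on both CPU EP and QNN EP, and then checks that QNN EP stays at least as close to the
+  // baseline as CPU EP does (within the output's quantization scale plus the fp32_abs_err tolerance).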
+ + TestQDQModelAccuracy(BuildReduceOpTestCase(op_type, input_def, axes_as_input, axes, keepdims, + noop_with_empty_axes), + BuildQDQReduceOpTestCase(op_type, input_def, axes_as_input, axes, keepdims, + noop_with_empty_axes), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); +} + +// +// ReduceSum +// + +// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 13, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceSumU8Opset13) { + RunReduceOpQDQTest("ReduceSum", + TestInputDef({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// TODO: Investigate inaccuracy +// Input values: 3.21289 -5.9981 -1.72799 6.27263 +// Input quantization params [-10, 10]: scale=0.0784313753, zero_point=127 +// +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.0068997270427644253, zero_point=0. +// Expected val: 1.7594304084777832 +// QNN QDQ val: 1.731831431388855 (err 0.027598977088928223) +// CPU QDQ val: 1.7594304084777832 (err 0) +TEST_F(QnnHTPBackendTests, DISABLED_ReduceSumU8Opset13_Inaccurate) { + const std::vector input_data = {3.21289f, -5.9981f, -1.72799f, 6.27263f}; + RunReduceOpQDQTest("ReduceSum", + TestInputDef({2, 2}, false, input_data).OverrideValueRange(-10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} +// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 11, which has "axes" as an attribute. +TEST_F(QnnHTPBackendTests, ReduceSumU8Opset11) { + RunReduceOpQDQTest("ReduceSum", + TestInputDef({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}), + {0, 1}, // axes + true, // keepdims + 11, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses int8 as the quantization type. +// - Uses opset 13, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceSumS8Opset13) { + RunReduceOpQDQTest("ReduceSum", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// Tests that keepdims = false generates expected results. +TEST_F(QnnHTPBackendTests, ReduceSumS8Opset13_NoKeepDims) { + RunReduceOpQDQTest("ReduceSum", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {1}, // axes + false, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// Test that we don't support rank 5 Reduce ops. +TEST_F(QnnHTPBackendTests, ReduceSumS8Opset13_Rank5Unsupported) { + RunReduceOpQDQTest("ReduceSum", + TestInputDef({1, 3, 4, 4, 2}, false, -10.0f, 10.0f), + {0, 1, 2, 3, 4}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::None); +} + +// +// ReduceMax +// + +// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 18, which has "axes" as an input. 
+TEST_F(QnnHTPBackendTests, ReduceMaxU8Opset18) { + RunReduceOpQDQTest("ReduceMax", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 13, which has "axes" as an attribute. +TEST_F(QnnHTPBackendTests, ReduceMaxU8Opset13) { + RunReduceOpQDQTest("ReduceMax", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMax -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses int8 as the quantization type. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceMaxS8Opset18) { + RunReduceOpQDQTest("ReduceMax", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// +// ReduceMin +// + +// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceMinU8Opset18) { + RunReduceOpQDQTest("ReduceMin", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 13, which has "axes" as an attribute. +TEST_F(QnnHTPBackendTests, ReduceMinU8Opset13) { + RunReduceOpQDQTest("ReduceMin", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMin -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// Uses int8 as the quantization type. +TEST_F(QnnHTPBackendTests, ReduceMinS8Opset18) { + RunReduceOpQDQTest("ReduceMin", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// +// ReduceMean +// + +// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceMeanU8Opset18) { + RunReduceOpQDQTest("ReduceMean", + TestInputDef({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// TODO: Investigate inaccuracy +// Input values: 3.21289 -5.9981 -1.72799 6.27263 +// Input quantization params [-10, 10]: scale=0.0784313753, zero_point=127 +// +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.0017249317606911063, zero_point=0. 
+// Expected val: 0.4398576021194458 +// QNN QDQ val: 0.43295785784721375 (err 0.0068997442722320557) +// CPU QDQ val: 0.4398576021194458 (err 0) +TEST_F(QnnHTPBackendTests, DISABLED_ReduceMeanU8Opset18_Inaccurate) { + const std::vector input_data = {3.21289f, -5.9981f, -1.72799f, 6.27263f}; + RunReduceOpQDQTest("ReduceMean", + TestInputDef({2, 2}, false, input_data).OverrideValueRange(-10.0f, 10.0f), + {0, 1}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses uint8 as the quantization type. +// - Uses opset 13, which has "axes" as an attribute. +TEST_F(QnnHTPBackendTests, ReduceMeanU8Opset13) { + RunReduceOpQDQTest("ReduceMean", + TestInputDef({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}), + {0, 1}, // axes + true, // keepdims + 13, // opset + ExpectedEPNodeAssignment::All); +} + +// Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results match the CPU EP results. +// +// - Uses int8 as the quantization type. +// - Uses opset 18, which has "axes" as an input. +TEST_F(QnnHTPBackendTests, ReduceMeanS8Opset18) { + RunReduceOpQDQTest("ReduceMean", + TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + {0, 1, 2, 3}, // axes + true, // keepdims + 18, // opset + ExpectedEPNodeAssignment::All); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +} // namespace test +} // namespace onnxruntime + +#endif \ No newline at end of file diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index c5913ad3db5b8..1d900a41b1331 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -27,18 +27,18 @@ namespace test { * * \return A function that builds the graph with the provided builder. 
*/ -static GetTestModelFn BuildResizeTestCase(const std::vector& shape, - const std::vector& sizes_data, - const std::string& mode = "nearest", - const std::string& coordinate_transformation_mode = "half_pixel", - const std::string& nearest_mode = "round_prefer_floor") { - return [shape, sizes_data, mode, coordinate_transformation_mode, nearest_mode](ModelTestBuilder& builder) { - auto* input = builder.MakeInput(shape, 0.0f, 20.0f); - auto* roi = builder.MakeInitializer({0}, {}); - auto* scales = builder.MakeInitializer({0}, {}); - auto* sizes = builder.Make1DInitializer(sizes_data); - - auto* output = builder.MakeOutput(); +static GetTestModelFn GetResizeModelBuilder(const TestInputDef& input_def, + const std::vector& sizes_data, + const std::string& mode = "nearest", + const std::string& coordinate_transformation_mode = "half_pixel", + const std::string& nearest_mode = "round_prefer_floor") { + return [input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode](ModelTestBuilder& builder) { + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* roi = builder.MakeInitializer({0}, {}); + NodeArg* scales = builder.MakeInitializer({0}, {}); + NodeArg* sizes = builder.Make1DInitializer(sizes_data); + + NodeArg* output = builder.MakeOutput(); Node& resize_node = builder.AddNode("Resize", {input, roi, scales, sizes}, {output}); resize_node.AddAttribute("mode", mode); resize_node.AddAttribute("coordinate_transformation_mode", coordinate_transformation_mode); @@ -49,17 +49,17 @@ static GetTestModelFn BuildResizeTestCase(const std::vector& shape, }; } -static GetTestModelFn BuildResizeTestCaseWithScales(const std::vector& shape, - const std::vector& scales_data, - const std::string& mode = "nearest", - const std::string& coordinate_transformation_mode = "half_pixel", - const std::string& nearest_mode = "round_prefer_floor") { - return [shape, scales_data, mode, coordinate_transformation_mode, nearest_mode](ModelTestBuilder& builder) { - auto* input = builder.MakeInput(shape, 0.0f, 20.0f); - auto* roi = builder.MakeInitializer({0}, {}); - auto* scales = builder.Make1DInitializer(scales_data); - - auto* output = builder.MakeOutput(); +static GetTestModelFn GetResizeModelBuilderWithScales(const TestInputDef& input_def, + const std::vector& scales_data, + const std::string& mode = "nearest", + const std::string& coordinate_transformation_mode = "half_pixel", + const std::string& nearest_mode = "round_prefer_floor") { + return [input_def, scales_data, mode, coordinate_transformation_mode, nearest_mode](ModelTestBuilder& builder) { + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* roi = builder.MakeInitializer({0}, {}); + NodeArg* scales = builder.Make1DInitializer(scales_data); + + NodeArg* output = builder.MakeOutput(); Node& resize_node = builder.AddNode("Resize", {input, roi, scales}, {output}); resize_node.AddAttribute("mode", mode); resize_node.AddAttribute("coordinate_transformation_mode", coordinate_transformation_mode); @@ -70,11 +70,45 @@ static GetTestModelFn BuildResizeTestCaseWithScales(const std::vector& }; } +template +static GetTestQDQModelFn GetQDQResizeModelBuilder(const TestInputDef& input_def, + const std::vector& sizes_data, + const std::string& mode = "nearest", + const std::string& coordinate_transformation_mode = "half_pixel", + const std::string& nearest_mode = "round_prefer_floor") { + return [input_def, sizes_data, mode, + coordinate_transformation_mode, nearest_mode](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input 
+    //   -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams input_qparams = GetTestInputQuantParams(input_def);
+    NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point);
+
+    NodeArg* roi = builder.MakeInitializer({0}, {});
+    NodeArg* scales = builder.MakeInitializer({0}, {});
+    NodeArg* sizes = builder.Make1DInitializer(sizes_data);
+
+    NodeArg* resize_output = builder.MakeIntermediate();
+    Node& resize_node = builder.AddNode("Resize", {input_qdq, roi, scales, sizes}, {resize_output});
+    resize_node.AddAttribute("mode", mode);
+    resize_node.AddAttribute("coordinate_transformation_mode", coordinate_transformation_mode);
+
+    if (mode == "nearest") {
+      resize_node.AddAttribute("nearest_mode", nearest_mode);
+    }
+
+    // Resize requires the output quantization parameters to match the input.
+    output_qparams[0] = input_qparams;
+    AddQDQNodePairWithOutputAsGraphOutput(builder, resize_output, output_qparams[0].scale,
+                                          output_qparams[0].zero_point);
+  };
+}
+
 /**
  * Runs a Resize model on the QNN CPU backend. Checks the graph node assignment, and that inference
  * outputs for QNN and CPU match.
  *
- * \param shape The shape of the input and output. Input data is randomly generated with this shape.
+ * \param input_def The input definition (shape, data, etc).
  * \param sizes_data The sizes input which determines the output shape.
  * \param mode The resize mode (e.g., nearest, linear).
  * \param coordinate_transformation_mode The coordinate transformation mode (e.g., half_pixel, pytorch_half_pixel).
@@ -82,7 +116,7 @@ static GetTestModelFn BuildResizeTestCaseWithScales(const std::vector&
  * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None).
  * \param opset The opset version to use.
*/ -static void RunCPUResizeOpTest(const std::vector& shape, const std::vector& sizes_data, +static void RunCPUResizeOpTest(const TestInputDef& input_def, const std::vector& sizes_data, const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, ExpectedEPNodeAssignment expected_ep_assignment, @@ -94,13 +128,13 @@ static void RunCPUResizeOpTest(const std::vector& shape, const std::vec provider_options["backend_path"] = "libQnnCpu.so"; #endif - RunQnnModelTest(BuildResizeTestCase(shape, sizes_data, mode, coordinate_transformation_mode, nearest_mode), + RunQnnModelTest(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode), provider_options, opset, expected_ep_assignment); } -static void RunCPUResizeOpTestWithScales(const std::vector& shape, const std::vector& scales_data, +static void RunCPUResizeOpTestWithScales(const TestInputDef& input_def, const std::vector& scales_data, const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, ExpectedEPNodeAssignment expected_ep_assignment, @@ -112,17 +146,18 @@ static void RunCPUResizeOpTestWithScales(const std::vector& shape, cons provider_options["backend_path"] = "libQnnCpu.so"; #endif - RunQnnModelTest(BuildResizeTestCaseWithScales(shape, scales_data, mode, coordinate_transformation_mode, nearest_mode), + RunQnnModelTest(GetResizeModelBuilderWithScales(input_def, scales_data, mode, coordinate_transformation_mode, nearest_mode), provider_options, opset, expected_ep_assignment); } template -static void RunQDQResizeOpTest(const std::vector& shape, const std::vector& sizes_data, +static void RunQDQResizeOpTest(const TestInputDef& input_def, + const std::vector& sizes_data, const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, - ExpectedEPNodeAssignment expected_ep_assignment, float fp32_abs_err) { + ExpectedEPNodeAssignment expected_ep_assignment) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -130,12 +165,13 @@ static void RunQDQResizeOpTest(const std::vector& shape, const std::vec provider_options["backend_path"] = "libQnnHtp.so"; #endif - RunQnnModelTest(BuildQDQResizeTestCase(shape, sizes_data, mode, coordinate_transformation_mode, - nearest_mode, true), - provider_options, - 18, // opset - expected_ep_assignment, - fp32_abs_err); + TestQDQModelAccuracy(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode), + GetQDQResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, + nearest_mode), + provider_options, + 18, // opset + expected_ep_assignment, + 1e-5f); } // @@ -152,57 +188,68 @@ static void RunQDQResizeOpTest(const std::vector& shape, const std::vec // Upsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeUpsampleNearestHalfPixel_rpf) { - RunCPUResizeOpTest({1, 2, 7, 5}, {1, 2, 21, 10}, "nearest", "half_pixel", "round_prefer_floor", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpf) { + RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, -10.0f, 10.0f), // Random input w/ range [-10, 10] + {1, 2, 21, 10}, // Sizes + "nearest", + "half_pixel", + "round_prefer_floor", ExpectedEPNodeAssignment::All); } // Upsample that uses "round_prefer_ceil" as the "nearest_mode". 
// coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeUpsampleNearestHalfPixel_rpc) { - RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 7, 5}, "nearest", "half_pixel", "round_prefer_ceil", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpc) { + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), + {1, 1, 7, 5}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeDownsampleNearestHalfPixel_rpc) { - RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 1, 3}, "nearest", "half_pixel", "round_prefer_ceil", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpc) { + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), + {1, 1, 1, 3}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeDownsampleNearestHalfPixel_rpf) { - RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 1, 2}, "nearest", "half_pixel", "round_prefer_ceil", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpf) { + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), + {1, 1, 1, 2}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Upsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeUpsampleNearestAlignCorners_rpf) { - RunCPUResizeOpTest({1, 2, 7, 5}, {1, 2, 21, 10}, "nearest", "align_corners", "round_prefer_floor", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpf) { + RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, -10.0f, 10.0f), + {1, 2, 21, 10}, "nearest", "align_corners", "round_prefer_floor", ExpectedEPNodeAssignment::All); } // Upsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeUpsampleNearestAlignCorners_rpc) { - RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 7, 5}, "nearest", "align_corners", "round_prefer_ceil", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpc) { + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), + {1, 1, 7, 5}, "nearest", "align_corners", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_TestResizeDownsampleNearestAlignCorners_rpc) { - RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 1, 3}, "nearest", "align_corners", "round_prefer_ceil", +TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpc) { + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), + {1, 1, 1, 3}, "nearest", "align_corners", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_floor" as the "nearest_mode". 
 // coordinate_transformation_mode: "align_corners"
-TEST_F(QnnCPUBackendTests, DISABLED_TestResizeDownsampleNearestAlignCorners_rpf) {
-  RunCPUResizeOpTest({1, 1, 2, 4}, {1, 1, 1, 2}, "nearest", "align_corners", "round_prefer_floor",
+TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpf) {
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+                     {1, 1, 1, 2}, "nearest", "align_corners", "round_prefer_floor",
                      ExpectedEPNodeAssignment::All);
 }
 
@@ -210,23 +257,27 @@ TEST_F(QnnCPUBackendTests, DISABLED_TestResizeDownsampleNearestAlignCorners_rpf)
 // Cpu tests that use the "linear" mode.
 //
-TEST_F(QnnCPUBackendTests, TestResize2xLinearHalfPixel) {
-  RunCPUResizeOpTest({1, 3, 4, 5}, {1, 3, 8, 10}, "linear", "half_pixel", "",
+TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel) {
+  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+                     {1, 3, 8, 10}, "linear", "half_pixel", "",
                      ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnCPUBackendTests, TestResize2xLinearHalfPixel_scales) {
-  RunCPUResizeOpTestWithScales({1, 3, 4, 5}, {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "half_pixel", "",
+TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel_scales) {
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+                               {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "half_pixel", "",
                                ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnCPUBackendTests, TestResize2xLinearAlignCorners) {
-  RunCPUResizeOpTest({1, 3, 4, 5}, {1, 3, 8, 10}, "linear", "align_corners", "",
+TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners) {
+  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+                     {1, 3, 8, 10}, "linear", "align_corners", "",
                      ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnCPUBackendTests, TestResize2xLinearAlignCorners_scales) {
-  RunCPUResizeOpTestWithScales({1, 3, 4, 5}, {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "align_corners", "",
+TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners_scales) {
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+                               {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "align_corners", "",
                                ExpectedEPNodeAssignment::All);
 }
 
@@ -235,19 +286,22 @@ TEST_F(QnnCPUBackendTests, TestResize2xLinearAlignCorners_scales) {
 // HTP tests:
 //
-TEST_F(QnnHTPBackendTests, TestQDQU8Resize2xLinearPytorchHalfPixel) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 8, 8}, "linear", "pytorch_half_pixel", "",
-                              ExpectedEPNodeAssignment::All, 0.0031f);
+TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearPytorchHalfPixel) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 8, 8}, "linear", "pytorch_half_pixel", "",
+                              ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestQDQU8Resize2xNearestHalfPixelRoundPreferFloor) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor",
-                              ExpectedEPNodeAssignment::All, 1e-5f);
+TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor",
+                              ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestQDQU8Resize2xNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 8, 8}, "nearest", "asymmetric", "floor",
-                              ExpectedEPNodeAssignment::All, 1e-5f);
+TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestAsymmetricFloor) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 8, 8}, "nearest", "asymmetric", "floor",
+                              ExpectedEPNodeAssignment::All);
 }
 
 // TODO: Investigate with Qualcomm. The qnn-onnx-converter tool translates ONNX Resize [nearest, asymmetric, ceil] to
@@ -259,19 +313,22 @@ TEST_F(QnnHTPBackendTests, TestQDQU8Resize2xNearestAsymmetricFloor) {
 // are an almost-equal pair
 // Actual: 16-byte object,
 // where the value pair (0.15, 0.501) at index #1 don't match, which is 0.351 from 0.15
-TEST_F(QnnHTPBackendTests, DISABLED_TestQDQU8Resize2xNearestAsymmetricCeil) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 8, 8}, "nearest", "asymmetric", "ceil",
-                              ExpectedEPNodeAssignment::All, 1e-5f);
+TEST_F(QnnHTPBackendTests, DISABLED_ResizeU8_2xNearestAsymmetricCeil) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 8, 8}, "nearest", "asymmetric", "ceil",
+                              ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestQDQU8Resize3xNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 12, 12}, "nearest", "asymmetric", "floor",
-                              ExpectedEPNodeAssignment::All, 1e-5f);
+TEST_F(QnnHTPBackendTests, ResizeU8_3xNearestAsymmetricFloor) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 12, 12}, "nearest", "asymmetric", "floor",
+                              ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestQDQU8ResizeHalfNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>({1, 3, 4, 4}, {1, 3, 2, 2}, "nearest", "asymmetric", "floor",
-                              ExpectedEPNodeAssignment::All, 1e-5f);
+TEST_F(QnnHTPBackendTests, ResizeU8_HalfNearestAsymmetricFloor) {
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                              {1, 3, 2, 2}, "nearest", "asymmetric", "floor",
+                              ExpectedEPNodeAssignment::All);
 }
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index 93bd96e9549e8..5b4049d52c16f 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -20,6 +20,21 @@ namespace test {
 
 using UInt8Limits = std::numeric_limits<uint8_t>;
 
+template <typename InputType = float>
+static GetTestModelFn BuildUnaryOpTestCase(const std::string& op_type, const TestInputDef<InputType>& input0_def,
+                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                           const std::string& domain = kOnnxDomain) {
+  return [op_type, input0_def, attrs, domain](ModelTestBuilder& builder) {
+    NodeArg* input0 = MakeTestInput(builder, input0_def);
+
+    auto* output = builder.MakeOutput();
+    auto& op_node = builder.AddNode(op_type, {input0}, {output}, domain);
+    for (const auto& attr : attrs) {
+      op_node.AddAttributeProto(attr);
+    }
+  };
+}
+
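The QDQ builders that follow pick per-tensor quantization parameters from each TestInputDef's float range via GetTestInputQuantParams, whose definition is outside this diff. A sketch of the usual uint8 derivation (an assumption, not the helper's actual code):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QuantParamsU8 {
      float scale;
      uint8_t zero_point;
    };

    // Derive uint8 affine quantization params that cover [rmin, rmax].
    static QuantParamsU8 DeriveQuantParams(float rmin, float rmax) {
      rmin = std::min(rmin, 0.0f);  // Widen so that 0.0f is exactly representable.
      rmax = std::max(rmax, 0.0f);
      const float scale = (rmax - rmin) / 255.0f;
      const uint8_t zero_point = static_cast<uint8_t>(std::round(-rmin / scale));
      return {scale, zero_point};
    }

For the Cos input range [-3.14159, 3.14159] this yields scale ~0.02464 and a zero point of about 127, in line with the values logged in the disabled Cos test further down (the exact zero-point rounding convention may differ).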
 // Creates the graph:
 //                       _______________________
 //                      |                       |
 //                   -> |                       | ->
 //                      |_______________________|
 //
 // Currently used to test QNN EP.
 template <typename InputQType = uint8_t>
-GetQDQTestCaseFn BuildQDQSingleInputOpTestCase(const TestInputDef<InputQType>& input_def,
-                                               const std::string& op_type,
-                                               const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs = {},
-                                               const std::string& domain = kOnnxDomain) {
-  return [input_def, op_type, attrs, domain](ModelTestBuilder& builder) {
-    const InputQType quant_zero_point = 0;
-    const float quant_scale = 1.0f;
-
-    auto* input = MakeTestInput(builder, input_def);
-    auto* dq_input = builder.MakeIntermediate();
-    builder.AddDequantizeLinearNode(input, quant_scale, quant_zero_point, dq_input);
+GetTestQDQModelFn<InputQType> BuildQDQUnaryOpTestCase(const TestInputDef<float>& input_def,
+                                                      const std::string& op_type,
+                                                      const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                                      const std::string& domain = kOnnxDomain) {
+  return [input_def, op_type, attrs, domain](ModelTestBuilder& builder,
+                                             std::vector<QuantParams<InputQType>>& output_qparams) {
+    auto* input = MakeTestInput(builder, input_def);
+    QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
+    auto* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point);
 
     auto* op_output = builder.MakeIntermediate();
-    auto& op_node = builder.AddNode(op_type, {dq_input}, {op_output}, domain);
+    auto& op_node = builder.AddNode(op_type, {input_qdq}, {op_output}, domain);
 
     for (const auto& attr : attrs) {
       op_node.AddAttributeProto(attr);
     }
 
-    auto* q_output = builder.MakeOutput();
-    builder.AddQuantizeLinearNode(op_output, quant_scale, quant_zero_point, q_output);
+    // op_output -> Q -> DQ -> output
+    AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, output_qparams[0].zero_point);
   };
 }
 
-template <typename InputQType = uint8_t>
-static GetTestModelFn BuildQDQBinaryOpTestCase(const std::string& op_type, const TestInputDef<float>& input0_def,
-                                               const TestInputDef<float>& input1_def) {
+/**
+ * Runs a simple op model on the QNN HTP backend. Checks the graph node assignment, and that inference
+ * outputs for QNN and CPU match.
+ *
+ * \param input_def The input definition (shape, whether it is an initializer, and its data or data range).
+ * \param op_type The ONNX operator type (e.g., "Gelu").
+ * \param attrs The operator's attributes.
+ * \param opset_version The opset version with which to build the model.
+ * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None).
+ * \param domain The operator's domain (defaults to the ONNX domain).
+ */
+template <typename QuantType = uint8_t>
+static void RunQDQUnaryOpTest(const TestInputDef<float>& input_def, const std::string& op_type,
+                              const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                              int opset_version,
+                              ExpectedEPNodeAssignment expected_ep_assignment,
+                              const std::string& domain = kOnnxDomain) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  // Runs the float32 model and its QDQ (DQ -> Op -> Q) counterpart, comparing the CPU EP and QNN EP outputs.
+  TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, attrs, domain),
+                       BuildQDQUnaryOpTestCase<QuantType>(input_def, op_type, attrs, domain),
+                       provider_options,
+                       opset_version,
+                       expected_ep_assignment,
+                       1e-5f);
+}
+
+template <typename InputType = float>
+static GetTestModelFn BuildBinaryOpTestCase(const std::string& op_type, const TestInputDef<InputType>& input0_def,
+                                            const TestInputDef<InputType>& input1_def) {
   return [op_type, input0_def, input1_def](ModelTestBuilder& builder) {
-    const InputQType zero_point = std::numeric_limits<InputQType>::max() / 2;
-    constexpr float qdq_scale = 0.0004f;
+    NodeArg* input0 = MakeTestInput(builder, input0_def);
+    NodeArg* input1 = MakeTestInput(builder, input1_def);
+    auto* output = builder.MakeOutput();
+    builder.AddNode(op_type, {input0, input1}, {output});
+  };
+}
+
+template <typename QuantType>
+static GetTestQDQModelFn<QuantType> BuildQDQBinaryOpTestCase(const std::string& op_type,
+                                                             const TestInputDef<float>& input0_def,
+                                                             const TestInputDef<float>& input1_def) {
+  return [op_type, input0_def, input1_def](ModelTestBuilder& builder,
+                                           std::vector<QuantParams<QuantType>>& output_qparams) {
     NodeArg* input0 = MakeTestInput(builder, input0_def);
     NodeArg* input1 = MakeTestInput(builder, input1_def);
-    NodeArg* output = builder.MakeOutput();
 
     // input -> Q -> DQ -> Op
-    auto* qdq0_output = AddQDQNodePair(builder, input0, qdq_scale, zero_point);
-    auto* qdq1_output = AddQDQNodePair(builder, input1, qdq_scale, zero_point);
+    QuantParams<QuantType> input0_qparams = GetTestInputQuantParams<QuantType>(input0_def);
+    auto* qdq0_output = AddQDQNodePair(builder, input0, input0_qparams.scale, input0_qparams.zero_point);
+
+    QuantParams<QuantType> input1_qparams = GetTestInputQuantParams<QuantType>(input1_def);
+    auto* qdq1_output = AddQDQNodePair(builder, input1, input1_qparams.scale, input1_qparams.zero_point);
 
     // Op -> op_output
     auto* op_output = builder.MakeIntermediate();
     builder.AddNode(op_type, {qdq0_output, qdq1_output}, {op_output});
 
     // op_output -> Q -> DQ -> output
-    auto* op_q_output = builder.MakeIntermediate();
-    builder.AddQuantizeLinearNode(op_output, qdq_scale, zero_point, op_q_output);
-    builder.AddDequantizeLinearNode(op_q_output, qdq_scale, zero_point, output);
+    AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale,
+                                          output_qparams[0].zero_point);
   };
 }
 
-template <typename InputQType = uint8_t>
-static void RunQDQBinaryOpTest(const std::string& op_type, const TestInputDef<float>& input0_def,
-                               const TestInputDef<float>& input1_def,
-                               int opset_version,
+template <typename QuantType = uint8_t>
+static void RunQDQBinaryOpTest(const std::string& op_type, const TestInputDef<float>& input0_def,
+                               const TestInputDef<float>& input1_def, int opset_version,
                                ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
@@ -90,28 +145,18 @@ static void RunQDQBinaryOpTest(const std::string& op_type, const TestInputDef
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  RunQnnModelTest(BuildQDQBinaryOpTestCase(op_type, input0_def, input1_def),
-                  provider_options,
-                  opset_version,
-                  expected_ep_assignment);
+  TestQDQModelAccuracy(BuildBinaryOpTestCase(op_type, input0_def, input1_def),
+                       BuildQDQBinaryOpTestCase<QuantType>(op_type, input0_def, input1_def),
+                       provider_options,
+                       opset_version,
+                       expected_ep_assignment,
+                       1e-5f);
 }
 
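Both QDQ runners delegate to TestQDQModelAccuracy, defined elsewhere in the test utilities. Its presumed behavior, inferred from the error report quoted in the disabled Cos test below (a hedged description, since the helper's implementation is not in this diff):

    // 1. Run the float32 model on the CPU EP  -> expected outputs (ground truth).
    // 2. Run the QDQ model on the CPU EP      -> baseline quantized outputs.
    // 3. Run the QDQ model on the QNN EP      -> actual outputs.
    // 4. Pass if, per element, |actual - expected| <= max(|baseline - expected|, tolerance):
    //    QNN must be at least as accurate as ORT's own quantized CPU kernels.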
-/**
- * Runs an Simple Op model on the QNN HTP backend. Checks the graph node assignment, and that inference
- * outputs for QNN and CPU match.
- *
- * \param input_shape The input's shape.
- * \param test_description Description of the test for error reporting.
- * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None).
- * \param num_modes_in_graph The number of expected nodes in the graph.
- */
-template <typename InputQType = uint8_t>
-static void RunQDQSingleInputOpTest(const TestInputDef<InputQType>& input_def, const std::string& op_type,
-                                    const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
-                                    int opset_version,
-                                    ExpectedEPNodeAssignment expected_ep_assignment,
-                                    const std::string& domain = kOnnxDomain) {
+template <typename InputType = float>
+static void RunBinaryOpTest(const std::string& op_type, const TestInputDef<InputType>& input0_def,
+                            const TestInputDef<InputType>& input1_def, int opset_version,
+                            ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -119,8 +164,8 @@ static void RunQDQSingleInputOpTest(const TestInputDef<InputQType>& input_def, c
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  // Runs model with DQ-> Op -> Q and compares the outputs of the CPU and QNN EPs.
-  RunQnnModelTest(BuildQDQSingleInputOpTestCase(input_def, op_type, attrs, domain),
+  // Runs a model with a single plain (non-quantized) binary op and compares the outputs of the CPU and QNN EPs.
+  RunQnnModelTest(BuildBinaryOpTestCase(op_type, input0_def, input1_def),
                   provider_options,
                   opset_version,
                   expected_ep_assignment);
@@ -128,87 +173,143 @@ static void RunQDQSingleInputOpTest(const TestInputDef<InputQType>& input_def, c
 
 // Check that QNN compiles DQ -> Gelu -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQGeluTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Gelu", {}, 11, ExpectedEPNodeAssignment::All, kMSDomain);
+TEST_F(QnnHTPBackendTests, UnaryOp_Gelu) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),  // Input range [-10.0, 10.0]
+                    "Gelu",
+                    {},
+                    11,
+                    ExpectedEPNodeAssignment::All,
+                    kMSDomain);  // GeLu is a contrib op.
 }
 
 // Check that QNN compiles DQ -> Elu -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQEluTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Elu", {}, 11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Elu) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),  // Input range [-10.0, 10.0]
+                    "Elu",
+                    {},
+                    11,
+                    ExpectedEPNodeAssignment::All);
}
 
 // Check that QNN compiles DQ -> HardSwish -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQHardSwishTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "HardSwish", {}, 14, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_HardSwish) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),  // Input range [-10.0, 10.0]
+                    "HardSwish",
+                    {},
+                    14,
+                    ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> Atan -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQAtanTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Atan", {}, 11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Atan) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),  // Input range [-10.0, 10.0]
+                    "Atan",
+                    {},
+                    14,
+                    ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> Asin -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQAsinTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, 0, 1),  // input range 0 ~ 1
-                          "Asin", {}, 11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Asin) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -0.5f, 0.5f),  // input range -0.5 to 0.5
+                    "Asin", {},
+                    13, ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> Sign -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQSignTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Sign", {}, 11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Sign) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),
+                    "Sign", {},
+                    13, ExpectedEPNodeAssignment::All);
 }
 
-// Check that QNN compiles DQ -> Sign -> Q as a single unit.
+// Check that QNN compiles DQ -> Sin -> Q as a single unit.
 // Use an input of rank 3.
-TEST_F(QnnHTPBackendTests, TestQDQSinTest) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Sin", {}, 11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Sin) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -3.14159f, 3.14159f),
+                    "Sin", {},
+                    11, ExpectedEPNodeAssignment::All);
+}
+
+// Check that QNN compiles DQ -> Cos -> Q as a single unit.
+// Use an input of rank 3.
+TEST_F(QnnHTPBackendTests, UnaryOp_Cos) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, {-3.14159f, -1.5f, -0.5f, 0.0f, 1.5f, 3.14159f}),
+                    "Cos", {},
+                    11, ExpectedEPNodeAssignment::All);
+}
+
+// TODO: Inaccuracy when computing cos(-1.88436)
+//
+// cos(-1.88436f) fp32 cpu ep = -0.308450460
+// cos(-1.88436f) qdq cpu ep  = -0.298039228
+// cos(-1.88436f) qdq QNN ep  = -0.321568638
+//
+// QNN error: 0.013118177652359009, CPU error: 0.010411232709884644
+//
+// input quant params: scale=0.0246399231, zero_point=127
+// output quant params: scale=0.00784313772, zero_point=127
TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Cos_Inaccurate) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, {-3.14159f, -1.88436f, -0.542863f, 0.0f, 1.05622f, 3.14159f}),
+                    "Cos", {},
+                    11, ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
 // Test that the default axis (-1) for SoftMax opset 13 works.
-TEST_F(QnnHTPBackendTests, TestQDQSoftmax13_DefaultAxis) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Softmax",
-                          {},  // Uses default axis of -1 for opset 13
-                          13, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_DefaultAxis) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f),
+                    "Softmax",
+                    {},  // Uses default axis of -1 for opset 13
+                    13, ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
 // Test that an axis != -1 is not supported.
-TEST_F(QnnHTPBackendTests, TestQDQSoftmax13_UnsupportedAxis) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Softmax",
-                          {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
-                          13, ExpectedEPNodeAssignment::None);
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_UnsupportedAxis) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f),
+                    "Softmax",
+                    {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
+                    13, ExpectedEPNodeAssignment::None);
 }
 
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
 // Test that the default axis (1) for SoftMax opset < 13 does not work.
-TEST_F(QnnHTPBackendTests, TestQDQSoftmax11_DefaultAxisFails) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Softmax",
-                          {},  // Uses default axis of 1 for opset < 13.
-                          11, ExpectedEPNodeAssignment::None);
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax11_DefaultAxisFails) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f),
+                    "Softmax",
+                    {},  // Uses default axis of 1 for opset < 13.
+                    11, ExpectedEPNodeAssignment::None);
 }
 
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
 // Test that setting an axis value of -1 works for Softmax opset < 13.
-TEST_F(QnnHTPBackendTests, TestQDQSoftmax11_SetValidAxis) {
-  RunQDQSingleInputOpTest(TestInputDef<uint8_t>({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()),
-                          "Softmax",
-                          {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
-                          11, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax11_SetValidAxis) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f),
+                    "Softmax",
+                    {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
+                    11, ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Abs op.
+TEST_F(QnnHTPBackendTests, UnaryOp_Abs) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),
+                    "Abs",
+                    {},
+                    13, ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Ceil op.
+TEST_F(QnnHTPBackendTests, UnaryOp_Ceil) {
+  RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -100.0f, 100.0f),
+                    "Ceil",
+                    {},
+                    13, ExpectedEPNodeAssignment::All);
 }
 
 // Run QDQ model on HTP twice
@@ -225,68 +326,138 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheTest) {
   const std::string context_binary_file = "./qnn_context_binary_test.bin";
   provider_options["qnn_context_cache_path"] = context_binary_file;
 
-  const TestInputDef<uint8_t> input_def({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max());
+  const TestInputDef<float> input_def({1, 2, 3}, false, -10.0f, 10.0f);
+  const std::string op_type = "Atan";
 
   // Runs model with DQ-> Atan-> Q and compares the outputs of the CPU and QNN EPs.
   // 1st run will generate the Qnn context cache binary file
-  RunQnnModelTest(BuildQDQSingleInputOpTestCase(input_def, "Atan"),
-                  provider_options,
-                  11,
-                  ExpectedEPNodeAssignment::All);
+  TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, {}),
+                       BuildQDQUnaryOpTestCase(input_def, op_type, {}),
+                       provider_options,
+                       14,
+                       ExpectedEPNodeAssignment::All,
+                       1e-5f);
 
   // Make sure the Qnn context cache binary file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
 
   // 2nd run will load and run from Qnn context cache binary file
-  RunQnnModelTest(BuildQDQSingleInputOpTestCase(input_def, "Atan"),
+  TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, {}),
+                       BuildQDQUnaryOpTestCase(input_def, op_type, {}),
+                       provider_options,
+                       14,
+                       ExpectedEPNodeAssignment::All,
+                       1e-5f);
+}
+
+TEST_F(QnnHTPBackendTests, QuantAccuracyTest) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  // Note: a graph input -> Q -> DQ sequence is optimized by QNN to produce a perfectly accurate output.
+  // ORT's CPU EP, on the other hand, actually quantizes and dequantizes the input, which leads to different outputs.
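+  // For intuition: with scale s and zero point z, QuantizeLinear computes q = clamp(round(x / s) + z, 0, 255)
+  // and DequantizeLinear computes x' = (q - z) * s, so the CPU EP's round trip can shift each input value by
+  // up to s / 2, whereas QNN folds the Q -> DQ pair into an exact pass-through.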
+  auto builder_func = [](ModelTestBuilder& builder) {
+    const TestInputDef<float> input0_def({1, 2, 3}, false, {1.0f, 2.0f, 10.0f, 20.0f, 100.0f, 200.0f});
+
+    // input -> Q -> Transpose -> DQ -> output
+    NodeArg* input0 = MakeTestInput(builder, input0_def);
+    QuantParams<uint8_t> qparams = GetTestInputQuantParams<uint8_t>(input0_def);
+
+    auto* quant_input = builder.MakeIntermediate();
+    builder.AddQuantizeLinearNode(input0, qparams.scale, qparams.zero_point, quant_input);
+
+    auto* op_output = builder.MakeIntermediate();
+    builder.AddNode("Transpose", {quant_input}, {op_output});
+
+    NodeArg* output = builder.MakeOutput();
+    builder.AddDequantizeLinearNode(op_output, qparams.scale, qparams.zero_point, output);
+  };
+
+  // Runs the Q -> Transpose -> DQ model and compares the outputs of the CPU and QNN EPs.
+  RunQnnModelTest(builder_func,
                   provider_options,
-                  11,
+                  13,
                   ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestSub4D_SmallInputs) {
-  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 8, 8}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({1, 3, 8, 8}, false, -1.0f, 1.0f),
-                     17, ExpectedEPNodeAssignment::All);
+// Test QDQ Add
+TEST_F(QnnHTPBackendTests, BinaryOp_Add4D) {
+  RunQDQBinaryOpTest("Add", TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f),
+                     TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f),
+                     17, ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Sub
+TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D) {
+  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 8, 8}, false, -10.0f, 10.0f),
+                     TestInputDef<float>({1, 3, 8, 8}, false, -10.0f, 10.0f),
+                     17, ExpectedEPNodeAssignment::All);
 }
 
 // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC).
 // Enable when this is fixed.
-TEST_F(QnnHTPBackendTests, DISABLED_TestSub4D_LargeInputs) {
-  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     17, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Sub4D_LargeInputs) {
+  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     17, ExpectedEPNodeAssignment::All);
 }
 
 // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC).
 // Enable when this is fixed.
-TEST_F(QnnHTPBackendTests, DISABLED_TestSub4D_Broadcast) {
-  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}),
-                     17, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Sub4D_Broadcast) {
+  RunQDQBinaryOpTest("Sub", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     TestInputDef<float>({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}),
+                     17, ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, TestDiv4D_SmallInputs) {
-  RunQDQBinaryOpTest("Div", TestInputDef<float>({1, 3, 8, 8}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({1, 3, 8, 8}, false, -1.0f, 1.0f),
-                     17, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, BinaryOp_Div4D_SmallInputs) {
+  RunQDQBinaryOpTest("Div",
                     TestInputDef<float>({1, 2, 2, 2}, false, {-10.0f, -8.0f, -1.0f, 0.0f, 1.0f, 2.1f, 8.0f, 10.0f}),
+                     TestInputDef<float>({1, 2, 2, 2}, false, {5.0f, 4.0f, 1.0f, 1.0f, 1.0f, 4.0f, 4.0f, 5.0f}),
+                     17, ExpectedEPNodeAssignment::All);
 }
 
 // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC).
 // Enable when this is fixed.
-TEST_F(QnnHTPBackendTests, DISABLED_TestDiv4D_LargeInputs) {
-  RunQDQBinaryOpTest("Div", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     17, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Div4D_LargeInputs) {
+  RunQDQBinaryOpTest("Div", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     17, ExpectedEPNodeAssignment::All);
 }
 
 // TODO: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC).
 // Enable when this is fixed.
 // Fails accuracy when input0 has dims [1,3,768,768]
-TEST_F(QnnHTPBackendTests, DISABLED_TestDiv4D_Broadcast) {
-  RunQDQBinaryOpTest("Div", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
-                     TestInputDef<float>({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}),
-                     17, ExpectedEPNodeAssignment::All);
+TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Div4D_Broadcast) {
+  RunQDQBinaryOpTest("Div", TestInputDef<float>({1, 3, 768, 1152}, false, -1.0f, 1.0f),
+                     TestInputDef<float>({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}),
+                     17, ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Mul
+TEST_F(QnnHTPBackendTests, BinaryOp_Mul4D) {
+  RunQDQBinaryOpTest("Mul", TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f),
+                     TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f),
+                     17, ExpectedEPNodeAssignment::All);
+}
+
+// Test the And operator. Boolean inputs cannot be quantized, so the model is run as-is (no QDQ).
+TEST_F(QnnHTPBackendTests, BinaryOp_And4D) {
+  RunBinaryOpTest("And", TestInputDef<bool>({1, 4}, false, {false, false, true, true}),
+                  TestInputDef<bool>({1, 4}, false, {false, true, false, true}),
+                  17, ExpectedEPNodeAssignment::All);
+}
+
+// Test that Or is not yet supported on the HTP backend.
+TEST_F(QnnHTPBackendTests, BinaryOp_HTP_Or_Unsupported) {
+  RunBinaryOpTest("Or", TestInputDef<bool>({1, 4}, false, {false, false, true, true}),
+                  TestInputDef<bool>({1, 4}, false, {false, true, false, true}),
+                  17, ExpectedEPNodeAssignment::None);
 }
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
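With the shared runners in place, adding HTP coverage for another operator is a few lines. For example, a hypothetical Tanh test (Tanh is already registered in the QDQ unary-op map, but no such test is part of this patch) would mirror the Ceil test above:

    // Test QDQ Tanh (hypothetical example, not in this change).
    TEST_F(QnnHTPBackendTests, UnaryOp_Tanh) {
      RunQDQUnaryOpTest(TestInputDef<float>({1, 2, 3}, false, -10.0f, 10.0f),
                        "Tanh",
                        {},
                        13, ExpectedEPNodeAssignment::All);
    }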
diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h
index 2c20177cd3a48..aa29181548f57 100644
--- a/onnxruntime/test/util/include/test_utils.h
+++ b/onnxruntime/test/util/include/test_utils.h
@@ -38,6 +38,12 @@ struct EPVerificationParams {
   const std::function<void(const Graph&)>* graph_verifier{nullptr};
 };
 
+// Verify equality of two output tensors.
+void VerifyOutput(const std::string& output_name,
+                  const Tensor& expected_tensor,
+                  const Tensor& tensor,
+                  float fp32_abs_err);
+
 // Return number of nodes in the Graph and any subgraphs that are assigned to the specified execution provider
 int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);
 
diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc
index 2f67fea9f6289..2ce8bf6e56d6e 100644
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@@ -18,6 +18,48 @@
 namespace onnxruntime {
 namespace test {
 
+void VerifyOutput(const std::string& output_name,
+                  const Tensor& expected_tensor,
+                  const Tensor& tensor,
+                  float fp32_abs_err) {
+  ASSERT_TRUE(SpanEq(expected_tensor.Shape().GetDims(), tensor.Shape().GetDims()));
+  ASSERT_EQ(expected_tensor.GetElementType(), tensor.GetElementType());
+  auto element_type = expected_tensor.GetElementType();
+  switch (element_type) {
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<uint32_t>(), tensor.DataAsSpan<uint32_t>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<int32_t>(), tensor.DataAsSpan<int32_t>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<int64_t>(), tensor.DataAsSpan<int64_t>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<uint8_t>(), tensor.DataAsSpan<uint8_t>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<int8_t>(), tensor.DataAsSpan<int8_t>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
+      EXPECT_TRUE(SpanEq(expected_tensor.DataAsSpan<bool>(), tensor.DataAsSpan<bool>()))
+          << " mismatch for " << output_name;
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
+      EXPECT_THAT(expected_tensor.DataAsSpan<float>(),
+                  ::testing::Pointwise(::testing::FloatNear(fp32_abs_err), tensor.DataAsSpan<float>()));
+      break;
+    }
+    default:
+      ORT_THROW("Unhandled data type. Please add 'case' statement for ", element_type);
+  }
+}
+
 static void VerifyOutputs(const std::vector<std::string>& output_names,
                           const std::vector<OrtValue>& expected_fetches,
                           const std::vector<OrtValue>& fetches,
@@ -27,41 +69,7 @@ static void VerifyOutputs(const std::vector<std::string>& output_names,
   for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) {
     auto& ltensor = expected_fetches[i].Get<Tensor>();
     auto& rtensor = fetches[i].Get<Tensor>();
-    ASSERT_TRUE(SpanEq(ltensor.Shape().GetDims(), rtensor.Shape().GetDims()));
-    auto element_type = ltensor.GetElementType();
-    switch (element_type) {
-      case ONNX_NAMESPACE::TensorProto_DataType_UINT32:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<uint32_t>(), rtensor.DataAsSpan<uint32_t>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_INT32:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<int32_t>(), rtensor.DataAsSpan<int32_t>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_INT64:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<int64_t>(), rtensor.DataAsSpan<int64_t>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<uint8_t>(), rtensor.DataAsSpan<uint8_t>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_INT8:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<int8_t>(), rtensor.DataAsSpan<int8_t>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
-        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<bool>(), rtensor.DataAsSpan<bool>()))
-            << " mismatch for " << output_names[i];
-        break;
-      case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
-        EXPECT_THAT(ltensor.DataAsSpan<float>(),
-                    ::testing::Pointwise(::testing::FloatNear(params.fp32_abs_err), rtensor.DataAsSpan<float>()));
-        break;
-      }
-      default:
-        ORT_THROW("Unhandled data type. Please add 'case' statement for ", element_type);
-    }
+    VerifyOutput(output_names[i], ltensor, rtensor, params.fp32_abs_err);
   }
 }
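Factoring VerifyOutput out of VerifyOutputs makes the per-tensor comparison reusable on its own. A minimal usage sketch (the caller below is hypothetical):

    // Compare one expected/actual tensor pair. Floats are compared element-wise with an
    // absolute tolerance; every other supported element type must match exactly.
    void CheckSingleOutput(const onnxruntime::Tensor& expected, const onnxruntime::Tensor& actual) {
      onnxruntime::test::VerifyOutput("output_0", expected, actual, 1e-5f);
    }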