From 5331711e2245c3456ca36fa9da0411d628e3e9fb Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Thu, 16 Dec 2021 22:34:57 -0800 Subject: [PATCH 01/23] clear using in code --- .../nnapi/nnapi_builtin/builders/helper.cc | 4 - .../nnapi_builtin/builders/model_builder.cc | 14 ++- .../nnapi_builtin/builders/op_builder.cc | 86 +++++++++---------- .../builders/op_support_checker.cc | 7 +- .../nnapi/nnapi_builtin/builders/shaper.cc | 21 ++--- .../nnapi_builtin/nnapi_execution_provider.cc | 2 - 6 files changed, 59 insertions(+), 75 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 2fc1afd435279..edbcaf590c379 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -17,10 +17,6 @@ #include "helper.h" #include "op_support_checker.h" -using onnxruntime::NodeUnit; -using std::string; -using std::vector; - namespace onnxruntime { namespace nnapi { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 645ab23a85109..7136251927e09 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -14,9 +14,7 @@ #include "op_builder.h" #include "op_support_checker.h" -using onnxruntime::NodeUnit; using namespace android::nn::wrapper; -using std::vector; namespace onnxruntime { namespace nnapi { @@ -31,7 +29,7 @@ int32_t ModelBuilder::GetNNAPIFeatureLevel() const { // Scalar operand is copied into the model, no need to persist #define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type) \ Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \ - OperandType operandType(Type::op_type, vector{}); \ + OperandType operandType(Type::op_type, std::vector{}); \ ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index)); \ RETURN_STATUS_ON_ERROR_WITH_NOTE( \ nnapi_->ANeuralNetworksModel_setOperandValue( \ @@ -50,7 +48,7 @@ void ModelBuilder::AddInitializerToSkip(const std::string& tensor_name) { skipped_initializers_.insert(tensor_name); } -static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer); +static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer); Status ModelBuilder::Prepare() { nnapi_model_ = std::unique_ptr(new Model()); @@ -151,8 +149,8 @@ void ModelBuilder::PreprocessActivations() { } // Help to get all quantized operators' input and the node(s) using the input -static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer) { - std::unordered_map> all_quantized_op_inputs; +static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer) { + std::unordered_map> all_quantized_op_inputs; const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); for (const auto& node_idx : node_indices) { const auto* node(graph_viewer.GetNode(node_idx)); @@ -168,7 +166,7 @@ static std::unordered_map> GetAllQuantizedOpInp if (Contains(all_quantized_op_inputs, input_name)) all_quantized_op_inputs.at(input_name).push_back(node); else - all_quantized_op_inputs.emplace(input_name, vector{node}); + all_quantized_op_inputs.emplace(input_name, std::vector{node}); } if (IsQLinearBinaryOp(qlinear_op_type)) { @@ -176,7 +174,7 @@ static std::unordered_map> GetAllQuantizedOpInp if 
(Contains(all_quantized_op_inputs, input_name)) all_quantized_op_inputs.at(input_name).push_back(node); else - all_quantized_op_inputs.emplace(input_name, vector{node}); + all_quantized_op_inputs.emplace(input_name, std::vector{node}); } } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 66f870df15074..45f4c975843bd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -16,9 +16,7 @@ #include "model_builder.h" #include "op_support_checker.h" -using onnxruntime::NodeUnit; using namespace android::nn::wrapper; -using std::vector; namespace onnxruntime { namespace nnapi { @@ -40,13 +38,13 @@ struct OpBuilderRegistrations { Status AddTransposeOperator(ModelBuilder& model_builder, const std::string& input, const std::string& perm_name, - vector perm, + std::vector perm, const std::string& output, bool output_is_nhwc) ORT_MUST_USE_RESULT; Status AddTransposeOperator(ModelBuilder& model_builder, const std::string& input, const std::string& perm_name, - vector perm, + std::vector perm, const std::string& output, bool output_is_nhwc) { auto& shaper(model_builder.GetShaper()); @@ -83,7 +81,7 @@ Status TransposeBetweenNCHWAndNHWC(ModelBuilder& model_builder, "TransposeBetweenNCHWAndNHWC input has to be a 4d tensor, actual dimensions: ", shaper[input].size()); std::string perm_name; - vector perm; + std::vector perm; if (nchw_to_nhwc) { perm_name = model_builder.GetUniqueName(input + "nchw_to_nhwc_perm"); perm = {0, 2, 3, 1}; @@ -222,11 +220,11 @@ static Status AddBinaryOperator(int32_t op_type, static Status AddSqueezeOp(ModelBuilder& model_builder, const std::string& node_name, const std::string& input, const std::string& output, - vector axes) ORT_MUST_USE_RESULT; + std::vector axes) ORT_MUST_USE_RESULT; static Status AddSqueezeOp(ModelBuilder& model_builder, const std::string& node_name, const std::string& input, const std::string& output, - vector axes) { + std::vector axes) { if (model_builder.GetNNAPIFeatureLevel() < ANEURALNETWORKS_FEATURE_LEVEL_2) { return ORT_MAKE_STATUS( ONNXRUNTIME, FAIL, "Squeeze is not supported on API level ", model_builder.GetNNAPIFeatureLevel()); @@ -430,13 +428,13 @@ static Status ComputeConvPads( const uint32_t weight_size_y, const uint32_t weight_size_x, const std::vector& onnx_pads, const std::vector& onnx_strides, const std::vector& onnx_dilations, AutoPadType auto_pad_type, bool nchw, - vector& pads_out) ORT_MUST_USE_RESULT; + std::vector& pads_out) ORT_MUST_USE_RESULT; static Status ComputeConvPads( const Shape& input_dimen, const uint32_t weight_size_y, const uint32_t weight_size_x, const std::vector& onnx_pads, const std::vector& onnx_strides, const std::vector& onnx_dilations, AutoPadType auto_pad_type, bool nchw, - vector& pads_out) { + std::vector& pads_out) { const int32_t input_size_y = nchw ? input_dimen[2] : input_dimen[1]; const int32_t input_size_x = nchw ? 
input_dimen[3] : input_dimen[2]; const int32_t stride_y = onnx_strides[0]; @@ -467,21 +465,21 @@ static Status ComputeConvPads( static Status HandleAutoPad(const Shape& input_shape, const uint32_t weight_size_y, const uint32_t weight_size_x, - const vector& onnx_strides, - const vector& onnx_dilations, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, AutoPadType auto_pad_type, bool use_nchw, - vector& onnx_pads, + std::vector& onnx_pads, int32_t& nnapi_padding_code, bool& use_auto_pad) ORT_MUST_USE_RESULT; static Status HandleAutoPad(const Shape& input_shape, const uint32_t weight_size_y, const uint32_t weight_size_x, - const vector& onnx_strides, - const vector& onnx_dilations, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, AutoPadType auto_pad_type, bool use_nchw, - vector& onnx_pads, + std::vector& onnx_pads, int32_t& nnapi_padding_code, bool& use_auto_pad) { use_auto_pad = false; @@ -498,7 +496,7 @@ static Status HandleAutoPad(const Shape& input_shape, } } else if (onnx_dilations == std::vector{1, 1}) { // Since NNAPI runs more efficiently using auto_pad, we try to map the NOTSET padding to auto_pad - vector same_upper_pads; + std::vector same_upper_pads; ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x, onnx_pads, onnx_strides, onnx_dilations, AutoPadType::SAME_UPPER, use_nchw, @@ -547,12 +545,12 @@ static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; + optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales, bool& is_per_tensor_u8s8) { + optional>& w_scales, bool& is_per_tensor_u8s8) { is_per_tensor_u8s8 = false; // Get scale and zero points // We will handle per-channel weight scale and zero point later @@ -593,7 +591,7 @@ static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(scale_tensor, unpacked_tensor)); const float* scales = reinterpret_cast(unpacked_tensor.data()); const size_t scales_size = scale_tensor.dims().empty() ? 
1 : scale_tensor.dims()[0]; - vector scales_vec(scales, scales + scales_size); + std::vector scales_vec(scales, scales + scales_size); w_scales = onnxruntime::make_optional(std::move(scales_vec)); return Status::OK(); } @@ -631,12 +629,12 @@ static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, const std::string& input_name, float scale, int32_t zero_point, - const optional>& scales) ORT_MUST_USE_RESULT; + const optional>& scales) ORT_MUST_USE_RESULT; static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, const std::string& input_name, float scale, int32_t zero_point, - const optional>& scales) { + const optional>& scales) { // first verify as the weight has no per-channel quantization ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input_name, scale, zero_point)); @@ -902,7 +900,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co auto input = node.InputDefs()[0]->Name(); const auto& output = node.OutputDefs()[0]->Name(); NodeAttrHelper helper(node); - vector perm = helper.Get("perm", vector()); + std::vector perm = helper.Get("perm", std::vector()); auto input_dims = shaper[input].size(); if (perm.empty()) { for (int32_t i = input_dims - 1; i >= 0; i--) @@ -1118,7 +1116,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu const auto eps = helper.Get("epsilon", 1e-5f); const auto size = SafeInt(scale_tensor.dims()[0]); - vector a, b; + std::vector a, b; a.reserve(size); b.reserve(size); @@ -1267,15 +1265,15 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N else // (op_type == "MaxPool" || op_type == "GlobalMaxPool") op_code = ANEURALNETWORKS_MAX_POOL_2D; - vector onnx_pads, onnx_strides, kernel_shape; + std::vector onnx_pads, onnx_strides, kernel_shape; bool use_auto_pad = false; int32_t nnapi_padding_code = ANEURALNETWORKS_PADDING_VALID; const auto& input_shape = shaper[input]; if (is_average_pool || op_type == "MaxPool") { const auto auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - kernel_shape = helper.Get("kernel_shape", vector{0, 0}); - onnx_strides = helper.Get("strides", vector{1, 1}); - onnx_pads = helper.Get("pads", vector{0, 0, 0, 0}); + kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); + onnx_strides = helper.Get("strides", std::vector{1, 1}); + onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); const auto weight_size_y = static_cast(kernel_shape[0]); const auto weight_size_x = static_cast(kernel_shape[1]); ORT_RETURN_IF_ERROR( @@ -1286,14 +1284,14 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { // (op_type == "GlobalAveragePool" || op_type == "GlobalMaxPool") use_auto_pad = true; nnapi_padding_code = ANEURALNETWORKS_PADDING_VALID; - onnx_strides = vector{1, 1}; - onnx_pads = vector{0, 0, 0, 0}; + onnx_strides = std::vector{1, 1}; + onnx_pads = std::vector{0, 0, 0, 0}; if (use_nchw) { - kernel_shape = vector{static_cast(input_shape[2]), - static_cast(input_shape[3])}; + kernel_shape = std::vector{static_cast(input_shape[2]), + static_cast(input_shape[3])}; } else { - kernel_shape = vector{static_cast(input_shape[1]), - static_cast(input_shape[2])}; + kernel_shape = std::vector{static_cast(input_shape[1]), + static_cast(input_shape[2])}; } } @@ -1403,15 +1401,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // onnx strides are in the order height, width // while nnapi strides are in the order 
width, height - const auto onnx_strides = helper.Get("strides", vector{1, 1}); + const auto onnx_strides = helper.Get("strides", std::vector{1, 1}); // onnx pads are in the order top, left, bottom, right // while nnapi pads is in the order left, right, top, bottom - auto onnx_pads = helper.Get("pads", vector{0, 0, 0, 0}); + auto onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); // onnx dilations is in the order height, width // while nnapi dilations are in the order width, height - const auto onnx_dilations = helper.Get("dilations", vector{1, 1}); + const auto onnx_dilations = helper.Get("dilations", std::vector{1, 1}); const auto group = helper.Get("group", 1); size_t x_idx = 0, @@ -1446,7 +1444,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N y_zero_point = 0; // this is for per-channel quantization weights - optional> w_scales; + optional> w_scales; bool is_per_tensor_u8s8 = false; if (is_qlinear_conv) { ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, @@ -1517,11 +1515,11 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N const auto& weight_type = operand_types.at(weight).type; if (weight_type == Type::TENSOR_FLOAT32) { - vector buffer(bias_dimen[0], 0.0f); + std::vector buffer(bias_dimen[0], 0.0f); OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen, x_scale * w_scale); ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type)); } else if (weight_type == Type::TENSOR_QUANT8_ASYMM || weight_type == Type::TENSOR_QUANT8_SYMM_PER_CHANNEL) { - vector buffer(bias_dimen[0], 0); + std::vector buffer(bias_dimen[0], 0); OperandType bias_operand_type(Type::TENSOR_INT32, bias_dimen, x_scale * w_scale); ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type)); } else { @@ -1819,7 +1817,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N bool is_per_tensor_u8s8 = false; if (is_qlinear_matmul) { - optional> w_scales; + optional> w_scales; ORT_RETURN_IF_ERROR( GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, a_scale, b_scale, y_scale, @@ -2118,7 +2116,7 @@ class SqueezeOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; - static Status GetAxes(ModelBuilder& model_builder, const Node& node, vector& axes); + static Status GetAxes(ModelBuilder& model_builder, const Node& node, std::vector& axes); }; void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2129,7 +2127,7 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } /* static */ Status SqueezeOpBuilder::GetAxes(ModelBuilder& model_builder, - const Node& node, vector& axes) { + const Node& node, std::vector& axes) { // Squeeze opset 13 use input as axes if (node.SinceVersion() > 12) { // If axes is not supplied, return an empty axes as default to squeeze all @@ -2148,7 +2146,7 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } } else { NodeAttrHelper helper(node); - axes = helper.Get("axes", vector()); + axes = helper.Get("axes", std::vector()); } return Status::OK(); @@ -2163,7 +2161,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, 0, input)); } - vector 
axes; + std::vector axes; ORT_RETURN_IF_ERROR(GetAxes(model_builder, node, axes)); return AddSqueezeOp(model_builder, node.Name(), input, node.OutputDefs()[0]->Name(), axes); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index ca2ba5e90fb93..331b2d2e9e745 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -12,9 +12,6 @@ #include "helper.h" #include "op_support_checker.h" -using onnxruntime::NodeUnit; -using std::vector; - namespace onnxruntime { namespace nnapi { @@ -777,8 +774,8 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - const auto onnx_dilations = helper.Get("dilations", vector{1, 1}); - if (onnx_dilations != vector{1, 1}) { + const auto onnx_dilations = helper.Get("dilations", std::vector{1, 1}); + if (onnx_dilations != std::vector{1, 1}) { if (group != 1 && tensor.dims()[1] != 1) { LOGS_DEFAULT(VERBOSE) << "dilation is not supported on grouped conv"; return false; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc index 9d675ecfa84c0..2bcc167622b3e 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc @@ -9,14 +9,11 @@ namespace onnxruntime { namespace nnapi { -using std::string; -using std::vector; - std::pair ComputeConvOutputShape(const uint32_t input_size_y, const uint32_t input_size_x, const uint32_t weight_size_y, const uint32_t weight_size_x, - const vector& onnx_pads, - const vector& onnx_strides, - const vector& onnx_dilations) { + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations) { int32_t padding_top = onnx_pads[0]; int32_t padding_bottom = onnx_pads[2]; int32_t padding_left = onnx_pads[1]; @@ -53,9 +50,9 @@ std::pair ComputeConvOutputShape(const uint32_t input_size_y Status Shaper::Conv(const std::string& input_name, const std::string& weight_name, - const vector& onnx_pads, - const vector& onnx_strides, - const vector& onnx_dilations, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, bool nchw, const std::string& output_name) { SHAPER_FUNC(Conv, @@ -150,9 +147,9 @@ Status Shaper::ResizeUsingOutputSizes(const std::string& input_name, Status Shaper::ConvImpl(const std::string& input_name, const std::string& weight_name, - const vector& onnx_pads, - const vector& onnx_strides, - const vector& onnx_dilations, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, bool nchw, const std::string& output_name) { const Shape& input_dimen = shape_map_.at(input_name); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc index fa876a7ef6bca..150db8f593e11 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc @@ -21,8 +21,6 @@ #include "core/providers/nnapi/nnapi_builtin/model.h" #endif -using onnxruntime::NodeUnit; - namespace onnxruntime { namespace { From e0f844c477099b8bcacde90319261b4a61e2df24 Mon Sep 17 
00:00:00 2001 From: Guoyu Wang Date: Fri, 17 Dec 2021 11:46:15 -0800 Subject: [PATCH 02/23] add qlinear iodef generation --- .../providers/shared/node_unit/node_unit.cc | 160 ++++++++++++++++-- 1 file changed, 147 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.cc b/onnxruntime/core/providers/shared/node_unit/node_unit.cc index 80492ce701eff..49133f5d66bf0 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.cc +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.cc @@ -6,6 +6,81 @@ namespace onnxruntime { +namespace { + +// The QLinearOpType GetQLinearOpType, is very similar to the one in NNAPI +// However, the NNAPI ones are only the subset of the ones here, +// TODO, make these shared +enum class QLinearOpType : uint8_t { + Unknown, // Unknown or not a linear quantized op + DequantizeLinear, + QuantizeLinear, + QLinearConv, + QLinearMatMul, + QLinearAdd, + QLinearSigmoid, + QLinearAveragePool, + QLinearMul, + QLinearReduceMean, + QLinearConcat, + QLinearGlobalAveragePool, + QLinearLeakyRelu, +}; + +QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) { + const auto& op_type = node.OpType(); + if (op_type == "DequantizeLinear") + return QLinearOpType::DequantizeLinear; + else if (op_type == "QuantizeLinear") + return QLinearOpType::QuantizeLinear; + else if (op_type == "QLinearConv") + return QLinearOpType::QLinearConv; + else if (op_type == "QLinearMatMul") + return QLinearOpType::QLinearMatMul; + else if (op_type == "QLinearAdd") + return QLinearOpType::QLinearAdd; + else if (op_type == "QLinearSigmoid") + return QLinearOpType::QLinearSigmoid; + else if (op_type == "QLinearAveragePool") + return QLinearOpType::QLinearAveragePool; + else if (op_type == "QLinearMul") + return QLinearOpType::QLinearMul; + else if (op_type == "QLinearReduceMean") + return QLinearOpType::QLinearReduceMean; + else if (op_type == "QLinearConcat") + return QLinearOpType::QLinearConcat; + else if (op_type == "QLinearGlobalAveragePool") + return QLinearOpType::QLinearGlobalAveragePool; + else if (op_type == "QLinearLeakyRelu") + return QLinearOpType::QLinearLeakyRelu; + + return QLinearOpType::Unknown; +} + +// Ops have 1 input +bool IsUnaryQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearSigmoid || + type == QLinearOpType::QLinearAveragePool || + type == QLinearOpType::QLinearGlobalAveragePool || + type == QLinearOpType::QLinearLeakyRelu || + type == QLinearOpType::QLinearReduceMean; +} + +// Ops have 2 inputs +bool IsBinaryQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearConv || + type == QLinearOpType::QLinearMatMul || + type == QLinearOpType::QLinearAdd || + type == QLinearOpType::QLinearMul; +} + +// Ops have 1 or more inputs +bool IsVariadicQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearConcat; +} + +} // namespace + NodeUnit::NodeUnit(const Node& node) : nodes_{&node}, node_(node), @@ -26,20 +101,79 @@ void NodeUnit::InitForNode() { const auto& output_defs = node_.OutputDefs(); // The 1st step is to hookup the NodeUnit with the NNAPI builder interface // So we are not handling quantization here now - // TODO, enable quantization - // auto qlinear_type = GetQLinearOpType(node_); - // if (qlinear_type == QLinearOpType::Unknown) { - // Not a Qlinear op, add all inputs/outputs - auto add_all_io = [](std::vector& defs, - const ConstPointerContainer>& node_defs) { - defs.reserve(node_defs.size()); - - for (const auto def : node_defs) { - 
defs.push_back(NodeUnit::IODef{*def, std::nullopt}); + auto qlinear_type = GetQLinearOpType(node_); + if (qlinear_type == QLinearOpType::Unknown) { + //Not a Qlinear op, add all inputs / outputs + auto add_all_io = [](std::vector& defs, + const ConstPointerContainer>& node_defs) { + defs.reserve(node_defs.size()); + + for (const auto def : node_defs) { + defs.push_back(NodeUnit::IODef{*def, std::nullopt}); + } + }; + add_all_io(input_defs_, input_defs); + add_all_io(output_defs_, output_defs); + } else if (IsUnaryQLinearOp(qlinear_type)) { + // Unary QLinear Op has 5 inputs + // x, x_scale, x_zp, y_scale, y_zp (optional) + input_defs_.push_back(NodeUnit::IODef{ + *input_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[1], input_defs[2]}}); + + output_defs_.push_back(NodeUnit::IODef{ + *output_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[3], + input_defs_.size() > 4 + ? input_defs[4] + : nullptr}}); + } else if (IsBinaryQLinearOp(qlinear_type)) { + // Binary QLinear Op has 9 inputs + // x1, x1_scale, x1_zp, x2/w, x2_scale, x2_zp, y_scale , y_zp, B + input_defs_.push_back(NodeUnit::IODef{ + *input_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[1], input_defs[2]}}); + input_defs_.push_back(NodeUnit::IODef{ + *input_defs[3], + NodeUnit::IODef::QuantParam{*input_defs[4], input_defs[5]}}); + + if (input_defs_.size() == 9) { // has Bias + input_defs_.push_back(NodeUnit::IODef{ + *input_defs[8], + std::nullopt}); // for Bias the scale and zp are optional } - }; - add_all_io(input_defs_, input_defs); - add_all_io(output_defs_, output_defs); + + output_defs_.push_back(NodeUnit::IODef{ + *output_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[6], input_defs[7]}}); + } else if (IsVariadicQLinearOp(qlinear_type)) { + // TODO, add variadic support + ORT_NOT_IMPLEMENTED(); + } else if (qlinear_type == QLinearOpType::DequantizeLinear) { + // DequantizeLinear has 3 inputs + // x, x_scale, x_zp + // output is not quantized + input_defs_.push_back(NodeUnit::IODef{ + *input_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[1], + input_defs_.size() == 3 + ? input_defs[2] + : nullptr}}); + output_defs_.push_back(NodeUnit::IODef{*output_defs[0], std::nullopt}); + } else if (qlinear_type == QLinearOpType::QuantizeLinear) { + // QuantizeLinear the input is not quantized and has 3 inputs + // x, y_scale, y_zp (optional) + // The output is quantized + input_defs_.push_back(NodeUnit::IODef{*input_defs[0], std::nullopt}); + output_defs_.push_back(NodeUnit::IODef{ + *output_defs[0], + NodeUnit::IODef::QuantParam{*input_defs[1], + input_defs_.size() == 3 + ? 
input_defs[2] + : nullptr}}); + } else { + ORT_THROW("The QLinear op [", static_cast(qlinear_type), "] is not supported"); + } } } // namespace onnxruntime From 72f0324465dfb6206b433bff63253e23ad1a3525 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Fri, 17 Dec 2021 15:47:22 -0800 Subject: [PATCH 03/23] Move iodef out of nodeunit --- .../providers/shared/node_unit/node_unit.cc | 56 +++++++++---------- .../providers/shared/node_unit/node_unit.h | 36 ++++++------ 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.cc b/onnxruntime/core/providers/shared/node_unit/node_unit.cc index 49133f5d66bf0..340662e66b6e4 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.cc +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.cc @@ -104,12 +104,12 @@ void NodeUnit::InitForNode() { auto qlinear_type = GetQLinearOpType(node_); if (qlinear_type == QLinearOpType::Unknown) { //Not a Qlinear op, add all inputs / outputs - auto add_all_io = [](std::vector& defs, + auto add_all_io = [](std::vector& defs, const ConstPointerContainer>& node_defs) { defs.reserve(node_defs.size()); for (const auto def : node_defs) { - defs.push_back(NodeUnit::IODef{*def, std::nullopt}); + defs.push_back(NodeUnitIODef{*def, std::nullopt}); } }; add_all_io(input_defs_, input_defs); @@ -117,35 +117,35 @@ void NodeUnit::InitForNode() { } else if (IsUnaryQLinearOp(qlinear_type)) { // Unary QLinear Op has 5 inputs // x, x_scale, x_zp, y_scale, y_zp (optional) - input_defs_.push_back(NodeUnit::IODef{ + input_defs_.push_back(NodeUnitIODef{ *input_defs[0], - NodeUnit::IODef::QuantParam{*input_defs[1], input_defs[2]}}); + NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); - output_defs_.push_back(NodeUnit::IODef{ + output_defs_.push_back(NodeUnitIODef{ *output_defs[0], - NodeUnit::IODef::QuantParam{*input_defs[3], - input_defs_.size() > 4 - ? input_defs[4] - : nullptr}}); + NodeUnitIODef::QuantParam{*input_defs[3], + input_defs_.size() > 4 + ? input_defs[4] + : nullptr}}); } else if (IsBinaryQLinearOp(qlinear_type)) { // Binary QLinear Op has 9 inputs // x1, x1_scale, x1_zp, x2/w, x2_scale, x2_zp, y_scale , y_zp, B - input_defs_.push_back(NodeUnit::IODef{ + input_defs_.push_back(NodeUnitIODef{ *input_defs[0], - NodeUnit::IODef::QuantParam{*input_defs[1], input_defs[2]}}); - input_defs_.push_back(NodeUnit::IODef{ + NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); + input_defs_.push_back(NodeUnitIODef{ *input_defs[3], - NodeUnit::IODef::QuantParam{*input_defs[4], input_defs[5]}}); + NodeUnitIODef::QuantParam{*input_defs[4], input_defs[5]}}); if (input_defs_.size() == 9) { // has Bias - input_defs_.push_back(NodeUnit::IODef{ + input_defs_.push_back(NodeUnitIODef{ *input_defs[8], std::nullopt}); // for Bias the scale and zp are optional } - output_defs_.push_back(NodeUnit::IODef{ + output_defs_.push_back(NodeUnitIODef{ *output_defs[0], - NodeUnit::IODef::QuantParam{*input_defs[6], input_defs[7]}}); + NodeUnitIODef::QuantParam{*input_defs[6], input_defs[7]}}); } else if (IsVariadicQLinearOp(qlinear_type)) { // TODO, add variadic support ORT_NOT_IMPLEMENTED(); @@ -153,24 +153,24 @@ void NodeUnit::InitForNode() { // DequantizeLinear has 3 inputs // x, x_scale, x_zp // output is not quantized - input_defs_.push_back(NodeUnit::IODef{ + input_defs_.push_back(NodeUnitIODef{ *input_defs[0], - NodeUnit::IODef::QuantParam{*input_defs[1], - input_defs_.size() == 3 - ? 
input_defs[2]
-                                        : nullptr}});
-    output_defs_.push_back(NodeUnit::IODef{*output_defs[0], std::nullopt});
+        NodeUnitIODef::QuantParam{*input_defs[1],
+                                  input_defs_.size() == 3
+                                      ? input_defs[2]
+                                      : nullptr}});
+    output_defs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});
   } else if (qlinear_type == QLinearOpType::QuantizeLinear) {
     // QuantizeLinear the input is not quantized and has 3 inputs
     // x, y_scale, y_zp (optional)
     // The output is quantized
-    input_defs_.push_back(NodeUnit::IODef{*input_defs[0], std::nullopt});
-    output_defs_.push_back(NodeUnit::IODef{
+    input_defs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
+    output_defs_.push_back(NodeUnitIODef{
         *output_defs[0],
-        NodeUnit::IODef::QuantParam{*input_defs[1],
-                                    input_defs_.size() == 3
-                                        ? input_defs[2]
-                                        : nullptr}});
+        NodeUnitIODef::QuantParam{*input_defs[1],
+                                  input_defs_.size() == 3
+                                      ? input_defs[2]
+                                      : nullptr}});
   } else {
     ORT_THROW("The QLinear op [", static_cast<uint8_t>(qlinear_type), "] is not supported");
   }
 }
diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/providers/shared/node_unit/node_unit.h
index d94b0e2fc55e7..0e4e9687f6c98 100644
--- a/onnxruntime/core/providers/shared/node_unit/node_unit.h
+++ b/onnxruntime/core/providers/shared/node_unit/node_unit.h
@@ -21,6 +21,20 @@ namespace QDQ {
 struct NodeGroup;
 }
 
+// Definition of one input or output
+// If the optional quant_param is present, then this is a quantized input,
+// otherwise this is a regular input
+struct NodeUnitIODef {
+  // The quantization parmeter, scale is manadatory, and zero_point is optional
+  struct QuantParam {
+    const NodeArg& scale;
+    const NodeArg* zero_point{nullptr};
+  };
+
+  const NodeArg& node_arg;
+  const std::optional<QuantParam> quant_param;
+};
+
 /**
 @class NodeUnit
 Class to represent a single node or a QDQ group of nodes, which will be used as a single unit.
@@ -33,27 +47,13 @@ class NodeUnit { QDQGroup, // The NodeUnit contain a QDQ group of nodes, such as "DQ->Sigmoid->Q" }; - // Definition of one input or output - // If the optional quant_param is present, then this is a quantized input, - // otherwise this is a regular input - struct IODef { - // The quantization parmeter, scale is manadatory, and zero_point is optional - struct QuantParam { - const NodeArg& scale; - const NodeArg* zero_point{nullptr}; - }; - - const NodeArg& node_arg; - const std::optional quant_param; - }; - public: explicit NodeUnit(const Node& node); Type UnitType() const noexcept { return type_; } - const std::vector& Inputs() const noexcept { return input_defs_; } - const std::vector& Outputs() const noexcept { return output_defs_; } + const std::vector& Inputs() const noexcept { return input_defs_; } + const std::vector& Outputs() const noexcept { return output_defs_; } const std::string& Domain() const noexcept; const std::string& OpType() const noexcept; @@ -68,8 +68,8 @@ class NodeUnit { const std::vector GetAllNodes() const noexcept { return nodes_; } private: - std::vector input_defs_; - std::vector output_defs_; + std::vector input_defs_; + std::vector output_defs_; const std::vector nodes_; // all nodes in this NodeUnit const Node& node_; // target Node From b55527b9a98fa4be51ea2db4b2cb47914471fe6a Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Fri, 17 Dec 2021 22:02:36 -0800 Subject: [PATCH 04/23] Move AddInitializersToSkip to use iodef --- .../nnapi_builtin/builders/op_builder.cc | 213 +++++++++--------- 1 file changed, 107 insertions(+), 106 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 45f4c975843bd..17acf39a4635c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -654,15 +654,17 @@ static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, return Status::OK(); } -static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - model_builder.AddInitializerToSkip(input_defs[1]->Name()); // a_scale - model_builder.AddInitializerToSkip(input_defs[2]->Name()); // a_zero_point - model_builder.AddInitializerToSkip(input_defs[4]->Name()); // b_scale - model_builder.AddInitializerToSkip(input_defs[5]->Name()); // b_zero_point - model_builder.AddInitializerToSkip(input_defs[6]->Name()); // y_scale - model_builder.AddInitializerToSkip(input_defs[7]->Name()); // y_zero_point +static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { + // If we reach here, we assume the io_def has quant_param + model_builder.AddInitializerToSkip(io_def.quant_param->scale.Name()); // scale + if (io_def.quant_param->zero_point) { + model_builder.AddInitializerToSkip(io_def.quant_param->zero_point->Name()); // zero_point + } +} + +static void AddInputToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { + model_builder.AddInitializerToSkip(io_def.node_arg.Name()); // main input + AddQuantizationScaleAndZeroPointToSkip(model_builder, io_def); } Status GetQuantizedInputScaleAndZeroPoint(const InitializedTensorSet& initializers, @@ -759,14 +761,23 @@ class BinaryOpBuilder : public BaseOpBuilder { static void 
CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: + static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; +/* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { + // TODO, add support for QDQ NodeUnit + return node_unit.OpType() == "QLinearAdd"; +} + void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& op = node_unit.OpType(); - if (op == "QLinearAdd") { - AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node_unit); - } + if (!IsQuantizedOp(node_unit)) + return; + + const auto& inputs = node_unit.Inputs(); + AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // a_scale, a_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[1]); // b_scale, b_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp } /* static */ void BinaryOpBuilder::CreateSharedOpBuilder( @@ -945,8 +956,7 @@ class ReshapeOpBuilder : public BaseOpBuilder { }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); + model_builder.AddInitializerToSkip(node_unit.Inputs()[1].node_arg.Name()); } // We can skip the Reshape if all the output edges satisfies both the following conditions @@ -1089,12 +1099,11 @@ class BatchNormalizationOpBuilder : public BaseOpBuilder { }; void BatchNormalizationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); // skip everything except input0 for BatchNormalization - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); // scale - model_builder.AddInitializerToSkip(node.InputDefs()[2]->Name()); // B - model_builder.AddInitializerToSkip(node.InputDefs()[3]->Name()); // mean - model_builder.AddInitializerToSkip(node.InputDefs()[4]->Name()); //var + model_builder.AddInitializerToSkip(node_unit.Inputs()[1].node_arg.Name()); // scale + model_builder.AddInitializerToSkip(node_unit.Inputs()[2].node_arg.Name()); // B + model_builder.AddInitializerToSkip(node_unit.Inputs()[3].node_arg.Name()); // mean + model_builder.AddInitializerToSkip(node_unit.Inputs()[4].node_arg.Name()); //var } Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -1199,24 +1208,22 @@ class PoolOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: + static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; +/* static */ bool PoolOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { + // TODO, add support for QDQ NodeUnit + return node_unit.OpType() == "QLinearAveragePool"; +} + void PoolOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto& op = node.OpType(); - if (op != "QLinearAveragePool") + if (!IsQuantizedOp(node_unit)) return; - const 
auto input_defs = node.InputDefs(); - // skip input/output scales and zeropoints - model_builder.AddInitializerToSkip(input_defs[1]->Name()); // X_scale - model_builder.AddInitializerToSkip(input_defs[2]->Name()); // X_zero_point - model_builder.AddInitializerToSkip(input_defs[3]->Name()); // Y_scale - - if (input_defs.size() == 5) // has Y_zero_point input - model_builder.AddInitializerToSkip(input_defs[4]->Name()); // Y_zero_point + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp } /* static */ void PoolOpBuilder::CreateSharedOpBuilder( @@ -1359,10 +1366,17 @@ class ConvOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: + static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; -/* static */ void ConvOpBuilder::CreateSharedOpBuilder( +/* static */ bool ConvOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { + // TODO, add support for QDQ NodeUnit + return node_unit.OpType() == "QLinearConv"; +} + +/* static */ void +ConvOpBuilder::CreateSharedOpBuilder( const std::string& op_type, OpBuilderRegistrations& op_registrations) { CreateSharedOpBuilderImpl( op_type, op_registrations, @@ -1373,18 +1387,16 @@ class ConvOpBuilder : public BaseOpBuilder { } void ConvOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto& op = node.OpType(); - const auto input_defs = node.InputDefs(); - + const auto& inputs = node_unit.Inputs(); // skip the weight for conv as we need to transpose - if (op == "QLinearConv") { - AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node_unit); - model_builder.AddInitializerToSkip(input_defs[3]->Name()); // w - if (input_defs.size() > 8) - model_builder.AddInitializerToSkip(input_defs[8]->Name()); // B + if (IsQuantizedOp(node_unit)) { + AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // x_scale, x_zp + AddInputToSkip(model_builder, inputs[1]); // w, w_scale, w_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + if (inputs.size() > 2) + AddInputToSkip(model_builder, inputs[2]); // B, B_scale, B_zp } else { - model_builder.AddInitializerToSkip(input_defs[1]->Name()); // w + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); // w } } @@ -1396,8 +1408,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N const auto& initializers(model_builder.GetInitializerTensors()); NodeAttrHelper helper(node); const auto input_defs = node.InputDefs(); - const auto& op_type = node.OpType(); - bool is_qlinear_conv = (op_type == "QLinearConv"); + bool is_qlinear_conv = IsQuantizedOp(node_unit); // onnx strides are in the order height, width // while nnapi strides are in the order width, height @@ -1754,9 +1765,15 @@ class GemmOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: + static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) 
const override ORT_MUST_USE_RESULT; }; +/* static */ bool GemmOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { + // TODO, add support for QDQ NodeUnit + return node_unit.OpType() == "QLinearMatMul"; +} + /* static */ void GemmOpBuilder::CreateSharedOpBuilder( const std::string& op_type, OpBuilderRegistrations& op_registrations) { CreateSharedOpBuilderImpl( @@ -1769,20 +1786,22 @@ class GemmOpBuilder : public BaseOpBuilder { } void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - - const auto& op = node.OpType(); - const auto input_defs(node.InputDefs()); - if (op == "MatMul") { - model_builder.AddInitializerToSkip(input_defs[1]->Name()); - } else if (op == "Gemm") { - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (transB == 0) - model_builder.AddInitializerToSkip(input_defs[1]->Name()); - } else if (op == "QLinearMatMul") { - AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node_unit); - model_builder.AddInitializerToSkip(input_defs[3]->Name()); // b + const auto& inputs = node_unit.Inputs(); + if (IsQuantizedOp(node_unit)) { + AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // b_scale, b_zp + AddInputToSkip(model_builder, inputs[1]); // b, b_scale, b_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + } else { + const auto& op = node_unit.OpType(); + const auto& inputs = node_unit.Inputs(); + if (op == "MatMul") { + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); + } else if (op == "Gemm") { + NodeAttrHelper helper(node_unit.GetNode()); + const auto transB = helper.Get("transB", 0); + if (transB == 0) + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); + } } } @@ -1913,24 +1932,21 @@ class UnaryOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: + static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; +/* static */ bool UnaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { + // TODO, add support for QDQ NodeUnit + return node_unit.OpType() == "QLinearSigmoid"; +} + void UnaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto& op = node.OpType(); - if (op != "QLinearSigmoid") + if (!IsQuantizedOp(node_unit)) return; - const auto input_defs = node.InputDefs(); - - // skip input/output scales and zeropoints - model_builder.AddInitializerToSkip(input_defs[1]->Name()); // X_scale - model_builder.AddInitializerToSkip(input_defs[2]->Name()); // X_zero_point - model_builder.AddInitializerToSkip(input_defs[3]->Name()); // Y_scale - - if (input_defs.size() == 5) // has Y_zero_point input - model_builder.AddInitializerToSkip(input_defs[4]->Name()); // Y_zero_point + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp } /* static */ void UnaryOpBuilder::CreateSharedOpBuilder( @@ -2120,9 +2136,8 @@ class SqueezeOpBuilder : public BaseOpBuilder { }; void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) 
const { - const auto& node = node_unit.GetNode(); - if (node.SinceVersion() > 12 && node.InputDefs().size() > 1) { - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); + if (node_unit.SinceVersion() > 12 && node_unit.Inputs().size() > 1) { + model_builder.AddInitializerToSkip(node_unit.Inputs()[1].node_arg.Name()); } } @@ -2179,13 +2194,7 @@ class QuantizeLinearOpBuilder : public BaseOpBuilder { }; void QuantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - - model_builder.AddInitializerToSkip(input_defs[1]->Name()); - - if (input_defs.size() == 3) // has zero_point input - model_builder.AddInitializerToSkip(input_defs[2]->Name()); + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp } Status QuantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2229,13 +2238,7 @@ class DequantizeLinearOpBuilder : public BaseOpBuilder { }; void DequantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - - model_builder.AddInitializerToSkip(input_defs[1]->Name()); - - if (input_defs.size() == 3) // has zero_point input - model_builder.AddInitializerToSkip(input_defs[2]->Name()); + AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp } Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2340,12 +2343,12 @@ class ClipOpBuilder : public BaseOpBuilder { }; void ClipOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - if (node.InputDefs().size() > 1) - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); // min + const auto& inputs = node_unit.Inputs(); + if (inputs.size() > 1) + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); // min - if (node.InputDefs().size() > 2) - model_builder.AddInitializerToSkip(node.InputDefs()[2]->Name()); // max + if (inputs.size() > 2) + model_builder.AddInitializerToSkip(inputs[2].node_arg.Name()); // max } Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2400,16 +2403,16 @@ class ResizeOpBuilder : public BaseOpBuilder { }; void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); + const auto& inputs = node_unit.Inputs(); // We don't really use ROI here, so add them to skipped list - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); // ROI + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); // ROI // We will still add scales to the skipped list even sizes are present // since there is no use of it, we will not process it later - model_builder.AddInitializerToSkip(node.InputDefs()[2]->Name()); // scales + model_builder.AddInitializerToSkip(inputs[2].node_arg.Name()); // scales - if (node.InputDefs().size() > 3) - model_builder.AddInitializerToSkip(node.InputDefs()[3]->Name()); // sizes + if (inputs.size() > 3) + model_builder.AddInitializerToSkip(inputs[3].node_arg.Name()); // sizes } Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& 
node_unit) const { @@ -2641,16 +2644,14 @@ class SliceOpBuilder : public BaseOpBuilder { }; void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - // Skip everything except input0 for Slice - const auto input_defs = node.InputDefs(); - model_builder.AddInitializerToSkip(input_defs[1]->Name()); // starts - model_builder.AddInitializerToSkip(input_defs[2]->Name()); // ends - if (input_defs.size() > 3) { - model_builder.AddInitializerToSkip(input_defs[3]->Name()); // axes - if (input_defs.size() > 4) { - model_builder.AddInitializerToSkip(input_defs[4]->Name()); // steps + const auto& inputs = node_unit.Inputs(); + model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); // starts + model_builder.AddInitializerToSkip(inputs[2].node_arg.Name()); // ends + if (inputs.size() > 3) { + model_builder.AddInitializerToSkip(inputs[3].node_arg.Name()); // axes + if (inputs.size() > 4) { + model_builder.AddInitializerToSkip(inputs[4].node_arg.Name()); // steps } } } From 5252da79b9bfe93aae4d45f9f6bb5fbe82a079cc Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Sat, 18 Dec 2021 19:35:00 -0800 Subject: [PATCH 05/23] minor fix --- .../nnapi_builtin/builders/op_builder.cc | 5 ++- .../providers/shared/node_unit/node_unit.cc | 40 +++++++++---------- .../providers/shared/node_unit/node_unit.h | 8 ++-- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 17acf39a4635c..c4e1f103a0212 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -657,14 +657,17 @@ static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { // If we reach here, we assume the io_def has quant_param model_builder.AddInitializerToSkip(io_def.quant_param->scale.Name()); // scale + LOGS_DEFAULT(VERBOSE) << io_def.quant_param->scale.Name() << "is skipped"; if (io_def.quant_param->zero_point) { model_builder.AddInitializerToSkip(io_def.quant_param->zero_point->Name()); // zero_point + LOGS_DEFAULT(VERBOSE) << io_def.quant_param->zero_point->Name() << "is skipped"; } } static void AddInputToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { model_builder.AddInitializerToSkip(io_def.node_arg.Name()); // main input - AddQuantizationScaleAndZeroPointToSkip(model_builder, io_def); + if (io_def.quant_param) + AddQuantizationScaleAndZeroPointToSkip(model_builder, io_def); } Status GetQuantizedInputScaleAndZeroPoint(const InitializedTensorSet& initializers, diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.cc b/onnxruntime/core/providers/shared/node_unit/node_unit.cc index 340662e66b6e4..3d761dfbe971f 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.cc +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.cc @@ -102,8 +102,9 @@ void NodeUnit::InitForNode() { // The 1st step is to hookup the NodeUnit with the NNAPI builder interface // So we are not handling quantization here now auto qlinear_type = GetQLinearOpType(node_); - if (qlinear_type == QLinearOpType::Unknown) { - //Not a Qlinear op, add all inputs / outputs + if (qlinear_type == QLinearOpType::Unknown || + IsVariadicQLinearOp(qlinear_type)) { // TODO, 
add variadic support + // Not a Qlinear op, add all inputs / outputs auto add_all_io = [](std::vector& defs, const ConstPointerContainer>& node_defs) { defs.reserve(node_defs.size()); @@ -112,63 +113,60 @@ void NodeUnit::InitForNode() { defs.push_back(NodeUnitIODef{*def, std::nullopt}); } }; - add_all_io(input_defs_, input_defs); - add_all_io(output_defs_, output_defs); + add_all_io(inputs_, input_defs); + add_all_io(outputs_, output_defs); } else if (IsUnaryQLinearOp(qlinear_type)) { // Unary QLinear Op has 5 inputs // x, x_scale, x_zp, y_scale, y_zp (optional) - input_defs_.push_back(NodeUnitIODef{ + inputs_.push_back(NodeUnitIODef{ *input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); - output_defs_.push_back(NodeUnitIODef{ + outputs_.push_back(NodeUnitIODef{ *output_defs[0], NodeUnitIODef::QuantParam{*input_defs[3], - input_defs_.size() > 4 + input_defs.size() > 4 ? input_defs[4] : nullptr}}); } else if (IsBinaryQLinearOp(qlinear_type)) { // Binary QLinear Op has 9 inputs // x1, x1_scale, x1_zp, x2/w, x2_scale, x2_zp, y_scale , y_zp, B - input_defs_.push_back(NodeUnitIODef{ + inputs_.push_back(NodeUnitIODef{ *input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); - input_defs_.push_back(NodeUnitIODef{ + inputs_.push_back(NodeUnitIODef{ *input_defs[3], NodeUnitIODef::QuantParam{*input_defs[4], input_defs[5]}}); - if (input_defs_.size() == 9) { // has Bias - input_defs_.push_back(NodeUnitIODef{ + if (input_defs.size() == 9) { // has Bias + inputs_.push_back(NodeUnitIODef{ *input_defs[8], std::nullopt}); // for Bias the scale and zp are optional } - output_defs_.push_back(NodeUnitIODef{ + outputs_.push_back(NodeUnitIODef{ *output_defs[0], NodeUnitIODef::QuantParam{*input_defs[6], input_defs[7]}}); - } else if (IsVariadicQLinearOp(qlinear_type)) { - // TODO, add variadic support - ORT_NOT_IMPLEMENTED(); } else if (qlinear_type == QLinearOpType::DequantizeLinear) { // DequantizeLinear has 3 inputs // x, x_scale, x_zp // output is not quantized - input_defs_.push_back(NodeUnitIODef{ + inputs_.push_back(NodeUnitIODef{ *input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], - input_defs_.size() == 3 + input_defs.size() == 3 ? input_defs[2] : nullptr}}); - output_defs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt}); } else if (qlinear_type == QLinearOpType::QuantizeLinear) { // QuantizeLinear the input is not quantized and has 3 inputs // x, y_scale, y_zp (optional) // The output is quantized - input_defs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt}); - output_defs_.push_back(NodeUnitIODef{ + inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt}); + outputs_.push_back(NodeUnitIODef{ *output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], - input_defs_.size() == 3 + input_defs.size() == 3 ? 
input_defs[2] : nullptr}}); } else { diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/providers/shared/node_unit/node_unit.h index 0e4e9687f6c98..1d02854c327b4 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.h +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.h @@ -52,8 +52,8 @@ class NodeUnit { Type UnitType() const noexcept { return type_; } - const std::vector& Inputs() const noexcept { return input_defs_; } - const std::vector& Outputs() const noexcept { return output_defs_; } + const std::vector& Inputs() const noexcept { return inputs_; } + const std::vector& Outputs() const noexcept { return outputs_; } const std::string& Domain() const noexcept; const std::string& OpType() const noexcept; @@ -68,8 +68,8 @@ class NodeUnit { const std::vector GetAllNodes() const noexcept { return nodes_; } private: - std::vector input_defs_; - std::vector output_defs_; + std::vector inputs_; + std::vector outputs_; const std::vector nodes_; // all nodes in this NodeUnit const Node& node_; // target Node From 6189bb16234f970995126a184b39d3eec7e15ccb Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 3 Jan 2022 15:20:39 -0800 Subject: [PATCH 06/23] Move get quantized inputs to NodeUnit --- .../nnapi/nnapi_builtin/builders/helper.cc | 57 ++++++++++++++ .../nnapi/nnapi_builtin/builders/helper.h | 13 +++- .../nnapi_builtin/builders/model_builder.cc | 74 ++++++++++++------- .../nnapi_builtin/builders/model_builder.h | 22 +++++- 4 files changed, 135 insertions(+), 31 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index edbcaf590c379..b26d8cef94f76 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -331,6 +331,63 @@ common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers return Status::OK(); } +common::Status GetQuantizationScaleAndZeroPoint( + const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, + float& scale, int32_t& zero_point) { + scale = 0.0f; + zero_point = 0; + + if (!io_def.quant_param) { // Not a quantized IO + return Status::OK(); + } + + const auto unpack_tensor = [&model_path](const InitializedTensorSet& initializers, + const std::string& name, std::vector& unpacked_tensor) { + unpacked_tensor.clear(); + const auto& tensor = *initializers.at(name); + ORT_RETURN_IF_ERROR( + onnxruntime::utils::UnpackInitializerData(tensor, model_path, unpacked_tensor)); + return Status::OK(); + }; + + const auto& quant_param = *io_def.quant_param; + { // get the scale + std::vector unpacked_tensor; + const auto& name = quant_param.scale.Name(); + ORT_RETURN_IF_ERROR(unpack_tensor(initializers, name, unpacked_tensor)); + // The scale should be one or more floats + ORT_RETURN_IF(unpacked_tensor.size() < 4, + "The initializer [", name, "] should have one or more floats ", + "with size no less than 4, actual size: ", unpacked_tensor.size()); + scale = reinterpret_cast(unpacked_tensor.data())[0]; + } + + if (quant_param.zero_point) { // get the zero point if it's there + std::vector unpacked_tensor; + const auto& name = quant_param.zero_point->Name(); + ORT_RETURN_IF_ERROR(unpack_tensor(initializers, name, unpacked_tensor)); + ORT_RETURN_IF(unpacked_tensor.empty(), "The initializer [", name, "] is empty"); + // Onnx quantization uses uint8 [int8 not yet supported], need 
to cast to int32_t used by NNAPI + zero_point = static_cast(unpacked_tensor[0]); + } + + return Status::OK(); +} + +common::Status GetQuantizationScaleAndZeroPoint( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, + float& scale, int32_t& zero_point, bool is_input) { + const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); + for (const auto& io_def : io_defs) { + if (io_def.node_arg.Name() == name) + return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.GetNode().ModelPath(), + scale, zero_point); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Unknown input: ", name, ", for NodeUnit with node index: ", node_unit.Index()); +} + bool GetShape(const NodeArg& node_arg, Shape& shape) { shape.clear(); const auto* shape_proto = node_arg.Shape(); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index d8d89269c9f55..62d0d8e87915a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -26,10 +26,13 @@ namespace onnxruntime { using Shape = std::vector; using InitializerMap = std::unordered_map; +class GraphViewer; class Node; class NodeArg; class NodeUnit; -class GraphViewer; +class Path; + +struct NodeUnitIODef; namespace nnapi { @@ -117,6 +120,14 @@ common::Status GetQuantizationScale(const InitializedTensorSet& initializers, co common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; +common::Status GetQuantizationScaleAndZeroPoint( + const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, + float& scale, int32_t& zero_point); + +common::Status GetQuantizationScaleAndZeroPoint( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, + float& scale, int32_t& zero_point, bool is_input = true); + // Get Shape/Type of a NodeArg // TODO, move to shared_utils bool GetShape(const NodeArg& node_arg, Shape& shape); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 7136251927e09..8664646a9af5c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -48,13 +48,12 @@ void ModelBuilder::AddInitializerToSkip(const std::string& tensor_name) { skipped_initializers_.insert(tensor_name); } -static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer); - Status ModelBuilder::Prepare() { nnapi_model_ = std::unique_ptr(new Model()); RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_create(&nnapi_model_->model_)); ORT_RETURN_IF_ERROR(GetTargetDevices()); - all_quantized_op_inputs_ = GetAllQuantizedOpInputs(graph_viewer_); + PreprocessNodeUnits(); + GetAllQuantizedOpInputs(); PreprocessInitializers(); PreprocessActivations(); ORT_RETURN_IF_ERROR(RegisterInitializers()); @@ -148,42 +147,66 @@ void ModelBuilder::PreprocessActivations() { } } -// Help to get all quantized operators' input and the node(s) using the input -static std::unordered_map> GetAllQuantizedOpInputs(const GraphViewer& graph_viewer) { - std::unordered_map> all_quantized_op_inputs; - const auto& node_indices = 
graph_viewer.GetNodesInTopologicalOrder(); +const NodeUnit& ModelBuilder::GetNodeUnit(const Node* node) const { + // Do we want to throw here if the node is not in the map? + return *node_unit_map_.at(node); +} + +void ModelBuilder::PreprocessNodeUnits() { + // TODO, hookup shared QDQ selectors here to identify all the qdq NodeUnit in the graph + const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); + for (size_t i = 0; i < node_indices.size(); i++) { + const auto node_idx = node_indices[i]; + // TODO, check if the node is already part of a qdq group + const auto* node(graph_viewer_.GetNode(node_idx)); + auto node_unit = std::make_unique(*node); + node_unit_map_.insert({node, node_unit.get()}); + node_unit_holder_.push_back(std::move(node_unit)); + } +} + +// Help to get all quantized operators' input and the NodeUnit(s) using the input +void ModelBuilder::GetAllQuantizedOpInputs() { + const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); for (const auto& node_idx : node_indices) { - const auto* node(graph_viewer.GetNode(node_idx)); - auto qlinear_op_type = GetQLinearOpType(*node); + const auto* node(graph_viewer_.GetNode(node_idx)); + // TODO check if the node_unit has already been processed + const auto& node_unit = GetNodeUnit(node); + + // TODO, hookup getting quantized inputs with QDQ NodeUnits and remove the ORT_ENFORCE + ORT_ENFORCE(node_unit.UnitType() == NodeUnit::Type::SingleNode, "QDQ NodeUnit is not yet implemented"); + + auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); // Not a qlinear op if (qlinear_op_type == QLinearOpType::Unknown) continue; + const auto add_quantized_input = + [&all_quantized_op_inputs = all_quantized_op_inputs_](const NodeUnit& node_unit, size_t input_idx) { + const auto& input_name = node_unit.Inputs()[input_idx].node_arg.Name(); + if (Contains(all_quantized_op_inputs, input_name)) + all_quantized_op_inputs.at(input_name).push_back(&node_unit); + else + all_quantized_op_inputs.emplace(input_name, std::vector{&node_unit}); + }; + // All qlinear ops EXCEPT QuantizeLinear has quantized input if (qlinear_op_type != QLinearOpType::QuantizeLinear) { - const auto& input_name = node->InputDefs()[0]->Name(); - if (Contains(all_quantized_op_inputs, input_name)) - all_quantized_op_inputs.at(input_name).push_back(node); - else - all_quantized_op_inputs.emplace(input_name, std::vector{node}); + add_quantized_input(node_unit, 0); } if (IsQLinearBinaryOp(qlinear_op_type)) { - const auto& input_name = node->InputDefs()[3]->Name(); - if (Contains(all_quantized_op_inputs, input_name)) - all_quantized_op_inputs.at(input_name).push_back(node); - else - all_quantized_op_inputs.emplace(input_name, std::vector{node}); + add_quantized_input(node_unit, 1); } - } - return all_quantized_op_inputs; + // TODO, add handling for varidiac nodes such as QLinearConcat + } } static Status GetInputDataType( const InitializedTensorSet& initializers, - const std::unordered_map>& all_quantized_op_inputs, + const std::unordered_map>& all_quantized_op_inputs, const std::string& name, int32_t data_type, const Shape& shape, OperandType& operand_type) { Type type = Type::TENSOR_FLOAT32; @@ -206,10 +229,9 @@ static Status GetInputDataType( } // TODO, verify the scale and zero point match if there are multiple op using same input - const auto* node = all_quantized_op_inputs.at(name)[0]; - const NodeUnit node_unit(*node); - ORT_RETURN_IF_ERROR(GetQuantizedInputScaleAndZeroPoint( - initializers, node_unit, name, scale, zero_point)); + const auto* 
node_unit = all_quantized_op_inputs.at(name)[0]; + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, *node_unit, name, scale, zero_point, true /* is_input */)); break; } // case ONNX_NAMESPACE::TensorProto_DataType_INT8: diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index d7dfd78ac0a43..73c99486a959a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -12,6 +12,9 @@ #include "shaper.h" namespace onnxruntime { + +class NodeUnit; + namespace nnapi { class IOpBuilder; @@ -33,7 +36,6 @@ class ModelBuilder { }; ModelBuilder(const GraphViewer& graph_viewer); - ~ModelBuilder() = default; Status Compile(std::unique_ptr& model) ORT_MUST_USE_RESULT; @@ -149,9 +151,14 @@ class ModelBuilder { std::vector input_index_vec_; std::vector output_index_vec_; - // Contains all quantized operators' input and the node(s) using the input - // In the form of {input_name, [node(s) using the input]} - std::unordered_map> all_quantized_op_inputs_; + // Contains all quantized operators' input and the NodeUnit(s) using the input + // In the form of {input_name, [NodeUnit(s) using the input]} + std::unordered_map> all_quantized_op_inputs_; + + // Holder for the NodeUnits in the graph, this will guarantee the NodeUnits is + // valid throughout the lifetime of the ModelBuilder + std::vector> node_unit_holder_; + std::unordered_map node_unit_map_; std::unordered_set unique_names_; @@ -180,6 +187,13 @@ class ModelBuilder { // After constructing the NNAPI model, will set the shape inferencing record to the Model void RegisterModelShaper(); + // Get all quantized inputs in the underlying graph_viewer + void GetAllQuantizedOpInputs(); + // Go through the underlying graph_viewer, and generate NodeUnits + void PreprocessNodeUnits(); + // Get the NodeUnit which contains the given node + const NodeUnit& GetNodeUnit(const Node* node) const; + Status SetOperandValue(uint32_t index, Model::NNMemory* memory, size_t size, size_t offset) ORT_MUST_USE_RESULT; From 2baac120265f9804468d2c5031590fa0080666d3 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 3 Jan 2022 22:44:32 -0800 Subject: [PATCH 07/23] move isvalidzp to nodeunit, fix some minor bug --- .../nnapi/nnapi_builtin/builders/helper.cc | 100 ++++++++++++++++++ .../nnapi/nnapi_builtin/builders/helper.h | 3 + .../nnapi_builtin/builders/model_builder.cc | 17 +-- .../nnapi_builtin/builders/model_builder.h | 7 +- 4 files changed, 116 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index b26d8cef94f76..785eca5753334 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -214,6 +214,106 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const return true; } +bool HasValidQuantizationZeroPoint(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices) { + const auto& op_type = node_unit.OpType(); + auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); + bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); + bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); + + const auto& inputs = 
node_unit.Inputs(); + for (const auto idx : indices) { + if (idx >= inputs.size()) { + LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx + << " >= input number, " << inputs.size(); + return false; + } + + const auto& input = inputs[idx]; + if (!input.quant_param.has_value()) { + LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx + << " has no quant_param"; + return false; + } + + // zero point is optional here + if (!input.quant_param->zero_point) + return true; + + const auto& zero_point_name = input.quant_param->zero_point->Name(); + const auto& weight_tensor = *initializers.at(input.node_arg.Name()); + if (!Contains(initializers, zero_point_name)) { + LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; + return false; + } + + bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 2; + bool is_conv_matmul_u8s8_weight = false; + + if (is_conv_matmul_weight) { + is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } + + const auto& zero_tensor = *initializers.at(zero_point_name); + int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; + + if (!is_conv_matmul_u8s8_weight) { + if (zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. Per-channel, only from Android API level 29 + if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " + << "actual zero point type: [" << zero_tensor.data_type() << "]"; + return false; + } + + if (zero_dim != 1) { + if (is_qlinear_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } + } + + // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, + // or a tensor with same channel as weight, for NNAPI we only support it be + // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel + // quantization is 0 there is no input for it + if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_tensor.dims()[0] + << " zero point dimension " << zero_dim; + return false; + } + + std::vector unpacked_tensor; + auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor); + if (!status.IsOK()) { + LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name + << ", error msg: " << status.ErrorMessage(); + return false; + } + + // Verify all onnx weight zero point(s) are 0(s) + const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); + for (size_t i = 0; i < unpacked_tensor.size(); i++) { + if (zero_points[i] != 0) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " + << "zero_points[" << i << "] has value: " << zero_points[i]; + return false; + } + } + } + } + + return true; +} + bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, const std::vector& indices) { const auto& op_type = 
node.OpType(); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 62d0d8e87915a..215d21857fd55 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -120,6 +120,9 @@ common::Status GetQuantizationScale(const InitializedTensorSet& initializers, co common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; +bool HasValidQuantizationZeroPoint(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices); + common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 8664646a9af5c..f0c209ddf8d25 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -118,8 +118,8 @@ void ModelBuilder::PreprocessInitializers() { const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); for (size_t i = 0; i < node_indices.size(); i++) { const auto* node(graph_viewer_.GetNode(node_indices[i])); - if (const auto* op_builder = GetOpBuilder(*node)) { - const NodeUnit node_unit(*node); + const auto& node_unit = GetNodeUnit(node); + if (const auto* op_builder = GetOpBuilder(node_unit)) { op_builder->AddInitializersToSkip(*this, node_unit); } } @@ -513,12 +513,12 @@ Status ModelBuilder::AddOperations() { const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); for (size_t i = 0; i < node_indices.size(); i++) { const auto* node(graph_viewer_.GetNode(node_indices[i])); - if (const auto* op_builder = GetOpBuilder(*node)) { - const NodeUnit node_unit(*node); + const NodeUnit& node_unit = GetNodeUnit(node); + if (const auto* op_builder = GetOpBuilder(node_unit)) { ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(*this, node_unit)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Node [", node->Name(), "], type [", node->OpType(), "] is not supported"); + "Node [", node_unit.Name(), "], type [", node_unit.OpType(), "] is not supported"); } } @@ -664,12 +664,13 @@ int32_t ModelBuilder::FindActivation(const Node& node, const NodeArg& output) { return fuse_code; } -/* static */ const IOpBuilder* ModelBuilder::GetOpBuilder(const Node& node) { +/* static */ const IOpBuilder* ModelBuilder::GetOpBuilder(const NodeUnit& node_unit) { const auto& op_builders = GetOpBuilders(); - if (!Contains(op_builders, node.OpType())) + const auto& op_type = node_unit.GetNode().OpType(); + if (!Contains(op_builders, op_type)) return nullptr; - return op_builders.at(node.OpType()); + return op_builders.at(op_type); } std::string ModelBuilder::GetUniqueName(const std::string& base_name) { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 73c99486a959a..9b7dd0a289aad 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -109,6 +109,9 @@ class ModelBuilder { bool 
GetNCHWOperand(const std::string& nhwc_name, std::string& nchw_name); bool GetNHWCOperand(const std::string& nchw_name, std::string& nhwc_name); + // Get the NodeUnit which contains the given node + const NodeUnit& GetNodeUnit(const Node* node) const; + Status SetNHWCToNCHWOperandMap(const std::string& nhwc_name, const std::string& nchw_name) ORT_MUST_USE_RESULT; Status SetNCHWToNHWCOperandMap(const std::string& nchw_name, @@ -191,8 +194,6 @@ class ModelBuilder { void GetAllQuantizedOpInputs(); // Go through the underlying graph_viewer, and generate NodeUnits void PreprocessNodeUnits(); - // Get the NodeUnit which contains the given node - const NodeUnit& GetNodeUnit(const Node* node) const; Status SetOperandValue(uint32_t index, Model::NNMemory* memory, size_t size, size_t offset) ORT_MUST_USE_RESULT; @@ -203,7 +204,7 @@ class ModelBuilder { bool is_nhwc, uint32_t& index) ORT_MUST_USE_RESULT; - static const IOpBuilder* GetOpBuilder(const Node& node); + static const IOpBuilder* GetOpBuilder(const NodeUnit& node_unit); }; } // namespace nnapi From 5504358ae33d294c46c0a60c56083dab5ccee835 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 3 Jan 2022 23:34:08 -0800 Subject: [PATCH 08/23] Let HasValidQuantizationZeroPoints handle output def of node_unit --- .../nnapi/nnapi_builtin/builders/helper.cc | 25 ++++++++++--------- .../nnapi/nnapi_builtin/builders/helper.h | 4 +-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 785eca5753334..af038c08eeffc 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -214,40 +214,41 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const return true; } -bool HasValidQuantizationZeroPoint(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices) { +bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, bool is_input) { const auto& op_type = node_unit.OpType(); auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); - const auto& inputs = node_unit.Inputs(); + const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); for (const auto idx : indices) { - if (idx >= inputs.size()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " >= input number, " << inputs.size(); + if (idx >= io_defs.size()) { + LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, " + << (is_input ? 
"Input" : "Output") << " index, " << idx + << " >= input number, " << io_defs.size(); return false; } - const auto& input = inputs[idx]; - if (!input.quant_param.has_value()) { + const auto& io_def = io_defs[idx]; + if (!io_def.quant_param.has_value()) { LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx << " has no quant_param"; return false; } // zero point is optional here - if (!input.quant_param->zero_point) + if (!io_def.quant_param->zero_point) return true; - const auto& zero_point_name = input.quant_param->zero_point->Name(); - const auto& weight_tensor = *initializers.at(input.node_arg.Name()); + const auto& zero_point_name = io_def.quant_param->zero_point->Name(); + const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); if (!Contains(initializers, zero_point_name)) { LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; return false; } - bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 2; + bool is_conv_matmul_weight = is_input && (is_qlinear_conv || is_qlinear_matmul) && idx == 2; bool is_conv_matmul_u8s8_weight = false; if (is_conv_matmul_weight) { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 215d21857fd55..7d920cfa1a8eb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -120,8 +120,8 @@ common::Status GetQuantizationScale(const InitializedTensorSet& initializers, co common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; -bool HasValidQuantizationZeroPoint(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices); +bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, bool is_input); common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, From 87c1204cd4e8b7ce2ac8b49afd144c1342c9e72e Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Thu, 6 Jan 2022 15:48:34 -0800 Subject: [PATCH 09/23] move op_support_checked to node_unit --- .../nnapi/nnapi_builtin/builders/helper.cc | 153 +++------- .../nnapi/nnapi_builtin/builders/helper.h | 21 +- .../nnapi_builtin/builders/op_builder.cc | 42 --- .../nnapi/nnapi_builtin/builders/op_builder.h | 5 - .../builders/op_support_checker.cc | 271 +++++++++--------- 5 files changed, 179 insertions(+), 313 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index af038c08eeffc..71f5628211655 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -100,13 +100,13 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) { qlinear_op_type == QLinearOpType::QLinearAdd; } -bool HasValidUnaryOpQuantizedInputs(const Node& node) { +bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) { int32_t input_type; - if (!GetType(*node.InputDefs()[0], input_type)) + if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << 
node.OpType() + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] Input type: [" << input_type << "] is not supported for now"; return false; @@ -115,18 +115,18 @@ bool HasValidUnaryOpQuantizedInputs(const Node& node) { return true; } -bool HasValidBinaryOpQuantizedInputs(const Node& node) { - auto op_type = GetQLinearOpType(node); +bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { + auto op_type = GetQLinearOpType(node_unit.GetNode()); int32_t a_input_type, b_input_type; if (!IsQLinearBinaryOp(op_type)) { - LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op"; + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] is not a binary qlinear op"; return false; } - const auto input_defs(node.InputDefs()); - if (!GetType(*input_defs[0], a_input_type)) + const auto& inputs = node_unit.Inputs(); + if (!GetType(inputs[0].node_arg, a_input_type)) return false; - if (!GetType(*input_defs[3], b_input_type)) + if (!GetType(inputs[1].node_arg, b_input_type)) return false; // QlinearConv supports u8u8 or u8s8 @@ -139,7 +139,7 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) { if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || (!is_qlinear_conv && a_input_type != b_input_type) || (is_qlinear_conv && !has_valid_qlinear_conv_weight)) { - LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] A Input type: [" << a_input_type << "] B Input type: [" << b_input_type << "] is not supported for now"; @@ -149,32 +149,41 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) { return true; } -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices, const OpSupportCheckParams& params) { - const auto& op_type = node.OpType(); - auto qlinear_op_type = GetQLinearOpType(node); +bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, const OpSupportCheckParams& params, bool is_input) { + const auto& op_type = node_unit.OpType(); + auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); - const auto input_defs(node.InputDefs()); + const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); for (const auto idx : indices) { - if (idx >= input_defs.size()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationScales, Input index, " << idx - << " >= input number, " << input_defs.size(); + if (idx >= io_defs.size()) { + LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationScales, " + << (is_input ? 
"Input" : "Output") << " index, " << idx + << " >= size, " << io_defs.size(); return false; } - const auto scale_name = input_defs[idx]->Name(); + const auto& io_def = io_defs[idx]; + if (!io_def.quant_param.has_value()) { + LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx + << " has no quant_param"; + return false; + } + + const auto scale_name = io_def.quant_param->scale.Name(); + if (!Contains(initializers, scale_name)) { LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; return false; } // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) - bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 4; + bool is_conv_matmul_weight = is_input && (is_qlinear_conv || is_qlinear_matmul) && idx == 1; bool is_conv_matmul_u8s8_weight = false; if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; } @@ -201,7 +210,7 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const return false; } - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); if (weight_tensor.dims()[0] != scales_dim) { LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," << " weight dimension[0] " << weight_tensor.dims()[0] @@ -226,7 +235,7 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co if (idx >= io_defs.size()) { LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, " << (is_input ? 
"Input" : "Output") << " index, " << idx - << " >= input number, " << io_defs.size(); + << " >= size, " << io_defs.size(); return false; } @@ -242,16 +251,16 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co return true; const auto& zero_point_name = io_def.quant_param->zero_point->Name(); - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); if (!Contains(initializers, zero_point_name)) { LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; return false; } - bool is_conv_matmul_weight = is_input && (is_qlinear_conv || is_qlinear_matmul) && idx == 2; + bool is_conv_matmul_weight = is_input && (is_qlinear_conv || is_qlinear_matmul) && idx == 1; bool is_conv_matmul_u8s8_weight = false; if (is_conv_matmul_weight) { + const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; } @@ -285,6 +294,7 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co // or a tensor with same channel as weight, for NNAPI we only support it be // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel // quantization is 0 there is no input for it + const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," << " weight dimension[0] " << weight_tensor.dims()[0] @@ -315,94 +325,6 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co return true; } -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices) { - const auto& op_type = node.OpType(); - auto qlinear_op_type = GetQLinearOpType(node); - bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); - bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); - const auto input_defs(node.InputDefs()); - for (const auto idx : indices) { - if (idx >= input_defs.size()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " >= input number, " << input_defs.size(); - return false; - } - - const auto zero_point_name = input_defs[idx]->Name(); - if (!Contains(initializers, zero_point_name)) { - LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; - return false; - } - - bool is_conv_matmul_weight = is_qlinear_conv && idx == 5; - bool is_conv_matmul_u8s8_weight = false; - if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - const auto& zero_tensor = *initializers.at(zero_point_name); - int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; - - if (!is_conv_matmul_u8s8_weight) { - if (zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } else { - // For u8s8 Qlinear[Conv/MatMul], we support - // 1. Per-tensor, the weight will be transformed to uint8 later - // 2. 
Per-channel, only from Android API level 29 - if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " - << "actual zero point type: [" << zero_tensor.data_type() << "]"; - return false; - } - - if (zero_dim != 1) { - if (is_qlinear_matmul) { - LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; - return false; - } - } - - // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, - // or a tensor with same channel as weight, for NNAPI we only support it be - // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel - // quantization is 0 there is no input for it - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " zero point dimension " << zero_dim; - return false; - } - - std::vector unpacked_tensor; - auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node.ModelPath(), unpacked_tensor); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name - << ", error msg: " << status.ErrorMessage(); - return false; - } - - // Verify all onnx weight zero point(s) are 0(s) - const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); - for (size_t i = 0; i < unpacked_tensor.size(); i++) { - if (zero_points[i] != 0) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " - << "zero_points[" << i << "] has value: " << zero_points[i]; - return false; - } - } - } - } - - return true; -} - common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, size_t idx, float& scale) { std::vector unpacked_tensor; @@ -439,7 +361,8 @@ common::Status GetQuantizationScaleAndZeroPoint( zero_point = 0; if (!io_def.quant_param) { // Not a quantized IO - return Status::OK(); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "NodeArg: ", io_def.node_arg.Name(), " is not quantized"); } const auto unpack_tensor = [&model_path](const InitializedTensorSet& initializers, @@ -481,7 +404,7 @@ common::Status GetQuantizationScaleAndZeroPoint( const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); for (const auto& io_def : io_defs) { if (io_def.node_arg.Name() == name) - return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.GetNode().ModelPath(), + return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.ModelPath(), scale, zero_point); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 7d920cfa1a8eb..35f24dda439fb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -103,23 +103,22 @@ ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... 
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type); -// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool] -bool HasValidUnaryOpQuantizedInputs(const Node& node); -// Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add] -bool HasValidBinaryOpQuantizedInputs(const Node& node); -// Check if a qlinear op has valid scales for given indices -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices, const OpSupportCheckParams& params); -// Check if a qlinear op has valid zero points for given indices -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices); - common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, size_t idx, float& scale); common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; +// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool] +bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit); +// Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add] +bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit); + +// Check if a qlinear op has valid scales for given indices +bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, const OpSupportCheckParams& params, bool is_input); + +// Check if a qlinear op has valid zero points for given indices bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::vector& indices, bool is_input); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index c4e1f103a0212..bc9fa92f30e06 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -670,48 +670,6 @@ static void AddInputToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_ AddQuantizationScaleAndZeroPointToSkip(model_builder, io_def); } -Status GetQuantizedInputScaleAndZeroPoint(const InitializedTensorSet& initializers, - const NodeUnit& node_unit, - const std::string& input_name, - float& scale, - int32_t& zero_point) { - const auto& node = node_unit.GetNode(); - const auto& op_type = node.OpType(); - auto qlinear_op_type = GetQLinearOpType(node); - assert(qlinear_op_type != QLinearOpType::Unknown && - qlinear_op_type != QLinearOpType::QuantizeLinear); - - size_t scale_idx, zero_point_idx; - if (qlinear_op_type == QLinearOpType::DequantizeLinear || - qlinear_op_type == QLinearOpType::QLinearSigmoid || - qlinear_op_type == QLinearOpType::QLinearAveragePool) { - scale_idx = 1; - zero_point_idx = 2; - } else if (IsQLinearBinaryOp(qlinear_op_type)) { - const auto input_defs(node.InputDefs()); - if (input_name == input_defs[0]->Name()) { - scale_idx = 1; - zero_point_idx = 2; - } else if (input_name == input_defs[3]->Name()) { - scale_idx = 4; - zero_point_idx = 5; - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Unknown input: ", input_name, ", for op: ", op_type); - } - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported op: ", op_type); - } - - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, scale_idx, scale)); - zero_point = 
0; - if (node.InputDefs().size() > zero_point_idx) { - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, zero_point_idx, zero_point)); - } - - return Status::OK(); -} - template void CreateSharedOpBuilderImpl(const std::string& op_type, OpBuilderRegistrations& op_registrations, diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h index 46acbc4eff4b9..f25a6591040bc 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h @@ -43,10 +43,5 @@ const std::unordered_map& GetOpBuilders(); common::Status TransposeNHWCToNCHW(ModelBuilder& model_builder, const std::string& input, const std::string& output) ORT_MUST_USE_RESULT; -// Get the quantized input's scale and zero point for the given input -common::Status GetQuantizedInputScaleAndZeroPoint(const InitializedTensorSet& initializers, - const NodeUnit& node_unit, const std::string& input_name, - float& scale, int32_t& zero_point) ORT_MUST_USE_RESULT; - } // namespace nnapi } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 331b2d2e9e745..262440859843c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -22,19 +22,37 @@ struct OpSupportCheckerRegistrations { std::unordered_map op_support_checker_map; }; -bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node& node) { - for (const auto* node_arg : node.InputDefs()) { - const auto& input_name(node_arg->Name()); - if (!Contains(initializers, input_name)) - continue; - - const auto& tensor = *initializers.at(input_name); - if (tensor.has_data_location() && - tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - LOGS_DEFAULT(VERBOSE) << "Initializer [" << input_name - << "] with external data location are not currently supported"; +bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) { + const auto is_ext_initializer = + [&](const NodeArg& node_arg) { + const auto& input_name(node_arg.Name()); + if (!Contains(initializers, input_name)) + return false; + + const auto& tensor = *initializers.at(input_name); + if (tensor.has_data_location() && + tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + LOGS_DEFAULT(VERBOSE) << "Initializer [" << input_name + << "] with external data location are not currently supported"; + return true; + } + + return false; + }; + + const auto& inputs = node_unit.Inputs(); + for (const auto& input : inputs) { + if (is_ext_initializer(input.node_arg)) + return true; + + if (!input.quant_param) + return false; + + if (is_ext_initializer(input.quant_param->scale)) + return true; + + if (input.quant_param->zero_point && is_ext_initializer(*input.quant_param->zero_point)) return true; - } } return false; @@ -115,10 +133,8 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer if (!HasSupportedInputs(node_unit)) return false; - const auto& node = node_unit.GetNode(); - // We do not support external initializers for now - if (HasExternalInitializer(initializers, node)) + if (HasExternalInitializer(initializers, node_unit)) return false; if 
(!HasSupportedOpSet(node_unit)) @@ -244,30 +260,25 @@ int BinaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) cons } bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - // TODO, change to use node unit and quant_param of IODef - const auto& node = node_unit.GetNode(); - bool is_qlinear_add = node.OpType() == "QLinearAdd"; - bool is_pow = node.OpType() == "Pow"; + bool is_qlinear_add = node_unit.OpType() == "QLinearAdd"; + bool is_pow = node_unit.OpType() == "Pow"; if (!is_qlinear_add && !is_pow) return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); if (is_qlinear_add) { // QLinearAdd - if (!HasValidBinaryOpQuantizedInputs(node)) + if (!HasValidBinaryOpQuantizedInputs(node_unit)) return false; } // Pow we only support both input as fp32 now if (is_pow) { - const auto& input1 = *node.InputDefs()[0]; - const auto& input2 = *node.InputDefs()[1]; - int32_t input_type_1; - if (!GetType(input1, input_type_1)) + if (!GetType(node_unit.Inputs()[0].node_arg, input_type_1)) return false; int32_t input_type_2; - if (!GetType(input2, input_type_2)) + if (!GetType(node_unit.Inputs()[1].node_arg, input_type_2)) return false; if (input_type_1 != ONNX_NAMESPACE::TensorProto_DataType_FLOAT || input_type_1 != input_type_2) { @@ -283,24 +294,18 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - - const auto& op_type(node.OpType()); - const auto input_defs(node.InputDefs()); + const auto& op_type(node_unit.OpType()); + const auto& inputs = node_unit.Inputs(); bool op_is_qlinear = op_type == "QLinearAdd"; - size_t a_idx = 0, b_idx = 1; - if (op_is_qlinear) { - b_idx = 3; - } Shape input1_shape, input2_shape; - if (!GetShape(*input_defs[a_idx], input1_shape) || - !GetShape(*input_defs[b_idx], input2_shape)) + if (!GetShape(inputs[0].node_arg, input1_shape) || + !GetShape(inputs[1].node_arg, input2_shape)) return false; const auto input1_size = input1_shape.size(); const auto input2_size = input2_shape.size(); if (input1_size > 4 || input2_size > 4) { - LOGS_DEFAULT(VERBOSE) << node.OpType() << " only support up to 4d shape, input1 is " + LOGS_DEFAULT(VERBOSE) << op_type << " only support up to 4d shape, input1 is " << input1_size << "d shape, input 2 is " << input2_size << "d shape"; return false; @@ -309,7 +314,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi if (op_is_qlinear) { // For QLinearAdd, we only support uint8 output now int32_t output_type; - if (!GetType(*node.OutputDefs()[0], output_type)) + if (!GetType(inputs[0].node_arg, output_type)) return false; if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { @@ -319,13 +324,16 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi return false; } - // All scale/zero points are initializer scalars - // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) + // Check input scales and ZPs + if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) return false; - // a/b/y_zero_point - if (!HasValidQuantizationZeroPoints(initializers, node, {2, 5, 7})) + // Check output scale and ZP + if 
(!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; } @@ -351,9 +359,8 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker { bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return false; const auto input_size = input_shape.size(); @@ -397,15 +404,15 @@ class ReshapeOpSupportChecker : public BaseOpSupportChecker { bool ReshapeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); - const auto& perm_name = node.InputDefs()[1]->Name(); + const auto& inputs = node_unit.Inputs(); + const auto& perm_name = inputs[1].node_arg.Name(); if (!Contains(initializers, perm_name)) { LOGS_DEFAULT(VERBOSE) << "New shape of reshape must be known"; return false; } Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(inputs[0].node_arg, input_shape)) return false; if (input_shape.size() > 4 || input_shape.empty()) { @@ -424,7 +431,7 @@ bool ReshapeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init const int64_t* raw_perm = reinterpret_cast(unpacked_tensor.data()); const auto perm_size = SafeInt(perm_tensor.dims()[0]); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit.GetNode()); const bool allow_zero = helper.Get("allowzero ", 0) == 1; for (uint32_t i = 0; i < perm_size; i++) { // NNAPI reshape does not support 0 as dimension @@ -610,15 +617,18 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial // the output zero point can be optional bool has_output_zp = input_defs.size() == 5; - if (!HasValidQuantizationScales(initializers, node, {1, 3}, params)) + // Check input scales and ZPs + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) + return false; + + // Check output scale and ZP - if (!HasValidQuantizationZeroPoints(initializers, node, - has_output_zp - ? 
std::vector{2} - : std::vector{2, 4})) { + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; - } // NNAPI requires Quantized Average Pool has same scale and zero point for both input and output float input_scale = 0.0f; @@ -678,26 +688,24 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } bool PoolOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - // TODO, change to use node unit and quant_param of IODef - const auto& node = node_unit.GetNode(); - bool is_max_pool = node.OpType() == "MaxPool"; - bool is_qlinear_average_pool = node.OpType() == "QLinearAveragePool"; + bool is_max_pool = node_unit.OpType() == "MaxPool"; + bool is_qlinear_average_pool = node_unit.OpType() == "QLinearAveragePool"; if (!is_max_pool && !is_qlinear_average_pool) return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); if (is_qlinear_average_pool) { - return HasValidUnaryOpQuantizedInputs(node); + return HasValidUnaryOpQuantizedInputs(node_unit); } // is_max_pool // For max pool, we can support both float and uint8 input int32_t input_type; - if (!GetType(*node.InputDefs()[0], input_type)) + if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] Input type: [" << input_type << "] is not supported for now"; return false; @@ -738,13 +746,11 @@ class ConvOpSupportChecker : public BaseOpSupportChecker { } bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - // TODO, change to use node unit and quant_param of IODef - const auto& node = node_unit.GetNode(); - if (node.OpType() != "QLinearConv") + if (node_unit.OpType() != "QLinearConv") return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); // QLinearConv only supports input of uint8 for now - if (!HasValidBinaryOpQuantizedInputs(node)) + if (!HasValidBinaryOpQuantizedInputs(node_unit)) return false; return true; @@ -810,12 +816,16 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) + // Check input scales and ZPs + if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) return false; - // a/b/y_zero_point - if (!HasValidQuantizationZeroPoints(initializers, node, {2, 5, 7})) + // Check output scale and ZP + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; } @@ -914,13 +924,11 @@ class GemmOpSupportChecker : public BaseOpSupportChecker { }; bool GemmOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - // TODO, change to use node unit and quant_param of IODef - const auto& node = node_unit.GetNode(); - if (node.OpType() != "QLinearMatMul") + if (node_unit.OpType() != "QLinearMatMul") return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); // QLinearMatMul - if 
(!HasValidBinaryOpQuantizedInputs(node)) + if (!HasValidBinaryOpQuantizedInputs(node_unit)) return false; return true; @@ -1078,12 +1086,16 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } // All scale/zero points are initializer scalars - // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) + // Check input scales and ZPs + if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) return false; - // a/b/y_zero_point - if (!HasValidQuantizationZeroPoints(initializers, node, {2, 5, 7})) + // Check output scale and ZP + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; } } else { @@ -1113,7 +1125,7 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker { int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; - static bool IsQuantizedOpSupported(const InitializedTensorSet& initializers, const Node& node, + static bool IsQuantizedOpSupported(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params); }; @@ -1137,9 +1149,8 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker { bool UnaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - if (node.OpType() == "QLinearSigmoid") - return IsQuantizedOpSupported(initializers, node, params); + if (node_unit.OpType() == "QLinearSigmoid") + return IsQuantizedOpSupported(initializers, node_unit, params); else // Everything except "QLinearSigmoid" are by default supported return true; } @@ -1160,13 +1171,11 @@ int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit& } bool UnaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - // TODO, change to use node unit and quant_param of IODef - const auto& node = node_unit.GetNode(); // We only need to override input check for QLinearSigmoid - if (node.OpType() != "QLinearSigmoid") + if (node_unit.OpType() != "QLinearSigmoid") return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); - return HasValidUnaryOpQuantizedInputs(node); + return HasValidUnaryOpQuantizedInputs(node_unit); } // All ops except "Sin" opset 5- uses consumed_inputs attribute which is not supported for now @@ -1180,35 +1189,35 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const } /* static */ bool UnaryOpSupportChecker::IsQuantizedOpSupported( - const InitializedTensorSet& initializers, const Node& node, const OpSupportCheckParams& params) { - const auto& op_type = node.OpType(); + const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) { + const auto& op_type = node_unit.OpType(); ORT_ENFORCE(op_type == "QLinearSigmoid"); - const auto& op_name = node.Name(); - const auto input_defs(node.InputDefs()); - // const auto output_defs(node.OutputDefs()); + const auto& op_name = node_unit.Name(); - if (input_defs.size() < 4) + // Check input scales and ZPs + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) return false; - - bool has_output_zp = input_defs.size() == 5; - - if 
(!HasValidQuantizationScales(initializers, node, {1, 3}, params)) + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) return false; - if (!HasValidQuantizationZeroPoints(initializers, node, - has_output_zp - ? std::vector{2} - : std::vector{2, 4})) + // Check output scale and ZP + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; + return false; + // NNAPI requires the scale be 1.f/256 and zero point to be 0 // See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180 float output_scale = 0.0f; - auto status = GetQuantizationScale(initializers, node, 3, output_scale); + int32_t output_zp = 0; + auto status = GetQuantizationScaleAndZeroPoint(initializers, node_unit.Outputs()[0], node_unit.ModelPath(), + output_scale, output_zp); if (!status.IsOK()) { LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationScale failed, message: " << status.ErrorMessage(); + << "] GetQuantizationScaleAndZeroPoint failed, message: " << status.ErrorMessage(); return false; } @@ -1218,20 +1227,10 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const return false; } - int32_t output_zp; - if (has_output_zp) { - status = GetQuantizationZeroPoint(initializers, node, 4, output_zp); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationZeroPoint failed, message: " << status.ErrorMessage(); - return false; - } - - if (output_zp != 0) { - LOGS_DEFAULT(VERBOSE) << "Op [" << op_type << "] name [" << op_name - << "] output zero point can only be 0, actual zero point: " << output_scale; - return false; - } + if (output_zp != 0) { + LOGS_DEFAULT(VERBOSE) << "Op [" << op_type << "] name [" << op_name + << "] output zero point can only be 0, actual zero point: " << output_scale; + return false; } return true; @@ -1299,12 +1298,12 @@ class SqueezeOpSupportChecker : public BaseOpSupportChecker { bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); + const auto& inputs = node_unit.Inputs(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(inputs[0].node_arg, input_shape)) return false; - const auto input_size = input_shape.size(); + const auto input_size = inputs.size(); if (input_size > 4 || input_size == 0) { LOGS_DEFAULT(VERBOSE) << "Squeeze only supports 1-4d shape, input is " << input_size << "d shape"; @@ -1312,8 +1311,8 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init } // Squeeze opset 13 use input 1 as axes, if we have input 1 then it need to be an initializer - if (node.SinceVersion() > 12 && node.InputDefs().size() > 1) { - const auto& axes_name = node.InputDefs()[1]->Name(); + if (node_unit.SinceVersion() > 12 && input_size > 1) { + const auto& axes_name = inputs[1].node_arg.Name(); if (!Contains(initializers, axes_name)) { LOGS_DEFAULT(VERBOSE) << "Input axes of Squeeze must be known"; return false; @@ -1340,28 +1339,23 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker { bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const 
NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - const auto output_defs(node.OutputDefs()); - int32_t output_type; - if (!GetType(*output_defs[0], output_type)) + if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) return false; if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] output type: [" << output_type << "] is not supported for now"; return false; } - if (!HasValidQuantizationScales(initializers, node, {1}, params)) + // For QuantizeLinear only output is quantized + // Check output scale and ZP + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) return false; - - if (input_defs.size() == 3) { // has zero_point input - if (!HasValidQuantizationZeroPoints(initializers, node, {2})) - return false; - } return true; } @@ -1384,15 +1378,12 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker { bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - if (!HasValidQuantizationScales(initializers, node, {1}, params)) + // For DequantizeLinear only input is quantized + // Check input scale and ZP + if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) + return false; + if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) return false; - - if (input_defs.size() == 3) { // has zero_point input - if (!HasValidQuantizationZeroPoints(initializers, node, {2})) - return false; - } return true; } From 7ee1ecb7474b6fa9dad9444d7d94f0a2a0dc9976 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Thu, 6 Jan 2022 23:46:18 -0800 Subject: [PATCH 10/23] minor update --- .../providers/nnapi/nnapi_builtin/builders/helper.h | 12 ++++++------ .../nnapi/nnapi_builtin/builders/op_builder.cc | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 35f24dda439fb..8a87f119c72cf 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -103,12 +103,6 @@ ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... 
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type); -common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, - size_t idx, float& scale); - -common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, - const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; - // Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool] bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit); // Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add] @@ -122,6 +116,12 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::vector& indices, bool is_input); +common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, + size_t idx, float& scale); + +common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, + const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; + common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index bc9fa92f30e06..a1ae271f7d066 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -664,6 +664,8 @@ static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, } } +// Ignore the input (with quantization scale and ZP if available) +// The input (usually weight) is already embedded in the NNAPI model static void AddInputToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { model_builder.AddInitializerToSkip(io_def.node_arg.Name()); // main input if (io_def.quant_param) From 221776b9a0082832b37426c55b6dcee3e61a843c Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Fri, 7 Jan 2022 12:42:28 -0800 Subject: [PATCH 11/23] add activation handleing for node_unit --- .../nnapi_builtin/builders/model_builder.cc | 99 ++++++++++++++++--- .../nnapi_builtin/builders/model_builder.h | 9 +- .../providers/shared/node_unit/node_unit.cc | 1 + .../providers/shared/node_unit/node_unit.h | 7 +- 4 files changed, 96 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index f0c209ddf8d25..a96c67960b7d3 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -56,6 +56,7 @@ Status ModelBuilder::Prepare() { GetAllQuantizedOpInputs(); PreprocessInitializers(); PreprocessActivations(); + PreprocessActivations_nu(); ORT_RETURN_IF_ERROR(RegisterInitializers()); ORT_RETURN_IF_ERROR(RegisterModelInputs()); ORT_RETURN_IF_ERROR(AddOperations()); @@ -115,12 +116,9 @@ Status ModelBuilder::GetTargetDevices() { } void ModelBuilder::PreprocessInitializers() { - const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); - for (size_t i = 0; i < node_indices.size(); i++) { - const auto* node(graph_viewer_.GetNode(node_indices[i])); - const auto& node_unit = GetNodeUnit(node); - if (const auto* op_builder = 
GetOpBuilder(node_unit)) { - op_builder->AddInitializersToSkip(*this, node_unit); + for (const auto& node_unit : node_unit_holder_) { + if (const auto* op_builder = GetOpBuilder(*node_unit)) { + op_builder->AddInitializersToSkip(*this, *node_unit); } } } @@ -147,6 +145,26 @@ void ModelBuilder::PreprocessActivations() { } } +void ModelBuilder::PreprocessActivations_nu() { + for (const auto& node_unit : node_unit_holder_) { + const auto& node = node_unit->GetNode(); + const auto& op_type(node.OpType()); + if (op_type == "Relu") { + activation_node_units_.emplace(node_unit.get(), ANEURALNETWORKS_FUSED_RELU); + } else if (op_type == "Clip") { // Relu1 or Relu6 + float min, max; + if (!GetClipMinMax(GetInitializerTensors(), node, min, max, logging::LoggingManager::DefaultLogger())) + continue; + + if (min == -1.0f && max == 1.0f) { + activation_node_units_.emplace(node_unit.get(), ANEURALNETWORKS_FUSED_RELU1); + } else if (min == 0.0f && max == 6.0f) { + activation_node_units_.emplace(node_unit.get(), ANEURALNETWORKS_FUSED_RELU6); + } + } + } +} + const NodeUnit& ModelBuilder::GetNodeUnit(const Node* node) const { // Do we want to throw here if the node is not in the map? return *node_unit_map_.at(node); @@ -167,16 +185,11 @@ void ModelBuilder::PreprocessNodeUnits() { // Help to get all quantized operators' input and the NodeUnit(s) using the input void ModelBuilder::GetAllQuantizedOpInputs() { - const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); - for (const auto& node_idx : node_indices) { - const auto* node(graph_viewer_.GetNode(node_idx)); - // TODO check if the node_unit has already been processed - const auto& node_unit = GetNodeUnit(node); - + for (const auto& node_unit : node_unit_holder_) { // TODO, hookup getting quantized inputs with QDQ NodeUnits and remove the ORT_ENFORCE - ORT_ENFORCE(node_unit.UnitType() == NodeUnit::Type::SingleNode, "QDQ NodeUnit is not yet implemented"); + ORT_ENFORCE(node_unit->UnitType() == NodeUnit::Type::SingleNode, "QDQ NodeUnit is not yet implemented"); - auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); + auto qlinear_op_type = GetQLinearOpType(node_unit->GetNode()); // Not a qlinear op if (qlinear_op_type == QLinearOpType::Unknown) @@ -193,11 +206,11 @@ void ModelBuilder::GetAllQuantizedOpInputs() { // All qlinear ops EXCEPT QuantizeLinear has quantized input if (qlinear_op_type != QLinearOpType::QuantizeLinear) { - add_quantized_input(node_unit, 0); + add_quantized_input(*node_unit, 0); } if (IsQLinearBinaryOp(qlinear_op_type)) { - add_quantized_input(node_unit, 1); + add_quantized_input(*node_unit, 1); } // TODO, add handling for varidiac nodes such as QLinearConcat @@ -511,15 +524,23 @@ Status ModelBuilder::AddOperandFromPersistMemoryBuffer( Status ModelBuilder::AddOperations() { const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); + std::unordered_set processed_node_units; for (size_t i = 0; i < node_indices.size(); i++) { const auto* node(graph_viewer_.GetNode(node_indices[i])); const NodeUnit& node_unit = GetNodeUnit(node); + + // Since a NodeUnit may contain multiple nodes, avoid processing the same NodeUnit multiple times + if (Contains(processed_node_units, &node_unit)) + continue; + if (const auto* op_builder = GetOpBuilder(node_unit)) { ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(*this, node_unit)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Node [", node_unit.Name(), "], type [", node_unit.OpType(), "] is not supported"); } + + 
processed_node_units.insert(&node_unit); } return Status::OK(); @@ -625,6 +646,52 @@ Status ModelBuilder::Compile(std::unique_ptr& model) { return Status::OK(); } +int32_t ModelBuilder::FindActivation_nu(const NodeUnit& node_unit, const NodeArg& output) { + (void)node_unit; + int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; + if (node_unit.GetOutputNodes().size() != 1) + return fuse_code; + + const auto& output_node = *node_unit.GetOutputNodes()[0]; + + // TODO, add support of activation fusion for quantized node group (qdq or qlinear) + // We do not support activation fusion for quantized operators for now + auto qlinear_op_type = GetQLinearOpType(node_unit.GetNode()); + if (qlinear_op_type != QLinearOpType::Unknown) + return fuse_code; + + for (auto it = output_node.OutputEdgesBegin(), end = output_node.OutputEdgesEnd(); it != end; ++it) { + const auto& dst_node = it->GetNode(); + const auto* dst_input = dst_node.InputDefs()[it->GetDstArgIndex()]; + const auto& dst_node_unit = GetNodeUnit(&dst_node); + if (Contains(activation_node_units_, &dst_node_unit)) { + if (&output == dst_input) { + fuse_code = activation_node_units_.at(&dst_node_unit); + } + } else { + // if there is any other non-relu node using the output + // will add relu separately + if (&output == dst_input) + return ANEURALNETWORKS_FUSED_NONE; + } + } + + // if output is a graph output, will add activation separately + if (fuse_code != ANEURALNETWORKS_FUSED_NONE) { + const auto& graph_outputs = graph_viewer_.GetOutputs(); + if (std::find(graph_outputs.cbegin(), graph_outputs.cend(), &output) != graph_outputs.cend()) { + return ANEURALNETWORKS_FUSED_NONE; + } + + LOGS_DEFAULT(VERBOSE) << "Node [" << node_unit.Name() << "] type [" << node_unit.OpType() + << "], fused the output [" << output.Name() << "]"; + + fused_activations_.insert(output.Name()); + } + + return fuse_code; +} + int32_t ModelBuilder::FindActivation(const Node& node, const NodeArg& output) { int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 9b7dd0a289aad..07b8035d96f5a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -49,6 +49,7 @@ class ModelBuilder { // Find if an output has a fuseable activation (Relu) int32_t FindActivation(const Node& node, const NodeArg& output); + int32_t FindActivation_nu(const NodeUnit& node_unit, const NodeArg& output); // Add an NNAPI scalar operand Status AddOperandFromScalar(bool value, uint32_t& index) ORT_MUST_USE_RESULT; @@ -142,6 +143,9 @@ class ModelBuilder { // All activation nodes (Relu, Relu1, Relu6) as a map std::unordered_map activation_nodes_; + // All activation nodes (Relu, Relu1, Relu6) as a map + std::unordered_map activation_node_units_; + std::unordered_map> op_support_checkers_; // Operands in nhwc @@ -182,6 +186,7 @@ class ModelBuilder { void PreprocessInitializers(); // Preprocess all the activation nodes (Relu/Relu1/Relu6) for easy query later void PreprocessActivations(); + void PreprocessActivations_nu(); // Copy and process all the initializers to NNAPI model Status RegisterInitializers() ORT_MUST_USE_RESULT; Status RegisterModelInputs() ORT_MUST_USE_RESULT; @@ -192,7 +197,9 @@ class ModelBuilder { // Get all quantized inputs in the underlying graph_viewer void GetAllQuantizedOpInputs(); - // Go through the underlying graph_viewer, 
and generate NodeUnits + + // Go through the underlying graph_viewer, and generate NodeUnits, Many initializing functions are + // using the result of PreprocessNodeUnits, this need to run early in the Prepare() void PreprocessNodeUnits(); Status SetOperandValue(uint32_t index, Model::NNMemory* memory, diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.cc b/onnxruntime/core/providers/shared/node_unit/node_unit.cc index 3d761dfbe971f..3c5a829a4d497 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.cc +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.cc @@ -83,6 +83,7 @@ bool IsVariadicQLinearOp(QLinearOpType type) { NodeUnit::NodeUnit(const Node& node) : nodes_{&node}, + output_nodes_{&node}, node_(node), type_(Type::SingleNode) { InitForNode(); diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/providers/shared/node_unit/node_unit.h index 1d02854c327b4..73fcca032a013 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.h +++ b/onnxruntime/core/providers/shared/node_unit/node_unit.h @@ -64,15 +64,16 @@ class NodeUnit { ProviderType GetExecutionProviderType() const noexcept; const Node& GetNode() const noexcept { return node_; } - + const std::vector GetOutputNodes() const noexcept { return output_nodes_; } const std::vector GetAllNodes() const noexcept { return nodes_; } private: std::vector inputs_; std::vector outputs_; - const std::vector nodes_; // all nodes in this NodeUnit - const Node& node_; // target Node + const std::vector nodes_; // all nodes in this NodeUnit + const std::vector output_nodes_; // all the nodes producing outputs for this NodeUnit + const Node& node_; // target Node Type type_; void InitForNode(); // Initializing for single Node From 35292cb58e16e841b3126eb4b35b69bab483ea4c Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Fri, 7 Jan 2022 16:03:43 -0800 Subject: [PATCH 12/23] add bin and relu support --- .../nnapi_builtin/builders/op_builder.cc | 104 ++++++++++++++---- 1 file changed, 84 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index a1ae271f7d066..88a2e2824d70c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -126,6 +126,28 @@ Status TransposeNCHWToNHWC(ModelBuilder& model_builder, return TransposeBetweenNCHWAndNHWC(model_builder, input, output, true /* nchw_to_nhwc */); } +// Convert the input from nchw to nhwc +// Caller should ensure input is currently in nchw format using ModelBuilder::IsOperandNHWC +Status GetNHWCInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nhwc_input) { + const auto& nchw_input = node_unit.Inputs()[input_index].node_arg.Name(); + if (!model_builder.GetNHWCOperand(nchw_input, nhwc_input)) { + nhwc_input = model_builder.GetUniqueName(nchw_input + "_nchw_to_nhwc"); + ORT_RETURN_IF_ERROR(TransposeNCHWToNHWC(model_builder, nchw_input, nhwc_input)); + } + return Status::OK(); +} + +// Convert the input from nhwc to nchw +// Caller should ensure input is currently in nhwc format using ModelBuilder::IsOperandNHWC +Status GetNCHWInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nchw_input) { + const auto& nhwc_input = node_unit.Inputs()[input_index].node_arg.Name(); + if 
(!model_builder.GetNCHWOperand(nhwc_input, nchw_input)) { + nchw_input = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw"); + ORT_RETURN_IF_ERROR(TransposeNHWCToNCHW(model_builder, nhwc_input, nchw_input)); + } + return Status::OK(); +} + // Convert the input from nchw to nhwc // Caller should ensure input is currently in nchw format using ModelBuilder::IsOperandNHWC Status GetNHWCInput(ModelBuilder& model_builder, const Node& node, size_t input_index, std::string& input) { @@ -148,6 +170,33 @@ Status GetNCHWInput(ModelBuilder& model_builder, const Node& node, size_t input_ return Status::OK(); } +// Transpose layouts if necessary for element wise operators with 2 inputs +// and return the layout type of output tensor +// If both inputs have same layout, the output will have the same layout +// Otherwise we will need transpose the nhwc input back to nchw, and output will be nchw +Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, + std::string& input1, std::string& input2, + bool& output_is_nhwc) ORT_MUST_USE_RESULT; +Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, + std::string& input1, std::string& input2, + bool& output_is_nhwc) { + bool input1_is_nhwc = model_builder.IsOperandNHWC(input1); + bool input2_is_nhwc = model_builder.IsOperandNHWC(input2); + output_is_nhwc = false; + + if (input1_is_nhwc == input2_is_nhwc) { + output_is_nhwc = input1_is_nhwc; + } else if (input1_is_nhwc) { + // need transpose input1 back to nchw + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input1)); + } else { // input2_is_nhwc + // need transpose input2 back to nchw + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 1, input2)); + } + + return Status::OK(); +} + // Transpose layouts if necessary for element wise operators with 2 inputs // and return the layout type of output tensor // If both inputs have same layout, the output will have the same layout @@ -510,6 +559,28 @@ static Status HandleAutoPad(const Shape& input_shape, return Status::OK(); } +// Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) +// QLinearConv, QLinearMatmul, QLinearAdd +// a, b are inputs, and y is output +static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( + const ModelBuilder& model_builder, const NodeUnit& node_unit, + float& a_scale, float& b_scale, float& y_scale, + int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) ORT_MUST_USE_RESULT; +static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( + const ModelBuilder& model_builder, const NodeUnit& node_unit, + float& a_scale, float& b_scale, float& y_scale, + int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) { + const auto& initializers = model_builder.GetInitializerTensors(); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Inputs()[0], node_unit.ModelPath(), a_scale, a_zero_point)); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Inputs()[1], node_unit.ModelPath(), b_scale, b_zero_point)); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Outputs()[0], node_unit.ModelPath(), y_scale, y_zero_point)); + + return Status::OK(); +} + // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) // QLinearConv, QLinearMatmul, QLinearAdd // a, b are inputs, and y is output @@ -758,9 +829,8 @@ void 
BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N } Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto& op_type(node.OpType()); - const auto input_defs(node.InputDefs()); + const auto& op_type(node_unit.OpType()); + const auto& inputs = node_unit.Inputs(); int32_t op_code; bool add_activation = true; @@ -780,18 +850,13 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "UnaryOpBuilder, unknown op: ", op_type); } - size_t a_idx = 0, b_idx = 1; - if (op_is_qlinear) { - b_idx = 3; - } - - std::string input1 = input_defs[a_idx]->Name(); - std::string input2 = input_defs[b_idx]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + std::string input1 = inputs[0].node_arg.Name(); + std::string input2 = inputs[1].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = false; ORT_RETURN_IF_ERROR( - TransposeBinaryOpInputLayout(model_builder, node, a_idx, b_idx, input1, input2, output_is_nhwc)); + TransposeBinaryOpInputLayout_nu(model_builder, node_unit, input1, input2, output_is_nhwc)); float a_scale = 0.0f, b_scale = 0.0f, @@ -801,9 +866,9 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const y_zero_point = 0; if (op_is_qlinear) { - ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node, - a_scale, b_scale, y_scale, - a_zero_point, b_zero_point, y_zero_point)); + ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, + a_scale, b_scale, y_scale, + a_zero_point, b_zero_point, y_zero_point)); } // Verify if the scale and zero point matchs from onnx input and nnapi input match @@ -814,7 +879,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; if (add_activation) { - fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]); + fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); } return AddBinaryOperator(op_code, model_builder, @@ -833,20 +898,19 @@ class ReluOpBuilder : public BaseOpBuilder { }; Status ReluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); ORT_RETURN_IF_ERROR(shaper.Identity(input, output)); const OperandType output_operand_type(operand_types.at(input).type, shaper[output]); // skip this relu if it is some op's fuse output if (Contains(model_builder.GetFusedActivations(), input)) { - LOGS_DEFAULT(VERBOSE) << "Relu Node [" << node.Name() << "] fused"; + LOGS_DEFAULT(VERBOSE) << "Relu Node [" << node_unit.Name() << "] fused"; model_builder.RegisterOperand(output, operand_indices.at(input), output_operand_type, output_is_nhwc); } else { std::vector input_indices; From 21d055558db5b460d664f5ce47a41998140699c6 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Sat, 8 
Jan 2022 22:28:50 -0800 Subject: [PATCH 13/23] move more ops to node_unit --- .../nnapi_builtin/builders/op_builder.cc | 45 +++++++++---------- .../core/providers/shared/utils/utils.cc | 4 ++ .../core/providers/shared/utils/utils.h | 8 +++- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 88a2e2824d70c..141319bb2be3f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -563,14 +563,13 @@ static Status HandleAutoPad(const Shape& input_shape, // QLinearConv, QLinearMatmul, QLinearAdd // a, b are inputs, and y is output static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( - const ModelBuilder& model_builder, const NodeUnit& node_unit, + const InitializedTensorSet& initializers, const NodeUnit& node_unit, float& a_scale, float& b_scale, float& y_scale, int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) ORT_MUST_USE_RESULT; static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( - const ModelBuilder& model_builder, const NodeUnit& node_unit, + const InitializedTensorSet& initializers, const NodeUnit& node_unit, float& a_scale, float& b_scale, float& y_scale, int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) { - const auto& initializers = model_builder.GetInitializerTensors(); ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( initializers, node_unit.Inputs()[0], node_unit.ModelPath(), a_scale, a_zero_point)); ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( @@ -866,9 +865,10 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const y_zero_point = 0; if (op_is_qlinear) { - ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, - a_scale, b_scale, y_scale, - a_zero_point, b_zero_point, y_zero_point)); + ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint_nu( + model_builder.GetInitializerTensors(), node_unit, + a_scale, b_scale, y_scale, + a_zero_point, b_zero_point, y_zero_point)); } // Verify if the scale and zero point matchs from onnx input and nnapi input match @@ -932,12 +932,11 @@ class TransposeOpBuilder : public BaseOpBuilder { }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); - auto input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); - NodeAttrHelper helper(node); + auto input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); + NodeAttrHelper helper(node_unit); std::vector perm = helper.Get("perm", std::vector()); auto input_dims = shaper[input].size(); if (perm.empty()) { @@ -956,7 +955,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co perm[i] = axis_nchw_to_nhwc[perm[i]]; } - std::string perm_name = model_builder.GetUniqueName(node.Name() + input + "perm"); + std::string perm_name = model_builder.GetUniqueName(node_unit.Name() + input + "perm"); // It is possible this onnx transpose operator can be nchw->nhwc, but so far I don't see // any scenario will do this since onnx is nchw only, assume the output is always not nhwc @@ -1267,15 +1266,13 @@ void PoolOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod 
} Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); - auto input = node.InputDefs()[0]->Name(); + auto input = node_unit.Inputs()[0].node_arg.Name(); bool use_nchw = model_builder.UseNCHW(); bool input_is_nhwc = model_builder.IsOperandNHWC(input); bool output_is_nhwc = false; @@ -1284,12 +1281,12 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); } } - const auto& output = node.OutputDefs()[0]->Name(); - const auto& op_type = node.OpType(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); + const auto& op_type = node_unit.OpType(); int32_t op_code; bool is_qlinear_average_pool = op_type == "QLinearAveragePool"; @@ -1329,7 +1326,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } } - int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]); + int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); // Get output scale and zero point if this is QLinearAveragePool // Otherwise we will use the scale and zero point of the input @@ -1339,16 +1336,14 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (is_qlinear_average_pool) { const auto& initializers = model_builder.GetInitializerTensors(); float x_scale = 0.0f; - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 1 /* idx */, x_scale)); int32_t x_zero_point = 0; - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 2 /* idx */, x_zero_point)); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Inputs()[0], node_unit.ModelPath(), x_scale, x_zero_point)); // Verify if the scale and zero point values from onnx input and nnapi input match ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point)); - - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 3 /* idx */, y_scale)); - if (node.InputDefs().size() > 4) - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 4 /* idx */, y_zero_point)); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Outputs()[0], node_unit.ModelPath(), y_scale, y_zero_point)); } std::vector input_indices; diff --git a/onnxruntime/core/providers/shared/utils/utils.cc b/onnxruntime/core/providers/shared/utils/utils.cc index 6f38a8e368ea4..d0e33062d7f00 100644 --- a/onnxruntime/core/providers/shared/utils/utils.cc +++ b/onnxruntime/core/providers/shared/utils/utils.cc @@ -8,6 +8,7 @@ #include #include #include +#include "core/providers/shared/node_unit/node_unit.h" namespace onnxruntime { @@ -81,6 +82,9 @@ bool GetClipMinMax(const InitializedTensorSet& initializers, const Node& node, NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node) : node_attributes_(node.GetAttributes()) {} +NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit) + : node_attributes_(node_unit.GetNode().GetAttributes()) {} + float NodeAttrHelper::Get(const std::string& key, float def_val) const { if 
(!HasAttr(key)) return def_val; diff --git a/onnxruntime/core/providers/shared/utils/utils.h b/onnxruntime/core/providers/shared/utils/utils.h index 925df731fcee3..b6884f53d1c2e 100644 --- a/onnxruntime/core/providers/shared/utils/utils.h +++ b/onnxruntime/core/providers/shared/utils/utils.h @@ -17,6 +17,7 @@ class Logger; class Node; class NodeArg; +class NodeUnit; // Get the min/max of a Clip operator. // If min/max are not known initializer tensors, will return false @@ -34,7 +35,10 @@ bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logg */ class NodeAttrHelper { public: - NodeAttrHelper(const onnxruntime::Node& node); + NodeAttrHelper(const Node& node); + + // Get the attributes from the target node of the node_unit + NodeAttrHelper(const NodeUnit& node_unit); float Get(const std::string& key, float def_val) const; @@ -52,7 +56,7 @@ class NodeAttrHelper { bool HasAttr(const std::string& key) const; private: - const onnxruntime::NodeAttributes& node_attributes_; + const NodeAttributes& node_attributes_; }; } // namespace onnxruntime From 61b7e7e4ad23e9c98b83e491d368360bd4ce4b55 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 10 Jan 2022 13:42:37 -0800 Subject: [PATCH 14/23] move conv to node_unit --- .../nnapi/nnapi_builtin/builders/helper.cc | 21 +++++ .../nnapi/nnapi_builtin/builders/helper.h | 1 + .../nnapi_builtin/builders/op_builder.cc | 91 ++++++++++++++----- 3 files changed, 91 insertions(+), 22 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 71f5628211655..c0ded705a6c18 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -68,6 +68,27 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) { return QLinearOpType::Unknown; } +ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& initializers) { + NodeAttrHelper helper(node_unit); + const auto group = helper.Get("group", 1); + + const auto& weight = node_unit.Inputs()[1].node_arg.Name(); + const auto& weight_tensor = *initializers.at(weight); + + // For ONNX we only have 1 conv ops + // For NNAPI we have 3 + // Input is (N, C, H, W) + // group == 1, --> regular conv + // group != 1 && weight is (M, 1, kH, kW), --> depthwise conv + // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv + if (group == 1) + return ConvType::Regular; + else if ((weight_tensor.dims()[1] == 1)) + return ConvType::Depthwise; + else + return ConvType::Grouped; +} + ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) { const auto& op_type = node.OpType(); bool is_qlinear_conv = (op_type == "QLinearConv"); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 8a87f119c72cf..281157539c3dc 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -98,6 +98,7 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node); // Return the type of the conv ops, // This function assumes the input is a 2d conv node ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers); +ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& initializers); // This qlinear op is an operator takes 2 
inputs and produces 1 output // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 141319bb2be3f..0b734a4a62773 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -580,9 +580,6 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( return Status::OK(); } -// Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) -// QLinearConv, QLinearMatmul, QLinearAdd -// a, b are inputs, and y is output static Status GetBinaryOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& b_scale, float& y_scale, @@ -611,6 +608,61 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint( // If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor // will be convert to uint8 later, will return the same scale and 128 as zero point // Also will set is_per_tensor_u8s8 to true to be used later +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( + const ModelBuilder& model_builder, const NodeUnit& node_unit, + float& a_scale, float& w_scale, float& y_scale, + int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, + optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( + const ModelBuilder& model_builder, const NodeUnit& node_unit, + float& a_scale, float& w_scale, float& y_scale, + int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, + optional>& w_scales, bool& is_per_tensor_u8s8) { + is_per_tensor_u8s8 = false; + const auto& initializers(model_builder.GetInitializerTensors()); + // Get scale and zero points + // We will handle per-channel weight scale and zero point later + ORT_RETURN_IF_ERROR( + GetBinaryOpQuantizationScaleAndZeroPoint_nu(initializers, node_unit, + a_scale, w_scale, y_scale, + a_zero_point, w_zero_point, y_zero_point)); + + const auto& inputs = node_unit.Inputs(); + const auto& weight_tensor = *initializers.at(inputs[1].node_arg.Name()); + + // We are done here is this is u8u8 QLinearConv + if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8) + return Status::OK(); + + // This is per-tensor u8s8 + // NNAPI does not support per-tensor u8s8 + // For this case we will need to convert the int8 weight tensor to uint8 + // And have same scale and 128 as zero point + // The conversion of the weight tensor itself will be done in the OpBuilder + const auto& scale_tensor = *initializers.at(inputs[1].quant_param->scale.Name()); + int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; + if (scale_dim == 1) { + w_zero_point = 128; + is_per_tensor_u8s8 = true; + return Status::OK(); + } + + // Now we have u8s8 per-channel QlinearConv + // u8s8 QlinearConv always have 0 as zero point so we are not getting it here + // and we do not use w_scale here, so we reset them back to 0 + w_scale = 0.0f; + w_zero_point = 0; + + // We need to copy the 1d scales array for per-channel quantization + std::vector unpacked_tensor; + ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(scale_tensor, unpacked_tensor)); + const float* scales = reinterpret_cast(unpacked_tensor.data()); + const size_t scales_size = scale_tensor.dims().empty() ? 
1 : scale_tensor.dims()[0]; + std::vector scales_vec(scales, scales + scales_size); + w_scales = onnxruntime::make_optional(std::move(scales_vec)); + return Status::OK(); +} + static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& w_scale, float& y_scale, @@ -1423,13 +1475,12 @@ void ConvOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod } Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& initializers(model_builder.GetInitializerTensors()); - NodeAttrHelper helper(node); - const auto input_defs = node.InputDefs(); + NodeAttrHelper helper(node_unit); + const auto inputs = node_unit.Inputs(); bool is_qlinear_conv = IsQuantizedOp(node_unit); // onnx strides are in the order height, width @@ -1445,11 +1496,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N const auto onnx_dilations = helper.Get("dilations", std::vector{1, 1}); const auto group = helper.Get("group", 1); - size_t x_idx = 0, - w_idx = is_qlinear_conv ? 3 : 1, - b_idx = is_qlinear_conv ? 8 : 2; - - auto input = input_defs[x_idx]->Name(); + auto input = inputs[0].node_arg.Name(); bool use_nchw = model_builder.UseNCHW(); bool input_is_nhwc = model_builder.IsOperandNHWC(input); bool output_is_nhwc = false; @@ -1458,13 +1505,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node, x_idx, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); } } - const auto& weight = input_defs[w_idx]->Name(); + const auto& weight = inputs[1].node_arg.Name(); const auto& weight_tensor = *initializers.at(weight); - auto conv_type = GetConvType(node, model_builder.GetGraphViewer().GetAllInitializedTensors()); + auto conv_type = GetConvType_nu(node_unit, model_builder.GetInitializerTensors()); bool conv_2d = (conv_type == ConvType::Regular), depthwise_conv_2d = (conv_type == ConvType::Depthwise), grouped_conv_2d = (conv_type == ConvType::Grouped); @@ -1480,10 +1527,10 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N optional> w_scales; bool is_per_tensor_u8s8 = false; if (is_qlinear_conv) { - ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, - x_scale, w_scale, y_scale, - x_zero_point, w_zero_point, y_zero_point, - w_scales, is_per_tensor_u8s8)); + ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, + x_scale, w_scale, y_scale, + x_zero_point, w_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } Shape onnx_weight_shape; @@ -1536,8 +1583,8 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N ORT_RETURN_IF_ERROR(IsValidConvWeightQuantizedType(model_builder, weight, w_scale, w_zero_point, w_scales)); } - bool hasBias = (input_defs.size() > b_idx); - std::string bias = hasBias ? input_defs[b_idx]->Name() : weight + "_bias"; + bool hasBias = (inputs.size() > 2); + std::string bias = hasBias ? 
inputs[2].node_arg.Name() : weight + "_bias"; if (!hasBias) { const auto weight_dimen = shaper[weight]; Shape bias_dimen; @@ -1613,7 +1660,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } } - int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]); + int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[1].node_arg); ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code); if (model_builder.GetNNAPIFeatureLevel() > ANEURALNETWORKS_FEATURE_LEVEL_2) { @@ -1631,7 +1678,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } int32_t operationCode; - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); if (conv_2d || grouped_conv_2d) { operationCode = conv_2d ? ANEURALNETWORKS_CONV_2D : ANEURALNETWORKS_GROUPED_CONV_2D; From 7aa4f9d91f233423d36cbfc55d3ebd4ba944c916 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 10 Jan 2022 14:11:39 -0800 Subject: [PATCH 15/23] move gemm/cast to node_unit --- .../nnapi_builtin/builders/op_builder.cc | 119 +++--------------- 1 file changed, 19 insertions(+), 100 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 0b734a4a62773..f116b37d200e2 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -580,25 +580,6 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( return Status::OK(); } -static Status GetBinaryOpQuantizationScaleAndZeroPoint( - const ModelBuilder& model_builder, const Node& node, - float& a_scale, float& b_scale, float& y_scale, - int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) ORT_MUST_USE_RESULT; -static Status GetBinaryOpQuantizationScaleAndZeroPoint( - const ModelBuilder& model_builder, const Node& node, - float& a_scale, float& b_scale, float& y_scale, - int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) { - const auto& initializers = model_builder.GetInitializerTensors(); - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 1, a_scale)); - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 4, b_scale)); - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 6, y_scale)); - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 2, a_zero_point)); - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 5, b_zero_point)); - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 7, y_zero_point)); - - return Status::OK(); -} - // Get scale and zero point for // [QlinearConv] input, weight, output // [QlinearMatMul] A, B, Y @@ -663,61 +644,6 @@ static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( return Status::OK(); } -static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( - const ModelBuilder& model_builder, const Node& node, - float& a_scale, float& w_scale, float& y_scale, - int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; -static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( - const ModelBuilder& model_builder, const Node& node, - float& a_scale, float& w_scale, float& y_scale, - int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales, bool& is_per_tensor_u8s8) { - 
is_per_tensor_u8s8 = false; - // Get scale and zero points - // We will handle per-channel weight scale and zero point later - ORT_RETURN_IF_ERROR( - GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node, - a_scale, w_scale, y_scale, - a_zero_point, w_zero_point, y_zero_point)); - - const auto input_defs = node.InputDefs(); - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& weight_tensor = *initializers.at(input_defs[3]->Name()); - - // We are done here is this is u8u8 QLinearConv - if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8) - return Status::OK(); - - // This is per-tensor u8s8 - // NNAPI does not support per-tensor u8s8 - // For this case we will need to convert the int8 weight tensor to uint8 - // And have same scale and 128 as zero point - // The conversion of the weight tensor itself will be done in the OpBuilder - const auto& scale_tensor = *initializers.at(input_defs[4]->Name()); - int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; - if (scale_dim == 1) { - w_zero_point = 128; - is_per_tensor_u8s8 = true; - return Status::OK(); - } - - // Now we have u8s8 per-channel QlinearConv - // u8s8 QlinearConv always have 0 as zero point so we are not getting it here - // and we do not use w_scale here, so we reset them back to 0 - w_scale = 0.0f; - w_zero_point = 0; - - // We need to copy the 1d scales array for per-channel quantization - std::vector unpacked_tensor; - ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(scale_tensor, unpacked_tensor)); - const float* scales = reinterpret_cast(unpacked_tensor.data()); - const size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; - std::vector scales_vec(scales, scales + scales_size); - w_scales = onnxruntime::make_optional(std::move(scales_vec)); - return Status::OK(); -} - // NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType // ONNX has the quantization scale and zero point as the inputs of the qlinear operators // We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs @@ -1710,13 +1636,12 @@ class CastOpBuilder : public BaseOpBuilder { }; Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); auto to = helper.Get("to", 0); @@ -1875,25 +1800,19 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod } Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& initializers(model_builder.GetInitializerTensors()); - const auto& op = node.OpType(); - const auto input_defs(node.InputDefs()); - NodeAttrHelper helper(node); + const auto& op = node_unit.OpType(); + const auto& inputs = 
node_unit.Inputs(); + NodeAttrHelper helper(node_unit); bool is_qlinear_matmul = op == "QLinearMatMul"; - size_t a_idx = 0, - b_idx = is_qlinear_matmul ? 3 : 1, - c_idx = 2; // QLinearMatMul has no bias - - const auto& input1 = input_defs[a_idx]->Name(); - const auto& input2 = input_defs[b_idx]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input1 = inputs[0].node_arg.Name(); + const auto& input2 = inputs[1].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); const auto transB = helper.Get("transB", 0); float a_scale = 0.0f, @@ -1907,10 +1826,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (is_qlinear_matmul) { optional> w_scales; ORT_RETURN_IF_ERROR( - GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, - a_scale, b_scale, y_scale, - a_zero_point, b_zero_point, y_zero_point, - w_scales, is_per_tensor_u8s8)); + GetConvMatMulOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, + a_scale, b_scale, y_scale, + a_zero_point, b_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } uint32_t input_2_idx; @@ -1939,14 +1858,14 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } uint32_t bias_idx; - bool has_bias = (op == "Gemm") && (input_defs.size() > 2); + bool has_bias = inputs.size() > 2; if (has_bias) { - const auto& bias = input_defs[c_idx]->Name(); + const auto& bias = inputs[2].node_arg.Name(); // We need squeeze the input tensor to 1d if necessary if (shaper[bias].size() > 1) { - std::string bias_squeezed = model_builder.GetUniqueName(node.Name() + op + "_bias_squeezed"); + std::string bias_squeezed = model_builder.GetUniqueName(node_unit.Name() + op + "_bias_squeezed"); // We will use squeeze all here - ORT_RETURN_IF_ERROR(AddSqueezeOp(model_builder, node.Name(), + ORT_RETURN_IF_ERROR(AddSqueezeOp(model_builder, node_unit.Name(), bias, bias_squeezed, {} /* axes */)); bias_idx = operand_indices.at(bias_squeezed); @@ -1959,7 +1878,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } } else { // No C supplied, we need a vector of 0 - std::string bias = model_builder.GetUniqueName(node.Name() + op + "_bias"); + std::string bias = model_builder.GetUniqueName(node_unit.Name() + op + "_bias"); const auto& bias_type = operand_types.at(input2).type; const Shape& bias_dimen = {shaper[input2][0]}; if (bias_type == Type::TENSOR_FLOAT32) { @@ -1981,7 +1900,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N input_indices.push_back(operand_indices.at(input1)); // A input_indices.push_back(input_2_idx); // B input_indices.push_back(bias_idx); // C - int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]); + int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code); ORT_RETURN_IF_ERROR(shaper.FC(input1, input2, output)); From cc6d468823f168bb5c2d93063bd1ea04d29d1677 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 10 Jan 2022 21:13:48 -0800 Subject: [PATCH 16/23] remove redundant functions --- .../nnapi/nnapi_builtin/builders/helper.cc | 68 +--- .../nnapi/nnapi_builtin/builders/helper.h | 15 +- .../nnapi_builtin/builders/model_builder.cc | 62 --- .../nnapi_builtin/builders/model_builder.h | 2 - .../nnapi_builtin/builders/op_builder.cc | 354 +++++++----------- .../builders/op_support_checker.cc | 179 ++++----- 6 files changed, 225 insertions(+), 455 
deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index c0ded705a6c18..3cf271ae317b8 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -89,32 +89,6 @@ ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& i return ConvType::Grouped; } -ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) { - const auto& op_type = node.OpType(); - bool is_qlinear_conv = (op_type == "QLinearConv"); - ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv); - - NodeAttrHelper helper(node); - const auto group = helper.Get("group", 1); - - size_t w_idx = is_qlinear_conv ? 3 : 1; - const auto& weight = node.InputDefs()[w_idx]->Name(); - const auto& weight_tensor = *initializers.at(weight); - - // For ONNX we only have 1 conv ops - // For NNAPI we have 3 - // Input is (N, C, H, W) - // group == 1, --> regular conv - // group != 1 && weight is (M, 1, kH, kW), --> depthwise conv - // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv - if (group == 1) - return ConvType::Regular; - else if ((weight_tensor.dims()[1] == 1)) - return ConvType::Depthwise; - else - return ConvType::Grouped; -} - bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) { return qlinear_op_type == QLinearOpType::QLinearConv || qlinear_op_type == QLinearOpType::QLinearMatMul || @@ -346,35 +320,6 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co return true; } -common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, - size_t idx, float& scale) { - std::vector unpacked_tensor; - const auto& name = node.InputDefs()[idx]->Name(); - const auto& scale_tensor = *initializers.at(name); - ORT_RETURN_IF_ERROR( - onnxruntime::utils::UnpackInitializerData(scale_tensor, node.ModelPath(), unpacked_tensor)); - - // The scale should be one or more floats - ORT_RETURN_IF(unpacked_tensor.size() < 4, "The initializer [", name, "] should have one or more floats ", - "with size no less than 4, actual size: ", unpacked_tensor.size()); - scale = reinterpret_cast(unpacked_tensor.data())[0]; - return Status::OK(); -} - -common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, - const Node& node, size_t idx, int32_t& zero_point) { - std::vector unpacked_tensor; - const auto& name = node.InputDefs()[idx]->Name(); - const auto& zero_point_tensor = *initializers.at(name); - ORT_RETURN_IF_ERROR( - onnxruntime::utils::UnpackInitializerData(zero_point_tensor, node.ModelPath(), unpacked_tensor)); - - ORT_RETURN_IF(unpacked_tensor.empty(), "The initializer [", name, "] is empty"); - // Onnx quantization uses uint8 [int8 not yet supported], need to cast to int32_t used by NNAPI - zero_point = static_cast(unpacked_tensor[0]); - return Status::OK(); -} - common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point) { @@ -461,9 +406,9 @@ bool GetType(const NodeArg& node_arg, int32_t& type) { return true; } -void GetFlattenOutputShape(const Node& node, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2) { +void GetFlattenOutputShape_nu(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2) { int32_t rank = static_cast(input_shape.size()); - 
NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); int32_t axis = helper.Get("axis", 1); // axis == rank is a valid input, but invalid for HandleNegativeAxis // Skip non-negative axis here @@ -589,10 +534,11 @@ std::string Shape2String(const std::vector& shape) { return os.str(); } -bool CheckIsInitializer(const InitializedTensorSet& initializers, const Node& node, - size_t input_idx, const char* input_name) { - if (!Contains(initializers, node.InputDefs()[input_idx]->Name())) { - LOGS_DEFAULT(VERBOSE) << input_name << " of " << node.OpType() << " must be an initializer tensor"; +bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::string& input_name, const char* input_string) { + if (!Contains(initializers, input_name)) { + LOGS_DEFAULT(VERBOSE) << input_string << " of " << node_unit.Name() << " of type [" + << node_unit.OpType() << "] must be an initializer tensor"; return false; } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 281157539c3dc..b3e3ffe43f072 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -97,7 +97,6 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node); // Return the type of the conv ops, // This function assumes the input is a 2d conv node -ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers); ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& initializers); // This qlinear op is an operator takes 2 inputs and produces 1 output // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... @@ -117,12 +116,6 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::vector& indices, bool is_input); -common::Status GetQuantizationScale(const InitializedTensorSet& initializers, const Node& node, - size_t idx, float& scale); - -common::Status GetQuantizationZeroPoint(const InitializedTensorSet& initializers, - const Node& node, size_t idx, int32_t& zero_point) ORT_MUST_USE_RESULT; - common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point); @@ -137,7 +130,7 @@ bool GetShape(const NodeArg& node_arg, Shape& shape); bool GetType(const NodeArg& node_arg, int32_t& type); // Get the output shape of Flatten Op -void GetFlattenOutputShape(const Node& node, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2); +void GetFlattenOutputShape_nu(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2); // If a node is supported by NNAPI bool IsNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph_viewer, const OpSupportCheckParams& params); @@ -158,8 +151,10 @@ bool IsValidSupportedNodeGroup(const std::vector& supported_node_gr std::string Shape2String(const std::vector& shape); // Check the given input is an initializer tensor -bool CheckIsInitializer(const InitializedTensorSet& initializers, const Node& node, - size_t index, const char* input_name) ORT_MUST_USE_RESULT; +// input_name is the name of the initializer +// input_string is the string describing the input in the output message (if any) +bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const
std::string& input_name, const char* input_string) ORT_MUST_USE_RESULT; } // namespace nnapi } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index a96c67960b7d3..7b5ac1aa6b0fb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -55,7 +55,6 @@ Status ModelBuilder::Prepare() { PreprocessNodeUnits(); GetAllQuantizedOpInputs(); PreprocessInitializers(); - PreprocessActivations(); PreprocessActivations_nu(); ORT_RETURN_IF_ERROR(RegisterInitializers()); ORT_RETURN_IF_ERROR(RegisterModelInputs()); @@ -123,28 +122,6 @@ void ModelBuilder::PreprocessInitializers() { } } -void ModelBuilder::PreprocessActivations() { - const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder(); - for (size_t i = 0; i < node_indices.size(); i++) { - const auto* node(graph_viewer_.GetNode(node_indices[i])); - const auto& op_type(node->OpType()); - - if (op_type == "Relu") { - activation_nodes_.emplace(node->Index(), ANEURALNETWORKS_FUSED_RELU); - } else if (op_type == "Clip") { // Relu1 or Relu6 - float min, max; - if (!GetClipMinMax(GetInitializerTensors(), *node, min, max, logging::LoggingManager::DefaultLogger())) - continue; - - if (min == -1.0f && max == 1.0f) { - activation_nodes_.emplace(node->Index(), ANEURALNETWORKS_FUSED_RELU1); - } else if (min == 0.0f && max == 6.0f) { - activation_nodes_.emplace(node->Index(), ANEURALNETWORKS_FUSED_RELU6); - } - } - } -} - void ModelBuilder::PreprocessActivations_nu() { for (const auto& node_unit : node_unit_holder_) { const auto& node = node_unit->GetNode(); @@ -692,45 +669,6 @@ int32_t ModelBuilder::FindActivation_nu(const NodeUnit& node_unit, const NodeArg return fuse_code; } -int32_t ModelBuilder::FindActivation(const Node& node, const NodeArg& output) { - int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; - - // We do not support activation fusion for quantized operators for now - auto qlinear_op_type = GetQLinearOpType(node); - if (qlinear_op_type != QLinearOpType::Unknown) - return fuse_code; - - for (auto it = node.OutputEdgesBegin(), end = node.OutputEdgesEnd(); it != end; ++it) { - const auto& dst_node = it->GetNode(); - const auto* dst_input = dst_node.InputDefs()[it->GetDstArgIndex()]; - if (Contains(activation_nodes_, dst_node.Index())) { - if (&output == dst_input) { - fuse_code = activation_nodes_.at(dst_node.Index()); - } - } else { - // if there is any other non-relu node using the output - // will add relu separately - if (&output == dst_input) - return ANEURALNETWORKS_FUSED_NONE; - } - } - - // if output is a graph output, will add relu separately - if (fuse_code != ANEURALNETWORKS_FUSED_NONE) { - for (const auto* graph_output : graph_viewer_.GetOutputs()) { - if (&output == graph_output) - return ANEURALNETWORKS_FUSED_NONE; - } - - LOGS_DEFAULT(VERBOSE) << "Node [" << node.Name() << "] type [" << node.OpType() - << "], fused the output [" << output.Name() << "]"; - - fused_activations_.insert(output.Name()); - } - - return fuse_code; -} - /* static */ const IOpBuilder* ModelBuilder::GetOpBuilder(const NodeUnit& node_unit) { const auto& op_builders = GetOpBuilders(); const auto& op_type = node_unit.GetNode().OpType(); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 
07b8035d96f5a..45a05773b853c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -48,7 +48,6 @@ class ModelBuilder { const std::vector& is_nhwc_vec) ORT_MUST_USE_RESULT; // Find if an output has a fuseable activation (Relu) - int32_t FindActivation(const Node& node, const NodeArg& output); int32_t FindActivation_nu(const NodeUnit& node_unit, const NodeArg& output); // Add an NNAPI scalar operand @@ -185,7 +184,6 @@ class ModelBuilder { // If a NNAPI operation will use initializers directly, we will add the initializers to the skip list void PreprocessInitializers(); // Preprocess all the activation nodes (Relu/Relu1/Relu6) for easy query later - void PreprocessActivations(); void PreprocessActivations_nu(); // Copy and process all the initializers to NNAPI model Status RegisterInitializers() ORT_MUST_USE_RESULT; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index f116b37d200e2..da6e1e9841620 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -148,28 +148,6 @@ Status GetNCHWInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, s return Status::OK(); } -// Convert the input from nchw to nhwc -// Caller should ensure input is currently in nchw format using ModelBuilder::IsOperandNHWC -Status GetNHWCInput(ModelBuilder& model_builder, const Node& node, size_t input_index, std::string& input) { - const auto& nchw_input = node.InputDefs()[input_index]->Name(); - if (!model_builder.GetNHWCOperand(nchw_input, input)) { - input = model_builder.GetUniqueName(nchw_input + "_nchw_to_nhwc"); - ORT_RETURN_IF_ERROR(TransposeNCHWToNHWC(model_builder, nchw_input, input)); - } - return Status::OK(); -} - -// Convert the input from nhwc to nchw -// Caller should ensure input is currently in nhwc format using ModelBuilder::IsOperandNHWC -Status GetNCHWInput(ModelBuilder& model_builder, const Node& node, size_t input_index, std::string& input) { - const auto& nhwc_input = node.InputDefs()[input_index]->Name(); - if (!model_builder.GetNCHWOperand(nhwc_input, input)) { - input = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw"); - ORT_RETURN_IF_ERROR(TransposeNHWCToNCHW(model_builder, nhwc_input, input)); - } - return Status::OK(); -} - // Transpose layouts if necessary for element wise operators with 2 inputs // and return the layout type of output tensor // If both inputs have same layout, the output will have the same layout @@ -197,35 +175,6 @@ Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUn return Status::OK(); } -// Transpose layouts if necessary for element wise operators with 2 inputs -// and return the layout type of output tensor -// If both inputs have same layout, the output will have the same layout -// Otherwise we will need transpose the nhwc input back to nchw, and output will be nchw -Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const Node& node, - size_t input1_idx, size_t input2_idx, - std::string& input1, std::string& input2, - bool& output_is_nhwc) ORT_MUST_USE_RESULT; -Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const Node& node, - size_t input1_idx, size_t input2_idx, - std::string& input1, std::string& input2, - bool& output_is_nhwc) { - bool input1_is_nhwc = 
model_builder.IsOperandNHWC(input1); - bool input2_is_nhwc = model_builder.IsOperandNHWC(input2); - output_is_nhwc = false; - - if (input1_is_nhwc == input2_is_nhwc) { - output_is_nhwc = input1_is_nhwc; - } else if (input1_is_nhwc) { - // need transpose input1 back to nchw - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, input1_idx, input1)); - } else { // input2_is_nhwc - // need transpose input2 back to nchw - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, input2_idx, input2)); - } - - return Status::OK(); -} - static Status AddBinaryOperator(int32_t op_type, ModelBuilder& model_builder, const std::string& input1, @@ -746,16 +695,20 @@ class BaseOpBuilder : public IOpBuilder { protected: virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT = 0; + static bool IsOpSupported(const ModelBuilder& model_builder, const NodeUnit& node_unit) ORT_MUST_USE_RESULT; }; -Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const { +/* static */ bool BaseOpBuilder::IsOpSupported(const ModelBuilder& model_builder, const NodeUnit& node_unit) { OpSupportCheckParams params{ model_builder.GetNNAPIFeatureLevel(), model_builder.UseNCHW(), }; - ORT_RETURN_IF_NOT(IsNodeSupported(node_unit, model_builder.GetGraphViewer(), params), - "Unsupported operator ", node_unit.OpType()); + return IsNodeSupported(node_unit, model_builder.GetGraphViewer(), params); +} + +Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const { + ORT_RETURN_IF_NOT(IsOpSupported(model_builder, node_unit), "Unsupported operator ", node_unit.OpType()); ORT_RETURN_IF_ERROR(AddToModelBuilderImpl(model_builder, node_unit)); LOGS_DEFAULT(VERBOSE) << "Operator name: [" << node_unit.Name() << "] type: [" << node_unit.OpType() << "] was added"; @@ -951,12 +904,13 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co class ReshapeOpBuilder : public BaseOpBuilder { public: void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override; - static Status AddReshapeOperator(ModelBuilder& model_builder, const Node& node, + static Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input, const std::vector& shape) ORT_MUST_USE_RESULT; private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; - static bool CanSkipReshape(const ModelBuilder& model_builder, const Node& node, size_t input_rank, size_t output_rank); + static bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, + size_t input_rank, size_t output_rank); }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -975,25 +929,34 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const // between NNAPI CPU impl and Hardware Accelerator impl and will speed up the execution // If we are going to skip the reshape, we will still add correct shape and operand type for the output in // onnxruntime::nnapi::Model. 
-/* static */ bool ReshapeOpBuilder::CanSkipReshape(const ModelBuilder& model_builder, const Node& node, +/* static */ bool ReshapeOpBuilder::CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_rank, size_t output_rank) { - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output_node_arg = node_unit.Outputs()[0].node_arg; + const auto& output_name = output_node_arg.Name(); + const auto& output_node = *node_unit.GetOutputNodes()[0]; + // We will go through all the output edges - for (auto it = node.OutputEdgesBegin(), end = node.OutputEdgesEnd(); it != end; ++it) { - const auto& op_type = it->GetNode().OpType(); + for (auto it = output_node.OutputEdgesBegin(), end = output_node.OutputEdgesEnd(); it != end; ++it) { + const auto& dest_node_unit = model_builder.GetNodeUnit(&it->GetNode()); + const auto& op_type = dest_node_unit.OpType(); // TODO add quantized matmul when reshape support quantized input if (op_type != "Gemm" && op_type != "MatMul") { LOGS_DEFAULT(VERBOSE) << "Reshape/Flatten can only be skipped when the output is Gemm/Matmul" << " or no op is using the output (output is graph output)" - << ", output name, " << output + << ", output name, " << output_name << " is used by " << op_type; return false; } + // Now the dest node is Gemm/Matmul, we want to make sure it is supported + if (!BaseOpBuilder::IsOpSupported(model_builder, dest_node_unit)) { + return false; + } + // NNAPI ANEURALNETWORKS_FULLY_CONNECTED will only flatten the input 0 - if (it->GetDstArgIndex() != 0) { + if (&output_node_arg != &dest_node_unit.Inputs()[0].node_arg) { LOGS_DEFAULT(VERBOSE) << "Reshape/Flatten can only be skipped when the output is input 0 of Gemm/Matmul" - << ", output name, " << output; + << ", output name, " << output_name; return false; } @@ -1001,7 +964,7 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const // And NNAPI ANEURALNETWORKS_FULLY_CONNECTED will only flatten input rank >= 2 if (input_rank < 2 || output_rank != 2) { LOGS_DEFAULT(VERBOSE) << "Reshape/Flatten can only be skipped when input_rank >= 2 and output_rank == 2" - << ", output name, " << output + << ", output name, " << output_name << ", the actual input_rank, " << input_rank << ", the actual output_rank, " << output_rank; return false; @@ -1012,26 +975,26 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const // Check if the Reshape output is a graph output, if so we cannot skip the Reshape // We do not care the case where the Reshape output is a dead end for (const auto* node_arg : model_builder.GetGraphViewer().GetOutputs()) { - if (node_arg->Name() == output) { + if (node_arg == &output_node_arg) { LOGS_DEFAULT(VERBOSE) << "Reshape/Flatten can not be skipped when the output is a graph output" - << ", output name, " << output; + << ", output name, " << output_name; return false; } } LOGS_DEFAULT(VERBOSE) << "Skipping Reshape/Flatten node [" - << node.Name() << "] with output, " << output + << node_unit.Name() << "] with output, " << output_name; return true; } /* static */ Status ReshapeOpBuilder::AddReshapeOperator(ModelBuilder& model_builder, - const Node& node, + const NodeUnit& node_unit, const std::string& input, const std::vector& shape) { auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output = 
node_unit.Outputs()[0].node_arg.Name(); ORT_RETURN_IF_ERROR(shaper.Reshape(input, shape, output)); auto input_rank = shaper[input].size(); auto output_rank = shaper[output].size(); @@ -1039,7 +1002,7 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const // Since Reshape is not running using hardware in NNAPI for some CPU (e.g. Qualcomm SD for now) // We will try to see if we the skip the Reshape to prevent context switching between // NNAPI CPU impl and NNAPI hardware accelerator impl - if (CanSkipReshape(model_builder, node, input_rank, output_rank)) { + if (CanSkipReshape(model_builder, node_unit, input_rank, output_rank)) { // Since reshape can be skipped, only register the dimension and type, with same index and new name const OperandType output_operand_type(operand_types.at(input).type, shaper[output]); model_builder.RegisterOperand(output, operand_indices.at(input), output_operand_type, false); @@ -1050,7 +1013,7 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const input_indices.push_back(operand_indices.at(input)); // Add new shape Shape shape_dimen = {static_cast(shape.size())}; - std::string shape_name = model_builder.GetUniqueName(node.Name() + input + "newshape"); + std::string shape_name = model_builder.GetUniqueName(node_unit.Name() + input + "newshape"); OperandType shape_operand_type(Type::TENSOR_INT32, shape_dimen); ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(shape_name, shape.data(), shape_operand_type)); input_indices.push_back(operand_indices.at(shape_name)); @@ -1063,17 +1026,16 @@ void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& initializers(model_builder.GetInitializerTensors()); - auto input = node.InputDefs()[0]->Name(); + auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before reshape - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); } - const auto& shape_tensor = *initializers.at(node.InputDefs()[1]->Name()); + const auto& shape_tensor = *initializers.at(node_unit.Inputs()[1].node_arg.Name()); std::vector unpacked_tensor; ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(shape_tensor, unpacked_tensor)); const int64_t* raw_shape = reinterpret_cast(unpacked_tensor.data()); @@ -1087,7 +1049,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons shape[i] = dim == 0 ? 
input_shape[i] : dim; } - return AddReshapeOperator(model_builder, node, input, shape); + return AddReshapeOperator(model_builder, node_unit, input, shape); } #pragma endregion op_reshape @@ -1111,21 +1073,21 @@ void BatchNormalizationOpBuilder::AddInitializersToSkip(ModelBuilder& model_buil } Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& initializers(model_builder.GetInitializerTensors()); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); + const auto& inputs = node_unit.Inputs(); // For reshape we are not really doing anything but // register a new operand with new shape - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = inputs[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); - const auto& scale_tensor = *initializers.at(node.InputDefs()[1]->Name()); - const auto& bias_tensor = *initializers.at(node.InputDefs()[2]->Name()); - const auto& mean_tensor = *initializers.at(node.InputDefs()[3]->Name()); - const auto& var_tensor = *initializers.at(node.InputDefs()[4]->Name()); + const auto& scale_tensor = *initializers.at(inputs[1].node_arg.Name()); + const auto& bias_tensor = *initializers.at(inputs[2].node_arg.Name()); + const auto& mean_tensor = *initializers.at(inputs[3].node_arg.Name()); + const auto& var_tensor = *initializers.at(inputs[4].node_arg.Name()); const auto eps = helper.Get("epsilon", 1e-5f); const auto size = SafeInt(scale_tensor.dims()[0]); @@ -1155,9 +1117,9 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu bias_data[i]); } - const auto tensor_a_name = model_builder.GetUniqueName(node.Name() + input + "_imm_a"); - const auto tensor_b_name = model_builder.GetUniqueName(node.Name() + input + "_imm_b"); - const auto tensor_imm_product_name = model_builder.GetUniqueName(node.Name() + input + "_imm_mul"); + const auto tensor_a_name = model_builder.GetUniqueName(node_unit.Name() + input + "_imm_a"); + const auto tensor_b_name = model_builder.GetUniqueName(node_unit.Name() + input + "_imm_b"); + const auto tensor_imm_product_name = model_builder.GetUniqueName(node_unit.Name() + input + "_imm_mul"); Shape tensor_a_dimen = {size}; bool input_is_nhwc = model_builder.IsOperandNHWC(input); @@ -1191,7 +1153,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu output_is_nhwc)); // Add - int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]); + int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); ORT_RETURN_IF_ERROR(AddBinaryOperator(ANEURALNETWORKS_ADD, model_builder, tensor_imm_product_name, tensor_b_name, @@ -1676,21 +1638,20 @@ class SoftMaxOpBuilder : public BaseOpBuilder { }; Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto android_feature_level = model_builder.GetNNAPIFeatureLevel(); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); - auto input = node.InputDefs()[0]->Name(); + auto input = 
node_unit.Inputs()[0].node_arg.Name(); bool input_is_nhwc = model_builder.IsOperandNHWC(input); bool output_is_nhwc = input_is_nhwc; if (android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { if (model_builder.IsOperandNHWC(input)) { output_is_nhwc = false; // We want to transpose nhwc operand back to nchw before softmax - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); } } @@ -1700,7 +1661,7 @@ Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons axis = axis_nchw_to_nhwc[axis]; } - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); float beta = 1.f; std::vector input_indices; input_indices.push_back(operand_indices.at(input)); @@ -1728,16 +1689,14 @@ class IdentityOpBuilder : public BaseOpBuilder { }; Status IdentityOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - // Identity is not really going to do anything // Just register the dimension and type, with same index and new name auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); std::vector input_indices; @@ -1791,7 +1750,7 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod if (op == "MatMul") { model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); } else if (op == "Gemm") { - NodeAttrHelper helper(node_unit.GetNode()); + NodeAttrHelper helper(node_unit); const auto transB = helper.Get("transB", 0); if (transB == 0) model_builder.AddInitializerToSkip(inputs[1].node_arg.Name()); @@ -1956,14 +1915,13 @@ void UnaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const No } Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& op_type(node.OpType()); + const auto& op_type(node_unit.OpType()); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); ORT_RETURN_IF_ERROR(shaper.Identity(input, output)); @@ -1997,9 +1955,9 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const if (is_qlinear_sigmoid) { const auto& initializers = model_builder.GetInitializerTensors(); float x_scale = 0.0f; - ORT_RETURN_IF_ERROR(GetQuantizationScale(initializers, node, 1, x_scale)); int32_t x_zero_point = 0; - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(initializers, node, 2, x_zero_point)); + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Inputs()[0], node_unit.ModelPath(), x_scale, x_zero_point)); // Verify if the scale and zero point values from onnx input and nnapi input match 
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point)); @@ -2027,17 +1985,17 @@ class ConcatOpBuilder : public BaseOpBuilder { }; Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); + const auto& inputs = node_unit.Inputs(); std::vector input_indices; - const auto& input0 = node.InputDefs()[0]->Name(); + const auto& input0 = inputs[0].node_arg.Name(); bool all_input_have_same_layout = true; bool output_is_nhwc = false; - const auto node_input_size = node.InputDefs().size(); + const auto node_input_size = inputs.size(); // First if the inputs are uint8, we need verify all the inputs have same scale and zero points if (operand_types.at(input0).type == android::nn::wrapper::Type::TENSOR_QUANT8_ASYMM) { @@ -2046,7 +2004,7 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const // Compare scale and zp of input0 to input1~n for (size_t i = 1; i < node_input_size; i++) { - const auto& type = operand_types.at(node.InputDefs()[i]->Name()); + const auto& type = operand_types.at(inputs[i].node_arg.Name()); ORT_RETURN_IF_NOT(scale == type.operandType.scale, "Input[", i, "]'s scale: ", type.operandType.scale, " is different than input[0]'s scale: ", scale); @@ -2061,31 +2019,31 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const for (size_t i = 0; i < node_input_size - 1; i++) { all_input_have_same_layout = all_input_have_same_layout && - model_builder.IsOperandNHWC(node.InputDefs()[i]->Name()) == - model_builder.IsOperandNHWC(node.InputDefs()[i + 1]->Name()); + model_builder.IsOperandNHWC(inputs[i].node_arg.Name()) == + model_builder.IsOperandNHWC(inputs[i + 1].node_arg.Name()); } - std::vector inputs; - inputs.reserve(node_input_size); + std::vector input_names; + input_names.reserve(node_input_size); if (all_input_have_same_layout) { // if all the inputs are of same layout, output will be the same layout output_is_nhwc = model_builder.IsOperandNHWC(input0); for (size_t i = 0; i < node_input_size; i++) { - auto input = node.InputDefs()[i]->Name(); + auto input = inputs[i].node_arg.Name(); input_indices.push_back(operand_indices.at(input)); - inputs.push_back(input); + input_names.push_back(input); } } else { // if all the inputs are not same layout, // will need transpos those nhwc tensors back to nchw for (size_t i = 0; i < node_input_size; i++) { - auto input = node.InputDefs()[i]->Name(); + auto input = inputs[i].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, i, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, i, input)); } input_indices.push_back(operand_indices.at(input)); - inputs.push_back(input); + input_names.push_back(input); } } @@ -2101,8 +2059,8 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } ADD_SCALAR_OPERAND(model_builder, input_indices, axis); - const auto& output = node.OutputDefs()[0]->Name(); - ORT_RETURN_IF_ERROR(shaper.Concat(inputs, axis, output)); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); + ORT_RETURN_IF_ERROR(shaper.Concat(input_names, axis, output)); OperandType output_operand_type = 
operand_types.at(input0); output_operand_type.SetDimensions(shaper[output]); ORT_RETURN_IF_ERROR(model_builder.AddOperation(ANEURALNETWORKS_CONCATENATION, input_indices, @@ -2120,7 +2078,7 @@ class SqueezeOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; - static Status GetAxes(ModelBuilder& model_builder, const Node& node, std::vector& axes); + static Status GetAxes(ModelBuilder& model_builder, const NodeUnit& node_unit, std::vector& axes); }; void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2130,13 +2088,13 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } /* static */ Status SqueezeOpBuilder::GetAxes(ModelBuilder& model_builder, - const Node& node, std::vector& axes) { + const NodeUnit& node_unit, std::vector& axes) { // Squeeze opset 13 use input as axes - if (node.SinceVersion() > 12) { + if (node_unit.SinceVersion() > 12) { // If axes is not supplied, return an empty axes as default to squeeze all - if (node.InputDefs().size() > 1) { + if (node_unit.Inputs().size() > 1) { const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(node.InputDefs()[1]->Name()); + const auto& axes_tensor = *initializers.at(node_unit.Inputs()[1].node_arg.Name()); std::vector unpacked_tensor; ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(axes_tensor, unpacked_tensor)); const int64_t* raw_axes = reinterpret_cast(unpacked_tensor.data()); @@ -2148,7 +2106,7 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } } } else { - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); axes = helper.Get("axes", std::vector()); } @@ -2156,17 +2114,15 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - - auto input = node.InputDefs()[0]->Name(); + auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before squeeze - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); } std::vector axes; - ORT_RETURN_IF_ERROR(GetAxes(model_builder, node, axes)); - return AddSqueezeOp(model_builder, node.Name(), input, node.OutputDefs()[0]->Name(), axes); + ORT_RETURN_IF_ERROR(GetAxes(model_builder, node_unit, axes)); + return AddSqueezeOp(model_builder, node_unit.Name(), input, node_unit.Outputs()[0].node_arg.Name(), axes); } #pragma endregion @@ -2186,24 +2142,19 @@ void QuantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, } Status QuantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); - const auto input_defs(node.InputDefs()); - const auto& input = input_defs[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); float scale = 0.0f; - 
ORT_RETURN_IF_ERROR(GetQuantizationScale(model_builder.GetInitializerTensors(), node, 1, scale)); int32_t zero_point = 0; - Type output_type = Type::TENSOR_QUANT8_ASYMM; - - if (input_defs.size() == 3) { // Get zero point - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder.GetInitializerTensors(), node, 2, zero_point)); - } + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + model_builder.GetInitializerTensors(), node_unit.Outputs()[0], node_unit.ModelPath(), scale, zero_point)); + Type output_type = Type::TENSOR_QUANT8_ASYMM; ORT_RETURN_IF_ERROR(shaper.Identity(input, output)); const OperandType output_operand_type(output_type, shaper[output], scale, zero_point); std::vector input_indices; @@ -2230,22 +2181,18 @@ void DequantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builde } Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); - const auto input_defs(node.InputDefs()); + const auto& inputs = node_unit.Inputs(); - const auto& input = input_defs[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = inputs[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); float scale = 0.0; - ORT_RETURN_IF_ERROR(GetQuantizationScale(model_builder.GetInitializerTensors(), node, 1, scale)); int32_t zero_point = 0; - if (input_defs.size() == 3) { // Get zero point - ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder.GetInitializerTensors(), node, 2, zero_point)); - } + ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( + model_builder.GetInitializerTensors(), node_unit.Inputs()[0], node_unit.ModelPath(), scale, zero_point)); ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, scale, zero_point)); @@ -2269,21 +2216,20 @@ class LRNOpBuilder : public BaseOpBuilder { }; Status LRNOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto android_feature_level = model_builder.GetNNAPIFeatureLevel(); - auto input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + auto input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); if (android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { // on android api level 28, we need to transpose the nchw input to nhwc output_is_nhwc = true; if (!model_builder.IsOperandNHWC(input)) { - ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); } } @@ -2340,27 +2286,26 @@ void ClipOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod } Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& 
operand_types(model_builder.GetOperandTypes()); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); ORT_RETURN_IF_ERROR(shaper.Identity(input, output)); const OperandType output_operand_type(operand_types.at(input).type, shaper[output]); if (Contains(model_builder.GetFusedActivations(), input)) { - LOGS_DEFAULT(VERBOSE) << "Clip Node [" << node.Name() << "] fused"; + LOGS_DEFAULT(VERBOSE) << "Clip Node [" << node_unit.Name() << "] fused"; model_builder.RegisterOperand(output, operand_indices.at(input), output_operand_type, output_is_nhwc); return Status::OK(); } float min, max; - GetClipMinMax(model_builder.GetInitializerTensors(), node, min, max, logging::LoggingManager::DefaultLogger()); + GetClipMinMax(model_builder.GetInitializerTensors(), node_unit.GetNode(), min, max, + logging::LoggingManager::DefaultLogger()); int32_t op_code; if (min == 0.0f && max == 6.0f) @@ -2404,17 +2349,16 @@ void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N } Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& initializers(model_builder.GetInitializerTensors()); - NodeAttrHelper helper(node); - const auto input_defs = node.InputDefs(); + NodeAttrHelper helper(node_unit); + const auto& inputs = node_unit.Inputs(); const auto android_feature_level = model_builder.GetNNAPIFeatureLevel(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); - auto input = input_defs[0]->Name(); + auto input = inputs[0].node_arg.Name(); bool use_nchw = model_builder.UseNCHW(); bool input_is_nhwc = model_builder.IsOperandNHWC(input); bool output_is_nhwc = false; @@ -2423,7 +2367,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); } } @@ -2436,8 +2380,8 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const bool using_half_pixel = coord_trans_mode == "half_pixel"; bool using_align_corners = coord_trans_mode == "align_corners"; - if (input_defs.size() == 3) { // we are using scales - const auto& scales_name = input_defs[2]->Name(); + if (inputs.size() == 3) { // we are using scales + const auto& scales_name = inputs[2].node_arg.Name(); const auto& scales_tensor = *initializers.at(scales_name); std::vector unpacked_tensor; ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(scales_tensor, unpacked_tensor)); @@ -2447,7 +2391,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const ORT_RETURN_IF_ERROR( shaper.ResizeUsingScales(input, scale_h, scale_w, use_nchw, output)); } else { // we are using sizes - const auto& sizes_name = input_defs[3]->Name(); + const auto& sizes_name = inputs[3].node_arg.Name(); const auto& sizes_tensor = *initializers.at(sizes_name); std::vector unpacked_tensor; 
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(sizes_tensor, unpacked_tensor)); @@ -2498,27 +2442,25 @@ class FlattenOpBuilder : public BaseOpBuilder { }; Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - - auto input = node.InputDefs()[0]->Name(); + auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before reshape - ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); } // Flatten is basically a reshape to 2d tensor // Get the shape for Reshape here Shape input_shape; - GetShape(*node.InputDefs()[0], input_shape); + GetShape(node_unit.Inputs()[0].node_arg, input_shape); int32_t dim_1 = 1; int32_t dim_2 = 1; - GetFlattenOutputShape(node, input_shape, dim_1, dim_2); + GetFlattenOutputShape_nu(node_unit, input_shape, dim_1, dim_2); // If the input is of dynamic shape, replace 0 (dynamic) dimension with -1 // We cannot have dim_1 and dim_2 both be 0 here, it was checked in IsOpSupportedImpl dim_1 = dim_1 == 0 ? -1 : dim_1; dim_2 = dim_2 == 0 ? -1 : dim_2; std::vector shape{dim_1, dim_2}; - return ReshapeOpBuilder::AddReshapeOperator(model_builder, node, input, shape); + return ReshapeOpBuilder::AddReshapeOperator(model_builder, node_unit, input, shape); } #pragma endregion @@ -2528,12 +2470,12 @@ Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons class MinMaxOpBuilder : public BaseOpBuilder { public: static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); - static Status AddMinMaxOperator(ModelBuilder& model_builder, const Node& node, - const std::string& input1, const std::string& input2, - bool output_is_nhwc) ORT_MUST_USE_RESULT; private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; + static Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, + const std::string& input1, const std::string& input2, + bool output_is_nhwc) ORT_MUST_USE_RESULT; }; /* static */ void MinMaxOpBuilder::CreateSharedOpBuilder( @@ -2546,16 +2488,16 @@ class MinMaxOpBuilder : public BaseOpBuilder { }); } -/* static */ Status MinMaxOpBuilder::AddMinMaxOperator(ModelBuilder& model_builder, const Node& node, +/* static */ Status MinMaxOpBuilder::AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input1, const std::string& input2, bool output_is_nhwc) { auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); - const auto& op_type(node.OpType()); + const auto& op_type(node_unit.OpType()); int32_t op_code; if (op_type == "Min") op_code = ANEURALNETWORKS_MINIMUM; @@ -2577,17 +2519,14 @@ class MinMaxOpBuilder : public BaseOpBuilder { } Status MinMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - const auto input_defs(node.InputDefs()); - std::string input1 = input_defs[0]->Name(); - std::string input2 = input_defs[1]->Name(); + const auto& inputs = node_unit.Inputs(); + std::string input1 = 
inputs[0].node_arg.Name(); + std::string input2 = inputs[1].node_arg.Name(); bool output_is_nhwc = false; - ORT_RETURN_IF_ERROR(TransposeBinaryOpInputLayout(model_builder, node, - 0 /* input1_idx */, - 1 /* input2_idx */, - input1, input2, output_is_nhwc)); + ORT_RETURN_IF_ERROR(TransposeBinaryOpInputLayout_nu(model_builder, node_unit, + input1, input2, output_is_nhwc)); - return AddMinMaxOperator(model_builder, node, input1, input2, output_is_nhwc); + return AddMinMaxOperator(model_builder, node_unit, input1, input2, output_is_nhwc); } #pragma endregion @@ -2600,17 +2539,15 @@ class EluOpBuilder : public BaseOpBuilder { }; Status EluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = node_unit.Inputs()[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = model_builder.IsOperandNHWC(input); ORT_RETURN_IF_ERROR(shaper.Identity(input, output)); const OperandType output_operand_type(operand_types.at(input).type, shaper[output]); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto alpha = helper.Get("alpha", 1.0f); std::vector input_indices; input_indices.push_back(operand_indices.at(input)); @@ -2645,13 +2582,11 @@ void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const No } Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - const auto& node = node_unit.GetNode(); - auto& shaper(model_builder.GetShaper()); const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); - const auto input_defs = node.InputDefs(); - const auto& input_shape = shaper[input_defs[0]->Name()]; + const auto& inputs = node_unit.Inputs(); + const auto& input_shape = shaper[inputs[0].node_arg.Name()]; std::vector input_shape_64(input_shape.cbegin(), input_shape.cend()); SliceOp::PrepareForComputeMetadata compute_metadata(input_shape_64); @@ -2665,15 +2600,14 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const std::vector input_axes; std::vector input_steps; - const auto CopyInputData = [&node, &model_builder](size_t input_idx, std::vector& data) { + const auto CopyInputData = [&inputs, &model_builder](size_t input_idx, std::vector& data) { data.clear(); - const auto input_defs = node.InputDefs(); // This is an optional input, return empty vector - if (input_defs.size() <= input_idx) + if (inputs.size() <= input_idx) return Status::OK(); - const auto& input_name = input_defs[input_idx]->Name(); + const auto& input_name = inputs[input_idx].node_arg.Name(); const auto& initializers(model_builder.GetInitializerTensors()); const auto& tensor = *initializers.at(input_name); @@ -2715,8 +2649,8 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const std::back_inserter(nnapi_output_shape), [](int64_t i) { return SafeInt(i); }); - const auto& input = node.InputDefs()[0]->Name(); - const auto& output = node.OutputDefs()[0]->Name(); + const auto& input = inputs[0].node_arg.Name(); + const auto& output = node_unit.Outputs()[0].node_arg.Name(); bool output_is_nhwc = 
model_builder.IsOperandNHWC(input); // No shape inference for Slice, everything is calculated here, we only need to add the output shape @@ -2731,14 +2665,14 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Shape param_dimen = {static_cast(input_shape.size())}; // helper function to add begin/end/strides of ANEURALNETWORKS_STRIDED_SLICE - const auto AddOperand = [&model_builder, &node, &input_indices, &operand_indices]( + const auto AddOperand = [&model_builder, &node_unit, &input_indices, &operand_indices]( const char* name, const Shape& shape, const std::vector& param_raw_data) { std::vector param_data; param_data.reserve(param_raw_data.size()); std::transform(param_raw_data.cbegin(), param_raw_data.cend(), std::back_inserter(param_data), [](int64_t i) { return SafeInt(i); }); - std::string param_name = model_builder.GetUniqueName(node.Name() + name); + std::string param_name = model_builder.GetUniqueName(node_unit.Name() + name); OperandType param_operand_type(Type::TENSOR_INT32, shape); ORT_RETURN_IF_ERROR( model_builder.AddOperandFromPersistMemoryBuffer(param_name, param_data.data(), param_operand_type)); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 262440859843c..30854ca06cb52 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -431,7 +431,7 @@ bool ReshapeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init const int64_t* raw_perm = reinterpret_cast(unpacked_tensor.data()); const auto perm_size = SafeInt(perm_tensor.dims()[0]); - NodeAttrHelper helper(node_unit.GetNode()); + NodeAttrHelper helper(node_unit); const bool allow_zero = helper.Get("allowzero ", 0) == 1; for (uint32_t i = 0; i < perm_size; i++) { // NNAPI reshape does not support 0 as dimension @@ -466,16 +466,15 @@ class BatchNormalizationOpSupportChecker : public BaseOpSupportChecker { bool BatchNormalizationOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); - if (node.OutputDefs().size() != 1) { + if (node_unit.Outputs().size() != 1) { LOGS_DEFAULT(VERBOSE) << "Your onnx model may be in training mode, please export " "it in test mode."; return false; } - const auto& input_defs = node.InputDefs(); + const auto& inputs = node_unit.Inputs(); Shape input_shape; - if (!GetShape(*input_defs[0], input_shape)) + if (!GetShape(inputs[0].node_arg, input_shape)) return false; const auto input_size = input_shape.size(); @@ -485,17 +484,17 @@ bool BatchNormalizationOpSupportChecker::IsOpSupportedImpl(const InitializedTens return false; } - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto spatial = helper.Get("spatial", 1); if (spatial != 1) { LOGS_DEFAULT(VERBOSE) << "Non-spatial BN is not supported"; return false; } - const auto& scale_name = input_defs[1]->Name(); - const auto& b_name = input_defs[2]->Name(); - const auto& mean_name = input_defs[3]->Name(); - const auto& var_name = input_defs[4]->Name(); + const auto& scale_name = inputs[1].node_arg.Name(); + const auto& b_name = inputs[2].node_arg.Name(); + const auto& mean_name = inputs[3].node_arg.Name(); + const auto& var_name = inputs[4].node_arg.Name(); if (!Contains(initializers, scale_name)) { 
LOGS_DEFAULT(VERBOSE) << "Scale of BN must be known"; return false; @@ -552,25 +551,24 @@ class PoolOpSupportChecker : public BaseOpSupportChecker { bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - const auto& op_name = node.Name(); - const auto& op_type = node.OpType(); - const auto& input_defs = node.InputDefs(); + const auto& op_name = node_unit.Name(); + const auto& op_type = node_unit.OpType(); + const auto& inputs = node_unit.Inputs(); Shape input_shape; - if (!GetShape(*input_defs[0], input_shape)) + if (!GetShape(inputs[0].node_arg, input_shape)) return false; const auto input_size = input_shape.size(); if (input_size != 4) { LOGS_DEFAULT(VERBOSE) << op_type << " only supports rank-4 tensor, input [" - << input_defs[0]->Name() << "] has actual dim count " << input_size; + << inputs[0].node_arg.Name() << "] has actual dim count " << input_size; return false; } bool is_qlinear_average_pool = op_type == "QLinearAveragePool"; if (op_type == "AveragePool" || op_type == "MaxPool" || is_qlinear_average_pool) { - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto count_include_pad = helper.Get("count_include_pad", 0); if (count_include_pad == 1) { @@ -600,7 +598,7 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - if (node.OutputDefs().size() != 1) { + if (node_unit.Outputs().size() != 1) { LOGS_DEFAULT(VERBOSE) << "Argmax in maxpooling is not supported"; return false; } @@ -611,12 +609,6 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial // We need to check if we have valid scales and zero points for QLinearAveragePool if (is_qlinear_average_pool) { - if (input_defs.size() < 4) - return false; - - // the output zero point can be optional - bool has_output_zp = input_defs.size() == 5; - // Check input scales and ZPs if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) return false; @@ -632,19 +624,23 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial // NNAPI requires Quantized Average Pool has same scale and zero point for both input and output float input_scale = 0.0f; - auto status = GetQuantizationScale(initializers, node, 1, input_scale); + int32_t input_zp = 0; + auto status = GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Inputs()[0], node_unit.ModelPath(), input_scale, input_zp); if (!status.IsOK()) { LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationScale for input_scale failed, message: " + << "] GetQuantizationScaleAndZeroPoint for input_scale/zp failed, message: " << status.ErrorMessage(); return false; } float output_scale = 0.0f; - status = GetQuantizationScale(initializers, node, 3, output_scale); + int32_t output_zp = 0; + status = GetQuantizationScaleAndZeroPoint( + initializers, node_unit.Outputs()[0], node_unit.ModelPath(), output_scale, output_zp); if (!status.IsOK()) { LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationScale for output_scale failed, message: " + << "] GetQuantizationScaleAndZeroPoint for output_scale/zp failed, message: " << status.ErrorMessage(); return false; } @@ -656,26 +652,6 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - int32_t input_zp = 0; - int32_t output_zp = 0; - status 
= GetQuantizationZeroPoint(initializers, node, 2, input_zp); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationZeroPoint for input_zp failed, message: " - << status.ErrorMessage(); - return false; - } - - if (has_output_zp) { - status = GetQuantizationZeroPoint(initializers, node, 4, output_zp); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Op [" << op_type << "] name [" << op_name - << "] GetQuantizationZeroPoint for output_zp failed, message: " - << status.ErrorMessage(); - return false; - } - } - if (input_zp != output_zp) { LOGS_DEFAULT(VERBOSE) << "Op [" << op_type << "] name [" << op_name << "] has different input_zp: " << input_zp @@ -758,21 +734,19 @@ bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) con bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - const auto& op_type = node.OpType(); + const auto& op_type = node_unit.OpType(); const bool is_qlinear_conv = (op_type == "QLinearConv"); // We don't support nhwc com.microsoft.QLinearConv for now - if (is_qlinear_conv && node.Domain() == kMSDomain) { + if (is_qlinear_conv && node_unit.Domain() == kMSDomain) { LOGS_DEFAULT(VERBOSE) << "com.microsoft.QLinearConv is not supported"; return false; } - const auto input_defs = node.InputDefs(); - NodeAttrHelper helper(node); - size_t w_idx = is_qlinear_conv ? 3 : 1; + const auto& inputs = node_unit.Inputs(); + NodeAttrHelper helper(node_unit); const auto group = helper.Get("group", 1); - const auto weight_name = input_defs[w_idx]->Name(); + const auto weight_name = inputs[1].node_arg.Name(); if (Contains(initializers, weight_name)) { const auto& tensor = *initializers.at(weight_name); if (tensor.dims().size() != 4) { @@ -801,7 +775,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial if (is_qlinear_conv) { // For QLinearConv, we only support uint8 output now int32_t output_type; - if (!GetType(*node.OutputDefs()[0], output_type)) + if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) return false; if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { @@ -811,7 +785,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - if (input_defs.size() > 8 && !Contains(initializers, input_defs[8]->Name())) { + if (inputs.size() > 2 && !Contains(initializers, inputs[2].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "Bias of QLinearConv must be known"; return false; } @@ -852,8 +826,7 @@ class CastOpSupportChecker : public BaseOpSupportChecker { bool CastOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto to = helper.Get("to", 0); if (to != ONNX_NAMESPACE::TensorProto::FLOAT && to != ONNX_NAMESPACE::TensorProto::INT32) { @@ -881,9 +854,8 @@ class SoftMaxOpSupportChecker : public BaseOpSupportChecker { bool SoftMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) 
return false; const auto input_size = input_shape.size(); @@ -894,7 +866,7 @@ bool SoftMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* i } if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); int32_t axis = helper.Get("axis", 1); if (axis != 1) { LOGS_DEFAULT(VERBOSE) @@ -990,19 +962,13 @@ int GemmOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); - const auto& op_type = node.OpType(); - const auto input_defs(node.InputDefs()); - size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C + const auto& op_type = node_unit.OpType(); + const auto& inputs = node_unit.Inputs(); bool is_qlinear_matmul = op_type == "QLinearMatMul"; - if (is_qlinear_matmul) { - a_idx = 0; - b_idx = 3; - } Shape a_shape; { - if (!GetShape(*input_defs[a_idx], a_shape)) + if (!GetShape(inputs[0].node_arg, a_shape)) return false; if (a_shape.size() != 2) { @@ -1013,7 +979,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial Shape b_shape; { - if (!GetShape(*input_defs[b_idx], b_shape)) + if (!GetShape(inputs[1].node_arg, b_shape)) return false; if (b_shape.size() != 2) { @@ -1026,7 +992,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial // Only support // 1. A*B'+C // 2. A*B+C and B is an initializer - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto transA = helper.Get("transA", 0); const auto transB = helper.Get("transB", 0); const auto alpha = helper.Get("alpha", 1.0f); @@ -1042,14 +1008,14 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return false; } - if (transB == 0 && !Contains(initializers, input_defs[b_idx]->Name())) { + if (transB == 0 && !Contains(initializers, inputs[1].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "B of Gemm must be known if transB != 1"; return false; } - if (input_defs.size() == 3) { + if (inputs.size() == 3) { Shape c_shape; - if (!GetShape(*input_defs[c_idx], c_shape)) + if (!GetShape(inputs[2].node_arg, c_shape)) return false; uint32_t c_size; @@ -1067,7 +1033,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } } else if (op_type == "MatMul" || is_qlinear_matmul) { // Only support A*B B is an initializer - if (!Contains(initializers, input_defs[b_idx]->Name())) { + if (!Contains(initializers, inputs[1].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "B of MatMul must be known"; return false; } @@ -1075,7 +1041,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial if (is_qlinear_matmul) { // For QLinearMatMul, we only support uint8 output now int32_t output_type; - if (!GetType(*node.OutputDefs()[0], output_type)) + if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) return false; if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { @@ -1250,9 +1216,8 @@ class ConcatOpSupportChecker : public BaseOpSupportChecker { bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return 
false; const auto input_size = input_shape.size(); @@ -1420,9 +1385,8 @@ class LRNOpSupportChecker : public BaseOpSupportChecker { bool LRNOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return false; const auto input_size = input_shape.size(); @@ -1447,9 +1411,8 @@ class ClipOpSupportChecker : public BaseOpSupportChecker { bool ClipOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); float min, max; - if (!GetClipMinMax(initializers, node, min, max, logging::LoggingManager::DefaultLogger())) + if (!GetClipMinMax(initializers, node_unit.GetNode(), min, max, logging::LoggingManager::DefaultLogger())) return false; // We only supoort relu6 or relu1 @@ -1484,9 +1447,8 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker { bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return false; const auto input_size = input_shape.size(); @@ -1497,7 +1459,7 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi } { // check attributes - NodeAttrHelper helper(node); + NodeAttrHelper helper(node_unit); const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; @@ -1544,27 +1506,27 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi } { // scales and sizes (if present) must be initializers - const auto input_defs = node.InputDefs(); - if (input_defs.size() < 3) { + const auto inputs = node_unit.Inputs(); + if (inputs.size() < 3) { LOGS_DEFAULT(VERBOSE) << "Input scales or sizes of Resize must be known"; return false; } // scales - if (input_defs.size() == 3 && !Contains(initializers, input_defs[2]->Name())) { + if (inputs.size() == 3 && !Contains(initializers, inputs[2].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "Input scales of Resize must be known"; return false; } // sizes - if (input_defs.size() > 3 && !Contains(initializers, input_defs[3]->Name())) { + if (inputs.size() > 3 && !Contains(initializers, inputs[3].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "Input sizes of Resize must be known"; return false; } // We want to check if the scales or sizes are not trying to resize on N/C channels here - if (input_defs.size() == 3) { // we are using scales - const auto& scales_tensor = *initializers.at(input_defs[2]->Name()); + if (inputs.size() == 3) { // we are using scales + const auto& scales_tensor = *initializers.at(inputs[2].node_arg.Name()); std::vector unpacked_tensor; auto status = onnxruntime::utils::UnpackInitializerData(scales_tensor, unpacked_tensor); if (!status.IsOK()) { @@ -1582,7 +1544,7 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi } } else { // we are using sizes - const auto& sizes_name = input_defs[3]->Name(); + const auto& sizes_name = inputs[3].node_arg.Name(); const auto& sizes_tensor = 
*initializers.at(sizes_name); std::vector unpacked_tensor; auto status = onnxruntime::utils::UnpackInitializerData(sizes_tensor, unpacked_tensor); @@ -1647,9 +1609,8 @@ class FlattenOpSupportChecker : public BaseOpSupportChecker { bool FlattenOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return false; if (input_shape.size() > 4 || input_shape.empty()) { @@ -1660,7 +1621,7 @@ bool FlattenOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* i int32_t dim_1 = 1; int32_t dim_2 = 1; - GetFlattenOutputShape(node, input_shape, dim_1, dim_2); + GetFlattenOutputShape_nu(node_unit, input_shape, dim_1, dim_2); if (dim_1 == 0 && dim_2 == 0) { LOGS_DEFAULT(VERBOSE) << "The dynamical input shape " << Shape2String(input_shape) @@ -1705,11 +1666,10 @@ class MinMaxOpSupportChecker : public BaseOpSupportChecker { bool MinMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); // TODO support 2+ inputs for Min/Max op - if (node.InputDefs().size() != 2) { - LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] only supports 2 inputs, " - << "actual input number, " << node.InputDefs().size(); + if (node_unit.Inputs().size() != 2) { + LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] only supports 2 inputs, " + << "actual input number, " << node_unit.Inputs().size(); return false; } @@ -1751,9 +1711,8 @@ class SliceOpSupportChecker : public BaseOpSupportChecker { bool SliceOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { - const auto& node = node_unit.GetNode(); Shape input_shape; - if (!GetShape(*node.InputDefs()[0], input_shape)) + if (!GetShape(node_unit.Inputs()[0].node_arg, input_shape)) return false; if (input_shape.size() > 4) { @@ -1768,19 +1727,19 @@ bool SliceOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initia return false; } - if (!CheckIsInitializer(initializers, node, 1, "starts")) { + if (!CheckIsInitializer(initializers, node_unit, node_unit.Inputs()[1].node_arg.Name(), "starts")) { return false; } - if (!CheckIsInitializer(initializers, node, 2, "ends")) { + if (!CheckIsInitializer(initializers, node_unit, node_unit.Inputs()[2].node_arg.Name(), "ends")) { return false; } - const auto& input_defs = node.InputDefs(); - if (input_defs.size() > 3) { - if (!CheckIsInitializer(initializers, node, 3, "axes")) { + const auto& inputs = node_unit.Inputs(); + if (inputs.size() > 3) { + if (!CheckIsInitializer(initializers, node_unit, node_unit.Inputs()[3].node_arg.Name(), "axes")) { return false; } - if (input_defs.size() > 4) { - if (!CheckIsInitializer(initializers, node, 4, "steps")) { + if (inputs.size() > 4) { + if (!CheckIsInitializer(initializers, node_unit, node_unit.Inputs()[4].node_arg.Name(), "steps")) { return false; } } From edff0b4a2b00f31be6b66242554c1278b7bacc1b Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Mon, 10 Jan 2022 21:19:55 -0800 Subject: [PATCH 17/23] remove function postfix --- .../nnapi/nnapi_builtin/builders/helper.cc | 4 +- .../nnapi/nnapi_builtin/builders/helper.h | 4 +- 
.../nnapi_builtin/builders/model_builder.cc | 6 +- .../nnapi_builtin/builders/model_builder.h | 4 +- .../nnapi_builtin/builders/op_builder.cc | 90 +++++++++---------- .../builders/op_support_checker.cc | 2 +- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 3cf271ae317b8..b6bb93a420b8f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -68,7 +68,7 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) { return QLinearOpType::Unknown; } -ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& initializers) { +ConvType GetConvType(const NodeUnit& node_unit, const InitializedTensorSet& initializers) { NodeAttrHelper helper(node_unit); const auto group = helper.Get("group", 1); @@ -406,7 +406,7 @@ bool GetType(const NodeArg& node_arg, int32_t& type) { return true; } -void GetFlattenOutputShape_nu(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2) { +void GetFlattenOutputShape(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2) { int32_t rank = static_cast(input_shape.size()); NodeAttrHelper helper(node_unit); int32_t axis = helper.Get("axis", 1); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index b3e3ffe43f072..22e15130baae7 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -97,7 +97,7 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node); // Return the type of the conv ops, // This function assumes the input is a 2d conv node -ConvType GetConvType_nu(const NodeUnit& node_unit, const InitializedTensorSet& initializers); +ConvType GetConvType(const NodeUnit& node_unit, const InitializedTensorSet& initializers); // This qlinear op is an operator takes 2 inputs and produces 1 output // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... 
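// A minimal sketch of the decision the (renamed) GetConvType makes. The helper
// name and the exact checks below are assumptions for illustration only; the
// authoritative logic lives in helper.cc and is untouched by this rename.
//
//   ConvType GetConvTypeSketch(int64_t group, const Shape& weight_shape) {
//     // a 2d conv weight is laid out as {C_out, C_in / group, kH, kW}
//     if (group == 1)
//       return ConvType::Regular;    // ordinary convolution
//     if (weight_shape[1] == 1)
//       return ConvType::Depthwise;  // every group consumes a single channel
//     return ConvType::Grouped;      // general grouped convolution
//   }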
@@ -130,7 +130,7 @@ bool GetShape(const NodeArg& node_arg, Shape& shape); bool GetType(const NodeArg& node_arg, int32_t& type); // Get the output shape of Flatten Op -void GetFlattenOutputShape_nu(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2); +void GetFlattenOutputShape(const NodeUnit& node_unit, const Shape& input_shape, int32_t& dim_1, int32_t& dim_2); // If a node is supported by NNAPI bool IsNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph_viewer, const OpSupportCheckParams& params); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 7b5ac1aa6b0fb..bea1cc09ff5fd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -55,7 +55,7 @@ Status ModelBuilder::Prepare() { PreprocessNodeUnits(); GetAllQuantizedOpInputs(); PreprocessInitializers(); - PreprocessActivations_nu(); + PreprocessActivations(); ORT_RETURN_IF_ERROR(RegisterInitializers()); ORT_RETURN_IF_ERROR(RegisterModelInputs()); ORT_RETURN_IF_ERROR(AddOperations()); @@ -122,7 +122,7 @@ void ModelBuilder::PreprocessInitializers() { } } -void ModelBuilder::PreprocessActivations_nu() { +void ModelBuilder::PreprocessActivations() { for (const auto& node_unit : node_unit_holder_) { const auto& node = node_unit->GetNode(); const auto& op_type(node.OpType()); @@ -623,7 +623,7 @@ Status ModelBuilder::Compile(std::unique_ptr& model) { return Status::OK(); } -int32_t ModelBuilder::FindActivation_nu(const NodeUnit& node_unit, const NodeArg& output) { +int32_t ModelBuilder::FindActivation(const NodeUnit& node_unit, const NodeArg& output) { (void)node_unit; int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; if (node_unit.GetOutputNodes().size() != 1) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 45a05773b853c..81b016423b27d 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -48,7 +48,7 @@ class ModelBuilder { const std::vector& is_nhwc_vec) ORT_MUST_USE_RESULT; // Find if an output has a fuseable activation (Relu) - int32_t FindActivation_nu(const NodeUnit& node_unit, const NodeArg& output); + int32_t FindActivation(const NodeUnit& node_unit, const NodeArg& output); // Add an NNAPI scalar operand Status AddOperandFromScalar(bool value, uint32_t& index) ORT_MUST_USE_RESULT; @@ -184,7 +184,7 @@ class ModelBuilder { // If a NNAPI operation will use initializers directly, we will add the initializers to the skip list void PreprocessInitializers(); // Preprocess all the activation nodes (Relu/Relu1/Relu6) for easy query later - void PreprocessActivations_nu(); + void PreprocessActivations(); // Copy and process all the initializers to NNAPI model Status RegisterInitializers() ORT_MUST_USE_RESULT; Status RegisterModelInputs() ORT_MUST_USE_RESULT; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index da6e1e9841620..6d0db0ac6c002 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -128,7 +128,7 @@ Status 
TransposeNCHWToNHWC(ModelBuilder& model_builder, // Convert the input from nchw to nhwc // Caller should ensure input is currently in nchw format using ModelBuilder::IsOperandNHWC -Status GetNHWCInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nhwc_input) { +Status GetNHWCInput(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nhwc_input) { const auto& nchw_input = node_unit.Inputs()[input_index].node_arg.Name(); if (!model_builder.GetNHWCOperand(nchw_input, nhwc_input)) { nhwc_input = model_builder.GetUniqueName(nchw_input + "_nchw_to_nhwc"); @@ -139,7 +139,7 @@ Status GetNHWCInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, s // Convert the input from nhwc to nchw // Caller should ensure input is currently in nhwc format using ModelBuilder::IsOperandNHWC -Status GetNCHWInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nchw_input) { +Status GetNCHWInput(ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_index, std::string& nchw_input) { const auto& nhwc_input = node_unit.Inputs()[input_index].node_arg.Name(); if (!model_builder.GetNCHWOperand(nhwc_input, nchw_input)) { nchw_input = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw"); @@ -152,12 +152,12 @@ Status GetNCHWInput_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, s // and return the layout type of output tensor // If both inputs have same layout, the output will have the same layout // Otherwise we will need transpose the nhwc input back to nchw, and output will be nchw -Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, - std::string& input1, std::string& input2, - bool& output_is_nhwc) ORT_MUST_USE_RESULT; -Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUnit& node_unit, - std::string& input1, std::string& input2, - bool& output_is_nhwc) { +Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const NodeUnit& node_unit, + std::string& input1, std::string& input2, + bool& output_is_nhwc) ORT_MUST_USE_RESULT; +Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const NodeUnit& node_unit, + std::string& input1, std::string& input2, + bool& output_is_nhwc) { bool input1_is_nhwc = model_builder.IsOperandNHWC(input1); bool input2_is_nhwc = model_builder.IsOperandNHWC(input2); output_is_nhwc = false; @@ -166,10 +166,10 @@ Status TransposeBinaryOpInputLayout_nu(ModelBuilder& model_builder, const NodeUn output_is_nhwc = input1_is_nhwc; } else if (input1_is_nhwc) { // need transpose input1 back to nchw - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input1)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 0, input1)); } else { // input2_is_nhwc // need transpose input2 back to nchw - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 1, input2)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 1, input2)); } return Status::OK(); @@ -511,11 +511,11 @@ static Status HandleAutoPad(const Shape& input_shape, // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) // QLinearConv, QLinearMatmul, QLinearAdd // a, b are inputs, and y is output -static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( +static Status GetBinaryOpQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, float& a_scale, float& b_scale, float& y_scale, 
int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) ORT_MUST_USE_RESULT; -static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( +static Status GetBinaryOpQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, float& a_scale, float& b_scale, float& y_scale, int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) { @@ -538,12 +538,12 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint_nu( // If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor // will be convert to uint8 later, will return the same scale and 128 as zero point // Also will set is_per_tensor_u8s8 to true to be used later -static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const NodeUnit& node_unit, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; -static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const NodeUnit& node_unit, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, @@ -553,9 +553,9 @@ static Status GetConvMatMulOpQuantizationScaleAndZeroPoint_nu( // Get scale and zero points // We will handle per-channel weight scale and zero point later ORT_RETURN_IF_ERROR( - GetBinaryOpQuantizationScaleAndZeroPoint_nu(initializers, node_unit, - a_scale, w_scale, y_scale, - a_zero_point, w_zero_point, y_zero_point)); + GetBinaryOpQuantizationScaleAndZeroPoint(initializers, node_unit, + a_scale, w_scale, y_scale, + a_zero_point, w_zero_point, y_zero_point)); const auto& inputs = node_unit.Inputs(); const auto& weight_tensor = *initializers.at(inputs[1].node_arg.Name()); @@ -786,7 +786,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const bool output_is_nhwc = false; ORT_RETURN_IF_ERROR( - TransposeBinaryOpInputLayout_nu(model_builder, node_unit, input1, input2, output_is_nhwc)); + TransposeBinaryOpInputLayout(model_builder, node_unit, input1, input2, output_is_nhwc)); float a_scale = 0.0f, b_scale = 0.0f, @@ -796,7 +796,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const y_zero_point = 0; if (op_is_qlinear) { - ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint_nu( + ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint( model_builder.GetInitializerTensors(), node_unit, a_scale, b_scale, y_scale, a_zero_point, b_zero_point, y_zero_point)); @@ -810,7 +810,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; if (add_activation) { - fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); + fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg); } return AddBinaryOperator(op_code, model_builder, @@ -1032,7 +1032,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before reshape - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 0, input)); } 
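// The GetNCHWInput/GetNHWCInput helpers used here (and in the pool, conv, LRN,
// resize, softmax, squeeze, concat and flatten builders below) are cached
// layout conversions: they first ask the ModelBuilder for an already-converted
// operand (GetNCHWOperand / GetNHWCOperand) and only on a miss create a
// uniquely named operand, insert a Transpose, and record the mapping (via the
// Set*OperandMap calls declared in model_builder.h) so later uses of the same
// tensor reuse the converted operand instead of adding a second Transpose.
// A condensed restatement of that pattern, matching the helpers defined
// earlier in this file:
//
//   std::string nchw_input;
//   if (!model_builder.GetNCHWOperand(nhwc_input, nchw_input)) {
//     nchw_input = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw");
//     ORT_RETURN_IF_ERROR(TransposeNHWCToNCHW(model_builder, nhwc_input, nchw_input));
//   }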
const auto& shape_tensor = *initializers.at(node_unit.Inputs()[1].node_arg.Name()); @@ -1153,7 +1153,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu output_is_nhwc)); // Add - int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); + int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg); ORT_RETURN_IF_ERROR(AddBinaryOperator(ANEURALNETWORKS_ADD, model_builder, tensor_imm_product_name, tensor_b_name, @@ -1221,7 +1221,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node_unit, 0, input)); } } @@ -1266,7 +1266,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } } - int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); + int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg); // Get output scale and zero point if this is QLinearAveragePool // Otherwise we will use the scale and zero point of the input @@ -1393,13 +1393,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node_unit, 0, input)); } } const auto& weight = inputs[1].node_arg.Name(); const auto& weight_tensor = *initializers.at(weight); - auto conv_type = GetConvType_nu(node_unit, model_builder.GetInitializerTensors()); + auto conv_type = GetConvType(node_unit, model_builder.GetInitializerTensors()); bool conv_2d = (conv_type == ConvType::Regular), depthwise_conv_2d = (conv_type == ConvType::Depthwise), grouped_conv_2d = (conv_type == ConvType::Grouped); @@ -1415,10 +1415,10 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N optional> w_scales; bool is_per_tensor_u8s8 = false; if (is_qlinear_conv) { - ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, - x_scale, w_scale, y_scale, - x_zero_point, w_zero_point, y_zero_point, - w_scales, is_per_tensor_u8s8)); + ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node_unit, + x_scale, w_scale, y_scale, + x_zero_point, w_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } Shape onnx_weight_shape; @@ -1548,7 +1548,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } } - int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[1].node_arg); + int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[1].node_arg); ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code); if (model_builder.GetNNAPIFeatureLevel() > ANEURALNETWORKS_FEATURE_LEVEL_2) { @@ -1651,7 +1651,7 @@ Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons if (model_builder.IsOperandNHWC(input)) { output_is_nhwc = false; // We want to transpose nhwc operand back to nchw before softmax - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 0, input)); } } @@ -1785,10 +1785,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if 
(is_qlinear_matmul) { optional> w_scales; ORT_RETURN_IF_ERROR( - GetConvMatMulOpQuantizationScaleAndZeroPoint_nu(model_builder, node_unit, - a_scale, b_scale, y_scale, - a_zero_point, b_zero_point, y_zero_point, - w_scales, is_per_tensor_u8s8)); + GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node_unit, + a_scale, b_scale, y_scale, + a_zero_point, b_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } uint32_t input_2_idx; @@ -1859,7 +1859,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N input_indices.push_back(operand_indices.at(input1)); // A input_indices.push_back(input_2_idx); // B input_indices.push_back(bias_idx); // C - int32_t fuse_code = model_builder.FindActivation_nu(node_unit, node_unit.Outputs()[0].node_arg); + int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg); ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code); ORT_RETURN_IF_ERROR(shaper.FC(input1, input2, output)); @@ -2040,7 +2040,7 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const for (size_t i = 0; i < node_input_size; i++) { auto input = inputs[i].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, i, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, i, input)); } input_indices.push_back(operand_indices.at(input)); input_names.push_back(input); @@ -2117,7 +2117,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before squeeze - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 0, input)); } std::vector axes; @@ -2229,7 +2229,7 @@ Status LRNOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const No // on android api level 28, we need to transpose the nchw input to nhwc output_is_nhwc = true; if (!model_builder.IsOperandNHWC(input)) { - ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node_unit, 0, input)); } } @@ -2367,7 +2367,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } else { output_is_nhwc = true; if (!input_is_nhwc) { - ORT_RETURN_IF_ERROR(GetNHWCInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNHWCInput(model_builder, node_unit, 0, input)); } } @@ -2445,7 +2445,7 @@ Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons auto input = node_unit.Inputs()[0].node_arg.Name(); if (model_builder.IsOperandNHWC(input)) { // We want to transpose nhwc operand back to nchw before reshape - ORT_RETURN_IF_ERROR(GetNCHWInput_nu(model_builder, node_unit, 0, input)); + ORT_RETURN_IF_ERROR(GetNCHWInput(model_builder, node_unit, 0, input)); } // Flatten is basically a reshape to 2d tensor @@ -2454,7 +2454,7 @@ Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons GetShape(node_unit.Inputs()[0].node_arg, input_shape); int32_t dim_1 = 1; int32_t dim_2 = 1; - GetFlattenOutputShape_nu(node_unit, input_shape, dim_1, dim_2); + GetFlattenOutputShape(node_unit, input_shape, dim_1, dim_2); // If the input is of dynamic shape, replace 0 (dynamic) dimension with -1 // We cannot have dim_1 and dim_2 both be 0 here, it was checked in IsOpSupportedImpl 
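// How GetFlattenOutputShape arrives at dim_1/dim_2, as a condensed sketch (the
// actual helper lives in helper.cc): Flatten reshapes its input to 2d around
// the `axis` attribute, i.e.
//   dim_1 = input_shape[0] * ... * input_shape[axis - 1]
//   dim_2 = input_shape[axis] * ... * input_shape[rank - 1]
// so a dynamic (0) dimension drives the corresponding product to 0, which is
// why the lines right below replace a 0 result with -1 before the 2d Reshape
// is built.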
dim_1 = dim_1 == 0 ? -1 : dim_1; @@ -2523,8 +2523,8 @@ Status MinMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const std::string input1 = inputs[0].node_arg.Name(); std::string input2 = inputs[1].node_arg.Name(); bool output_is_nhwc = false; - ORT_RETURN_IF_ERROR(TransposeBinaryOpInputLayout_nu(model_builder, node_unit, - input1, input2, output_is_nhwc)); + ORT_RETURN_IF_ERROR(TransposeBinaryOpInputLayout(model_builder, node_unit, + input1, input2, output_is_nhwc)); return AddMinMaxOperator(model_builder, node_unit, input1, input2, output_is_nhwc); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 30854ca06cb52..015cba644047b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -1621,7 +1621,7 @@ bool FlattenOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* i int32_t dim_1 = 1; int32_t dim_2 = 1; - GetFlattenOutputShape_nu(node_unit, input_shape, dim_1, dim_2); + GetFlattenOutputShape(node_unit, input_shape, dim_1, dim_2); if (dim_1 == 0 && dim_2 == 0) { LOGS_DEFAULT(VERBOSE) << "The dynamical input shape " << Shape2String(input_shape) From a551aff180fb22347b0546912d4c5bdad735e097 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Tue, 11 Jan 2022 00:12:47 -0800 Subject: [PATCH 18/23] fix NNAPI CI failure --- .../nnapi/nnapi_builtin/builders/op_support_checker.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 015cba644047b..452eec62a80c7 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -1268,15 +1268,15 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init if (!GetShape(inputs[0].node_arg, input_shape)) return false; - const auto input_size = inputs.size(); - if (input_size > 4 || input_size == 0) { + const auto input_dim = input_shape.size(); + if (input_dim > 4 || input_dim == 0) { LOGS_DEFAULT(VERBOSE) << "Squeeze only supports 1-4d shape, input is " - << input_size << "d shape"; + << input_dim << "d shape"; return false; } // Squeeze opset 13 use input 1 as axes, if we have input 1 then it need to be an initializer - if (node_unit.SinceVersion() > 12 && input_size > 1) { + if (node_unit.SinceVersion() > 12 && inputs.size() > 1) { const auto& axes_name = inputs[1].node_arg.Name(); if (!Contains(initializers, axes_name)) { LOGS_DEFAULT(VERBOSE) << "Input axes of Squeeze must be known"; From 718e6b844dbb6c4f90f2e89c97b894563c62196b Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Wed, 12 Jan 2022 14:08:22 -0800 Subject: [PATCH 19/23] address CR comments --- .../nnapi/nnapi_builtin/builders/helper.cc | 5 +- .../nnapi/nnapi_builtin/builders/helper.h | 4 +- .../nnapi_builtin/builders/model_builder.cc | 4 +- .../nnapi_builtin/builders/model_builder.h | 3 - .../nnapi_builtin/builders/op_builder.cc | 57 ++++++++++--------- .../core/providers/shared/utils/utils.h | 4 +- 6 files changed, 37 insertions(+), 40 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 
b6bb93a420b8f..cab24bb93d51e 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -333,7 +333,6 @@ common::Status GetQuantizationScaleAndZeroPoint( const auto unpack_tensor = [&model_path](const InitializedTensorSet& initializers, const std::string& name, std::vector& unpacked_tensor) { - unpacked_tensor.clear(); const auto& tensor = *initializers.at(name); ORT_RETURN_IF_ERROR( onnxruntime::utils::UnpackInitializerData(tensor, model_path, unpacked_tensor)); @@ -535,9 +534,9 @@ std::string Shape2String(const std::vector& shape) { } bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::string& input_name, const char* input_string) { + const std::string& input_name, const char* input_description) { if (!Contains(initializers, input_name)) { - LOGS_DEFAULT(VERBOSE) << input_string << " of " << node_unit.Name() << "of type [" + LOGS_DEFAULT(VERBOSE) << input_description << " of " << node_unit.Name() << "of type [" << node_unit.OpType() << "] must be an initializer tensor"; return false; } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 22e15130baae7..88fadf2a135ed 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -152,9 +152,9 @@ std::string Shape2String(const std::vector& shape); // Check the given input is an initializer tensor // input_name is the name of the initializer -// input_string is the string describing the input in the output message (if nay) +// input_description is the string describing the input in the output message (if any) bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::string& input_name, const char* input_string) ORT_MUST_USE_RESULT; + const std::string& input_name, const char* input_description) ORT_MUST_USE_RESULT; } // namespace nnapi } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index bea1cc09ff5fd..b666125b86312 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -143,7 +143,8 @@ void ModelBuilder::PreprocessActivations() { } const NodeUnit& ModelBuilder::GetNodeUnit(const Node* node) const { - // Do we want to throw here if the node is not in the map? 
+ // In theory, if node_unit_map_ is generated correctly, see PreprocessNodeUnits(), a NodeUnit can be + // found for any single node in the graph_viewer_, unless the given node is not from graph_viewer_ return *node_unit_map_.at(node); } @@ -624,7 +625,6 @@ Status ModelBuilder::Compile(std::unique_ptr& model) { } int32_t ModelBuilder::FindActivation(const NodeUnit& node_unit, const NodeArg& output) { - (void)node_unit; int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE; if (node_unit.GetOutputNodes().size() != 1) return fuse_code; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 81b016423b27d..7b188d1677b78 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -139,9 +139,6 @@ class ModelBuilder { std::unordered_set skipped_initializers_; - // All activation nodes (Relu, Relu1, Relu6) as a map - std::unordered_map activation_nodes_; - // All activation nodes (Relu, Relu1, Relu6) as a map std::unordered_map activation_node_units_; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 6d0db0ac6c002..1faaffaee30bb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -651,13 +651,14 @@ static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, return Status::OK(); } -static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { +static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, + const NodeUnitIODef::QuantParam& quant_param) { // If we reach here, we assume the io_def has quant_param - model_builder.AddInitializerToSkip(io_def.quant_param->scale.Name()); // scale - LOGS_DEFAULT(VERBOSE) << io_def.quant_param->scale.Name() << "is skipped"; - if (io_def.quant_param->zero_point) { - model_builder.AddInitializerToSkip(io_def.quant_param->zero_point->Name()); // zero_point - LOGS_DEFAULT(VERBOSE) << io_def.quant_param->zero_point->Name() << "is skipped"; + model_builder.AddInitializerToSkip(quant_param.scale.Name()); // scale + LOGS_DEFAULT(VERBOSE) << quant_param.scale.Name() << "is skipped"; + if (quant_param.zero_point) { + model_builder.AddInitializerToSkip(quant_param.zero_point->Name()); // zero_point + LOGS_DEFAULT(VERBOSE) << quant_param.zero_point->Name() << "is skipped"; } } @@ -666,7 +667,7 @@ static void AddQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, static void AddInputToSkip(ModelBuilder& model_builder, const NodeUnitIODef& io_def) { model_builder.AddInitializerToSkip(io_def.node_arg.Name()); // main input if (io_def.quant_param) - AddQuantizationScaleAndZeroPointToSkip(model_builder, io_def); + AddQuantizationScaleAndZeroPointToSkip(model_builder, *io_def.quant_param); } template @@ -725,7 +726,7 @@ class BinaryOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: - static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder + static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder Status 
AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; @@ -739,9 +740,9 @@ void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N return; const auto& inputs = node_unit.Inputs(); - AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // a_scale, a_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[1]); // b_scale, b_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *inputs[0].quant_param); // a_scale, a_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *inputs[1].quant_param); // b_scale, b_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp } /* static */ void BinaryOpBuilder::CreateSharedOpBuilder( @@ -1174,7 +1175,7 @@ class PoolOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: - static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder + static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; @@ -1188,8 +1189,8 @@ void PoolOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod return; // skip input/output scales and zeropoints - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Inputs()[0].quant_param); // x_scale, x_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp } /* static */ void PoolOpBuilder::CreateSharedOpBuilder( @@ -1328,7 +1329,7 @@ class ConvOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: - static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder + static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; @@ -1352,9 +1353,9 @@ void ConvOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod const auto& inputs = node_unit.Inputs(); // skip the weight for conv as we need to transpose if (IsQuantizedOp(node_unit)) { - AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // x_scale, x_zp - AddInputToSkip(model_builder, inputs[1]); // w, w_scale, w_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *inputs[0].quant_param); // x_scale, x_zp + AddInputToSkip(model_builder, inputs[1]); // w, w_scale, w_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp if (inputs.size() > 2) AddInputToSkip(model_builder, inputs[2]); // B, B_scale, B_zp } else { @@ -1548,7 +1549,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& 
model_builder, const N } } - int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[1].node_arg); + int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg); ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code); if (model_builder.GetNNAPIFeatureLevel() > ANEURALNETWORKS_FEATURE_LEVEL_2) { @@ -1718,7 +1719,7 @@ class GemmOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: - static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder + static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; @@ -1741,9 +1742,9 @@ class GemmOpBuilder : public BaseOpBuilder { void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { const auto& inputs = node_unit.Inputs(); if (IsQuantizedOp(node_unit)) { - AddQuantizationScaleAndZeroPointToSkip(model_builder, inputs[0]); // b_scale, b_zp - AddInputToSkip(model_builder, inputs[1]); // b, b_scale, b_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *inputs[0].quant_param); // b_scale, b_zp + AddInputToSkip(model_builder, inputs[1]); // b, b_scale, b_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp } else { const auto& op = node_unit.OpType(); const auto& inputs = node_unit.Inputs(); @@ -1879,7 +1880,7 @@ class UnaryOpBuilder : public BaseOpBuilder { static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); private: - static bool IsQuantizedOp(const NodeUnit& node_unit); // TODO, see if we want to move this to BaseOpBuilder + static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT; }; @@ -1892,8 +1893,8 @@ void UnaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const No if (!IsQuantizedOp(node_unit)) return; - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Inputs()[0].quant_param); // x_scale, x_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp } /* static */ void UnaryOpBuilder::CreateSharedOpBuilder( @@ -2138,7 +2139,7 @@ class QuantizeLinearOpBuilder : public BaseOpBuilder { }; void QuantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Outputs()[0]); // y_scale, y_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param); // y_scale, y_zp } Status QuantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -2177,7 +2178,7 @@ class DequantizeLinearOpBuilder : public BaseOpBuilder { }; void 
DequantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { - AddQuantizationScaleAndZeroPointToSkip(model_builder, node_unit.Inputs()[0]); // x_scale, x_zp + AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Inputs()[0].quant_param); // x_scale, x_zp } Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const { diff --git a/onnxruntime/core/providers/shared/utils/utils.h b/onnxruntime/core/providers/shared/utils/utils.h index b6884f53d1c2e..26898aa95e893 100644 --- a/onnxruntime/core/providers/shared/utils/utils.h +++ b/onnxruntime/core/providers/shared/utils/utils.h @@ -35,10 +35,10 @@ bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logg */ class NodeAttrHelper { public: - NodeAttrHelper(const Node& node); + explicit NodeAttrHelper(const Node& node); // Get the attributes from the target node of the node_unit - NodeAttrHelper(const NodeUnit& node_unit); + explicit NodeAttrHelper(const NodeUnit& node_unit); float Get(const std::string& key, float def_val) const; From 8d0692b75e53c44de9812e501cd324c170d9dbe2 Mon Sep 17 00:00:00 2001 From: Guoyu Wang Date: Wed, 12 Jan 2022 16:30:48 -0800 Subject: [PATCH 20/23] remove redudant ORT_MUST_USE_RESULT, and clear includes --- .../nnapi/nnapi_builtin/builders/helper.cc | 14 +- .../nnapi/nnapi_builtin/builders/helper.h | 2 +- .../nnapi_builtin/builders/model_builder.cc | 15 +- .../nnapi_builtin/builders/model_builder.h | 62 +++--- .../nnapi_builtin/builders/op_builder.cc | 141 ++++--------- .../nnapi/nnapi_builtin/builders/op_builder.h | 6 +- .../builders/op_support_checker.cc | 10 +- .../nnapi/nnapi_builtin/builders/shaper.cc | 2 +- .../nnapi/nnapi_builtin/builders/shaper.h | 187 +++++++++--------- .../providers/nnapi/nnapi_builtin/model.cc | 4 +- .../providers/nnapi/nnapi_builtin/model.h | 15 +- .../nnapi_builtin/nnapi_execution_provider.cc | 8 - 12 files changed, 189 insertions(+), 277 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index cab24bb93d51e..a4097d6716026 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -5,16 +5,16 @@ #include #include -#include -#include -#include -#include -#include -#include +#include "helper.h" +#include "core/common/safeint.h" +#include "core/common/logging/logging.h" +#include "core/framework/tensorprotoutils.h" +#include "core/graph/graph.h" +#include "core/graph/graph_viewer.h" +#include "core/providers/common.h" #include "core/providers/shared/node_unit/node_unit.h" #include "core/providers/shared/utils/utils.h" -#include "helper.h" #include "op_support_checker.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 88fadf2a135ed..c3729fb1c8f10 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -154,7 +154,7 @@ std::string Shape2String(const std::vector& shape); // input_name is the name of the initializer // input_description is the string describing the input in the output message (if any) bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::string& input_name, const 
char* input_description) ORT_MUST_USE_RESULT; + const std::string& input_name, const char* input_description); } // namespace nnapi } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index b666125b86312..56be9aa755ff7 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -1,16 +1,19 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include -#include -#include +#include "model_builder.h" +#include "core/common/logging/logging.h" +#include "core/common/safeint.h" +#include "core/common/status.h" +#include "core/framework/tensorprotoutils.h" +#include "core/graph/graph_viewer.h" #include "core/providers/common.h" #include "core/providers/shared/node_unit/node_unit.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h" + #include "helper.h" -#include "model_builder.h" #include "op_builder.h" #include "op_support_checker.h" @@ -689,6 +692,10 @@ std::string ModelBuilder::GetUniqueName(const std::string& base_name) { return unique_name; } +const InitializedTensorSet& ModelBuilder::GetInitializerTensors() const { + return graph_viewer_.GetAllInitializedTensors(); +} + void ModelBuilder::RegisterNHWCOperand(const std::string& name) { nhwc_operands_.insert(name); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 7b188d1677b78..0fbeec7dc5d98 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -5,19 +5,22 @@ #include #include -#include +#include "core/graph/basic_types.h" #include "core/providers/nnapi/nnapi_builtin/model.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h" -#include "op_support_checker.h" #include "shaper.h" namespace onnxruntime { +class GraphViewer; class NodeUnit; +class Node; +class NodeArg; namespace nnapi { class IOpBuilder; +class IOpSupportChecker; class ModelBuilder { public: @@ -37,28 +40,28 @@ class ModelBuilder { ModelBuilder(const GraphViewer& graph_viewer); - Status Compile(std::unique_ptr& model) ORT_MUST_USE_RESULT; + common::Status Compile(std::unique_ptr& model); int32_t GetNNAPIFeatureLevel() const; // Add an NNAPI operation (operator) - Status AddOperation(int op, const std::vector& input_indices, - const std::vector& output_names, - const std::vector& types, - const std::vector& is_nhwc_vec) ORT_MUST_USE_RESULT; + common::Status AddOperation(int op, const std::vector& input_indices, + const std::vector& output_names, + const std::vector& types, + const std::vector& is_nhwc_vec); // Find if an output has a fuseable activation (Relu) int32_t FindActivation(const NodeUnit& node_unit, const NodeArg& output); // Add an NNAPI scalar operand - Status AddOperandFromScalar(bool value, uint32_t& index) ORT_MUST_USE_RESULT; - Status AddOperandFromScalar(float value, uint32_t& index) ORT_MUST_USE_RESULT; - Status AddOperandFromScalar(int32_t value, uint32_t& index) ORT_MUST_USE_RESULT; + common::Status AddOperandFromScalar(bool value, uint32_t& index); + common::Status AddOperandFromScalar(float value, uint32_t& index); + common::Status 
AddOperandFromScalar(int32_t value, uint32_t& index);
 
   // Add an NNAPI tensor operand (and allocate persist buffer)
-  Status AddOperandFromPersistMemoryBuffer(
+  common::Status AddOperandFromPersistMemoryBuffer(
       const std::string& name, const void* buffer,
-      const android::nn::wrapper::OperandType& operand_type) ORT_MUST_USE_RESULT;
+      const android::nn::wrapper::OperandType& operand_type);
 
   // The initializer will be processed separately, skip it as an initializer
   void AddInitializerToSkip(const std::string& tensor_name);
@@ -98,7 +101,7 @@ class ModelBuilder {
 
   const std::unordered_set& GetFusedActivations() const { return fused_activations_; }
 
-  const InitializedTensorSet& GetInitializerTensors() const { return graph_viewer_.GetAllInitializedTensors(); }
+  const InitializedTensorSet& GetInitializerTensors() const;
 
   const GraphViewer& GetGraphViewer() const { return graph_viewer_; }
 
@@ -112,10 +115,10 @@ class ModelBuilder {
   // Get the NodeUnit which contains the given node
   const NodeUnit& GetNodeUnit(const Node* node) const;
 
-  Status SetNHWCToNCHWOperandMap(const std::string& nhwc_name,
-                                 const std::string& nchw_name) ORT_MUST_USE_RESULT;
-  Status SetNCHWToNHWCOperandMap(const std::string& nchw_name,
-                                 const std::string& nhwc_name) ORT_MUST_USE_RESULT;
+  common::Status SetNHWCToNCHWOperandMap(const std::string& nhwc_name,
+                                         const std::string& nchw_name);
+  common::Status SetNCHWToNHWCOperandMap(const std::string& nchw_name,
+                                         const std::string& nhwc_name);
 
  private:
   const NnApi* nnapi_{nullptr};
@@ -174,19 +177,19 @@ class ModelBuilder {
   uint32_t next_index_ = 0;
 
   // Convert the onnx model to ANeuralNetworksModel
-  Status Prepare() ORT_MUST_USE_RESULT;
+  common::Status Prepare();
 
-  Status GetTargetDevices() ORT_MUST_USE_RESULT;
+  common::Status GetTargetDevices();
 
   // If a NNAPI operation will use initializers directly, we will add the initializers to the skip list
   void PreprocessInitializers();
   // Preprocess all the activation nodes (Relu/Relu1/Relu6) for easy query later
   void PreprocessActivations();
   // Copy and process all the initializers to NNAPI model
-  Status RegisterInitializers() ORT_MUST_USE_RESULT;
-  Status RegisterModelInputs() ORT_MUST_USE_RESULT;
-  Status AddOperations() ORT_MUST_USE_RESULT;
-  Status RegisterModelOutputs() ORT_MUST_USE_RESULT;
+  common::Status RegisterInitializers();
+  common::Status RegisterModelInputs();
+  common::Status AddOperations();
+  common::Status RegisterModelOutputs();
 
   // After constructing the NNAPI model, will set the shape inferencing record to the Model
   void RegisterModelShaper();
@@ -197,14 +200,13 @@ class ModelBuilder {
   // using the result of PreprocessNodeUnits, this need to run early in the Prepare()
   void PreprocessNodeUnits();
 
-  Status SetOperandValue(uint32_t index, Model::NNMemory* memory,
-                         size_t size, size_t offset) ORT_MUST_USE_RESULT;
+  common::Status SetOperandValue(uint32_t index, Model::NNMemory* memory, size_t size, size_t offset);
 
-  Status AddNewNNAPIOperand(const android::nn::wrapper::OperandType& type, uint32_t& index) ORT_MUST_USE_RESULT;
-  Status AddNewOperand(const std::string& name,
-                       const android::nn::wrapper::OperandType& operand_type,
-                       bool is_nhwc,
-                       uint32_t& index) ORT_MUST_USE_RESULT;
+  common::Status AddNewNNAPIOperand(const android::nn::wrapper::OperandType& type, uint32_t& index);
+  common::Status AddNewOperand(const std::string& name,
+                               const android::nn::wrapper::OperandType& operand_type,
+                               bool is_nhwc,
+                               uint32_t& index);
 
   static const IOpBuilder* GetOpBuilder(const NodeUnit& node_unit);
 };
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index 1faaffaee30bb..9689f72156eba 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -3,12 +3,13 @@
 
 #include "op_builder.h"
 
-#include
-#include
-#include
-#include
 #include
 
+#include "core/common/logging/logging.h"
+#include "core/common/safeint.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/graph/graph_viewer.h"
+#include "core/providers/common.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/providers/shared/node_unit/node_unit.h"
 #include "core/providers/cpu/tensor/slice_helper.h"
@@ -35,12 +36,6 @@ struct OpBuilderRegistrations {
     input_indices.push_back(_index);                      \
   }
 
-Status AddTransposeOperator(ModelBuilder& model_builder,
-                            const std::string& input,
-                            const std::string& perm_name,
-                            std::vector perm,
-                            const std::string& output,
-                            bool output_is_nhwc) ORT_MUST_USE_RESULT;
 Status AddTransposeOperator(ModelBuilder& model_builder,
                             const std::string& input,
                             const std::string& perm_name,
@@ -67,10 +62,6 @@ Status AddTransposeOperator(ModelBuilder& model_builder,
                                                  {output_operand_type}, {output_is_nhwc});
 }
 
-Status TransposeBetweenNCHWAndNHWC(ModelBuilder& model_builder,
-                                   const std::string& input,
-                                   const std::string& output,
-                                   bool nchw_to_nhwc) ORT_MUST_USE_RESULT;
 Status TransposeBetweenNCHWAndNHWC(ModelBuilder& model_builder,
                                    const std::string& input,
                                    const std::string& output,
@@ -108,18 +99,12 @@ Status TransposeBetweenNCHWAndNHWC(ModelBuilder& model_builder,
   return Status::OK();
 }
 
-Status TransposeNHWCToNCHW(ModelBuilder& model_builder,
-                           const std::string& input,
-                           const std::string& output) ORT_MUST_USE_RESULT;
 Status TransposeNHWCToNCHW(ModelBuilder& model_builder,
                            const std::string& input,
                            const std::string& output) {
   return TransposeBetweenNCHWAndNHWC(model_builder, input, output, false /* nchw_to_nhwc */);
 }
 
-Status TransposeNCHWToNHWC(ModelBuilder& model_builder,
-                           const std::string& input,
-                           const std::string& output) ORT_MUST_USE_RESULT;
 Status TransposeNCHWToNHWC(ModelBuilder& model_builder,
                            const std::string& input,
                            const std::string& output) {
@@ -152,9 +137,6 @@ Status GetNCHWInput(ModelBuilder& model_builder, const NodeUnit& node_unit, size
 // and return the layout type of output tensor
 // If both inputs have same layout, the output will have the same layout
 // Otherwise we will need transpose the nhwc input back to nchw, and output will be nchw
-Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const NodeUnit& node_unit,
-                                    std::string& input1, std::string& input2,
-                                    bool& output_is_nhwc) ORT_MUST_USE_RESULT;
 Status TransposeBinaryOpInputLayout(ModelBuilder& model_builder, const NodeUnit& node_unit,
                                     std::string& input1, std::string& input2,
                                     bool& output_is_nhwc) {
@@ -184,17 +166,7 @@ static Status AddBinaryOperator(int32_t op_type,
                                 const std::string& output,
                                 bool output_is_nhwc,
                                 float output_scale = 0.0f,
-                                int32_t output_zero_point = 0) ORT_MUST_USE_RESULT;
-static Status AddBinaryOperator(int32_t op_type,
-                                ModelBuilder& model_builder,
-                                const std::string& input1,
-                                const std::string& input2,
-                                bool add_activation,
-                                int32_t fuse_code,
-                                const std::string& output,
-                                bool output_is_nhwc,
-                                float output_scale,
-                                int32_t output_zero_point) {
+                                int32_t output_zero_point = 0) {
   auto& shaper(model_builder.GetShaper());
   const auto& operand_indices(model_builder.GetOperandIndices());
   const auto& operand_types(model_builder.GetOperandTypes());
@@ -215,10 +187,6 @@ static Status AddBinaryOperator(int32_t op_type,
   return Status::OK();
 }
 
-static Status AddSqueezeOp(ModelBuilder& model_builder,
-                           const std::string& node_name,
-                           const std::string& input, const std::string& output,
-                           std::vector axes) ORT_MUST_USE_RESULT;
 static Status AddSqueezeOp(ModelBuilder& model_builder,
                            const std::string& node_name,
                            const std::string& input, const std::string& output,
@@ -279,11 +247,6 @@ enum DataLayout {
 // since NNAPI requires X and W to be same type for per-tensor quantization,
 // the initializer tensor W will be converted from int8 to uint8 by flip each byte by XOR 0x80
 // byte ^ 0x80 == byte + 128
-static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
-                                        const std::string& name,
-                                        const OperandType& source_operand_type,
-                                        DataLayout new_layout,
-                                        bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
                                         const std::string& name,
                                         const OperandType& source_operand_type,
@@ -369,10 +332,6 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
 // and input B is signed int8), in this case, since NNAPI requires A and B to be same type,
 // the initializer tensor B will be converted from int8 to uint8 by flip each byte by XOR 0x80
 // byte ^ 0x80 == byte + 128
-static Status AddInitializerTransposed(ModelBuilder& model_builder,
-                                       const OperandType& source_operand_type,
-                                       const std::string& name,
-                                       bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerTransposed(ModelBuilder& model_builder,
                                        const OperandType& source_operand_type,
                                        const std::string& name,
@@ -421,12 +380,6 @@ static Status AddInitializerTransposed(ModelBuilder& model_builder,
   return model_builder.AddOperandFromPersistMemoryBuffer(name, &buffer[0], operand_type);
 }
 
-static Status ComputeConvPads(
-    const Shape& input_dimen,
-    const uint32_t weight_size_y, const uint32_t weight_size_x,
-    const std::vector& onnx_pads, const std::vector& onnx_strides, const std::vector& onnx_dilations,
-    AutoPadType auto_pad_type, bool nchw,
-    std::vector& pads_out) ORT_MUST_USE_RESULT;
 static Status ComputeConvPads(
     const Shape& input_dimen,
     const uint32_t weight_size_y, const uint32_t weight_size_x,
@@ -460,16 +413,6 @@ static Status ComputeConvPads(
   return Status::OK();
 }
 
-static Status HandleAutoPad(const Shape& input_shape,
-                            const uint32_t weight_size_y,
-                            const uint32_t weight_size_x,
-                            const std::vector& onnx_strides,
-                            const std::vector& onnx_dilations,
-                            AutoPadType auto_pad_type,
-                            bool use_nchw,
-                            std::vector& onnx_pads,
-                            int32_t& nnapi_padding_code,
-                            bool& use_auto_pad) ORT_MUST_USE_RESULT;
 static Status HandleAutoPad(const Shape& input_shape,
                             const uint32_t weight_size_y,
                             const uint32_t weight_size_x,
@@ -511,10 +454,6 @@ static Status HandleAutoPad(const Shape& input_shape,
 // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output)
 // QLinearConv, QLinearMatmul, QLinearAdd
 // a, b are inputs, and y is output
-static Status GetBinaryOpQuantizationScaleAndZeroPoint(
-    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-    float& a_scale, float& b_scale, float& y_scale,
-    int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) ORT_MUST_USE_RESULT;
 static Status GetBinaryOpQuantizationScaleAndZeroPoint(
     const InitializedTensorSet& initializers, const NodeUnit& node_unit,
     float& a_scale, float& b_scale, float& y_scale,
@@ -538,11 +477,6 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
 // If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor
 // will be convert to uint8 later, will return the same scale and 128 as zero point
 // Also will set is_per_tensor_u8s8 to true to be used later
-static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
-    const ModelBuilder& model_builder, const NodeUnit& node_unit,
-    float& a_scale, float& w_scale, float& y_scale,
-    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
-    optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
     const ModelBuilder& model_builder, const NodeUnit& node_unit,
     float& a_scale, float& w_scale, float& y_scale,
@@ -596,10 +530,6 @@ static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
 // NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType
 // ONNX has the quantization scale and zero point as the inputs of the qlinear operators
 // We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs
-static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
-                                        const std::string& input_name,
-                                        float scale,
-                                        int32_t zero_point) ORT_MUST_USE_RESULT;
 static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
                                         const std::string& input_name,
                                         float scale,
@@ -622,11 +552,6 @@ static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
   return Status::OK();
 }
 
-static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
-                                             const std::string& input_name,
-                                             float scale,
-                                             int32_t zero_point,
-                                             const optional>& scales) ORT_MUST_USE_RESULT;
 static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
                                              const std::string& input_name,
                                              float scale,
@@ -692,10 +617,10 @@ class BaseOpBuilder : public IOpBuilder {
  public:
   virtual ~BaseOpBuilder() = default;
   virtual void AddInitializersToSkip(ModelBuilder& /* model_builder */, const NodeUnit& /* node_unit */) const override {}
-  Status AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const override final ORT_MUST_USE_RESULT;
+  Status AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const override final;
 
  protected:
-  virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT = 0;
+  virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const = 0;
   static bool IsOpSupported(const ModelBuilder& model_builder, const NodeUnit& node_unit) ORT_MUST_USE_RESULT;
 };
 
@@ -727,7 +652,7 @@ class BinaryOpBuilder : public BaseOpBuilder {
  private:
   static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 /* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
@@ -826,7 +751,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 
 class ReluOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status ReluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -860,7 +785,7 @@ Status ReluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
 
 class TransposeOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -906,10 +831,10 @@ class ReshapeOpBuilder : public BaseOpBuilder {
  public:
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
   static Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit,
-                                   const std::string& input, const std::vector& shape) ORT_MUST_USE_RESULT;
+                                   const std::string& input, const std::vector& shape);
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
   static bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit,
                              size_t input_rank, size_t output_rank);
 };
@@ -1062,7 +987,7 @@ class BatchNormalizationOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void BatchNormalizationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -1176,7 +1101,7 @@ class PoolOpBuilder : public BaseOpBuilder {
  private:
   static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 /* static */ bool PoolOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
@@ -1330,7 +1255,7 @@ class ConvOpBuilder : public BaseOpBuilder {
  private:
   static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 /* static */ bool ConvOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
@@ -1595,7 +1520,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
 
 class CastOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -1635,7 +1560,7 @@ Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
 
 class SoftMaxOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -1686,7 +1611,7 @@ Status SoftMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons
 
 class IdentityOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status IdentityOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -1720,7 +1645,7 @@ class GemmOpBuilder : public BaseOpBuilder {
  private:
   static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 /* static */ bool GemmOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
@@ -1881,7 +1806,7 @@ class UnaryOpBuilder : public BaseOpBuilder {
  private:
   static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 /* static */ bool UnaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
@@ -1982,7 +1907,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 
 class ConcatOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2078,7 +2003,7 @@ class SqueezeOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
   static Status GetAxes(ModelBuilder& model_builder, const NodeUnit& node_unit, std::vector& axes);
 };
 
@@ -2135,7 +2060,7 @@ class QuantizeLinearOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void QuantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2174,7 +2099,7 @@ class DequantizeLinearOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void DequantizeLinearOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2213,7 +2138,7 @@ Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil
 
 class LRNOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status LRNOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2274,7 +2199,7 @@ class ClipOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void ClipOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2333,7 +2258,7 @@ class ResizeOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2439,7 +2364,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 
 class FlattenOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2473,10 +2398,10 @@ class MinMaxOpBuilder : public BaseOpBuilder {
   static void CreateSharedOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
   static Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit,
                                   const std::string& input1, const std::string& input2,
-                                  bool output_is_nhwc) ORT_MUST_USE_RESULT;
+                                  bool output_is_nhwc);
 };
 
 /* static */ void MinMaxOpBuilder::CreateSharedOpBuilder(
@@ -2536,7 +2461,7 @@ Status MinMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 
 class EluOpBuilder : public BaseOpBuilder {
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 Status EluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -2566,7 +2491,7 @@ class SliceOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 
  private:
-  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override ORT_MUST_USE_RESULT;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
 };
 
 void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h
index f25a6591040bc..6483c432f1442 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h
@@ -7,7 +7,6 @@
 #include
 
 #include "core/graph/basic_types.h"
-#include "core/session/onnxruntime_c_api.h"
 
 namespace onnxruntime {
 
@@ -31,7 +30,7 @@ class IOpBuilder {
   virtual void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const = 0;
 
   // Add the operator to NNAPI model
-  virtual common::Status AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT = 0;
+  virtual common::Status AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const = 0;
 };
 
 // Get the lookup table with IOpBuilder delegates for different onnx operators
@@ -40,8 +39,7 @@ class IOpBuilder {
 const std::unordered_map& GetOpBuilders();
 
 // Transpose the NHWC input to NCHW output
-common::Status TransposeNHWCToNCHW(ModelBuilder& model_builder, const std::string& input, const std::string& output)
-    ORT_MUST_USE_RESULT;
+common::Status TransposeNHWCToNCHW(ModelBuilder& model_builder, const std::string& input, const std::string& output);
 
 }  // namespace nnapi
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
index 452eec62a80c7..bc5a326bfe0c5 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@@ -1,16 +1,16 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include
-#include
-#include
-#include
+#include "op_support_checker.h"
+#include "core/common/logging/logging.h"
+#include "core/common/safeint.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/graph/graph.h"
 #include "core/providers/common.h"
 #include "core/providers/shared/node_unit/node_unit.h"
 #include "core/providers/shared/utils/utils.h"
 
 #include "helper.h"
-#include "op_support_checker.h"
 
 namespace onnxruntime {
 namespace nnapi {
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc
index 2bcc167622b3e..1652237622e71 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc
@@ -3,8 +3,8 @@
 
 #include "core/providers/common.h"
 
-#include "helper.h"
 #include "shaper.h"
+#include "helper.h"
 
 namespace onnxruntime {
 namespace nnapi {
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h
index b9299454dce44..8656328804f46 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h
@@ -6,7 +6,8 @@
 #include
 #include
 #include
-#include
+
+#include "core/common/status.h"
 
 namespace onnxruntime {
 namespace nnapi {
@@ -20,115 +21,103 @@ class Shaper {
     return shape_map_.at(key);
   }
 
-  Status Conv(const std::string& input_name,
-              const std::string& weight_name,
-              const std::vector& onnx_pads,
-              const std::vector& onnx_strides,
-              const std::vector& onnx_dilations,
-              bool nchw,
-              const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status DepthwiseConv(const std::string& input_name,
-                       const std::string& weight_name,
-                       const std::vector& onnx_pads,
-                       const std::vector& onnx_strides,
-                       const std::vector& onnx_dilations,
-                       bool nchw,
-                       const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status Pool(const std::string& input_name,
-              const std::vector& onnx_pads,
-              const std::vector& onnx_strides,
-              const std::vector& kernel_shape,
-              bool nchw,
-              const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status Reshape(const std::string& input_name, const std::vector& shape, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status Transpose(const std::string& input_name, const std::vector& perm, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status Eltwise(const std::string& input1_name, const std::string& input2_name, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status Identity(const std::string& input_name, const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status FC(const std::string& input1_name, const std::string& input2_name, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status Concat(const std::vector& input_names, const int32_t axis, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status Squeeze(const std::string& input_name, const std::vector& axes, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-
-  Status ResizeUsingScales(const std::string& input_name,
-                           const float scale_h, const float scale_w,
-                           bool nchw,
-                           const std::string& output_name) ORT_MUST_USE_RESULT;
-  Status ResizeUsingOutputSizes(const std::string& input_name,
-                                const uint32_t output_h, const uint32_t output_w,
-                                bool nchw,
-                                const std::string& output_name) ORT_MUST_USE_RESULT;
+  common::Status Conv(const std::string& input_name,
+                      const std::string& weight_name,
+                      const std::vector& onnx_pads,
+                      const std::vector& onnx_strides,
+                      const std::vector& onnx_dilations,
+                      bool nchw,
+                      const std::string& output_name);
+
+  common::Status DepthwiseConv(const std::string& input_name,
+                               const std::string& weight_name,
+                               const std::vector& onnx_pads,
+                               const std::vector& onnx_strides,
+                               const std::vector& onnx_dilations,
+                               bool nchw,
+                               const std::string& output_name);
+
+  common::Status Pool(const std::string& input_name,
+                      const std::vector& onnx_pads,
+                      const std::vector& onnx_strides,
+                      const std::vector& kernel_shape,
+                      bool nchw,
+                      const std::string& output_name);
+
+  common::Status Reshape(const std::string& input_name, const std::vector& shape, const std::string& output_name);
+
+  common::Status Transpose(const std::string& input_name, const std::vector& perm, const std::string& output_name);
+
+  common::Status Eltwise(const std::string& input1_name, const std::string& input2_name, const std::string& output_name);
+
+  common::Status Identity(const std::string& input_name, const std::string& output_name);
+
+  common::Status FC(const std::string& input1_name, const std::string& input2_name, const std::string& output_name);
+
+  common::Status Concat(const std::vector& input_names, const int32_t axis, const std::string& output_name);
+
+  common::Status Squeeze(const std::string& input_name, const std::vector& axes, const std::string& output_name);
+
+  common::Status ResizeUsingScales(const std::string& input_name,
+                                   const float scale_h, const float scale_w,
+                                   bool nchw,
+                                   const std::string& output_name);
+  common::Status ResizeUsingOutputSizes(const std::string& input_name,
+                                        const uint32_t output_h, const uint32_t output_w,
+                                        bool nchw,
+                                        const std::string& output_name);
 
   // If the shape of certain input is dynamic
   // Use the following 2 functions to update the particular shape
   // and calculate the new output shape
   // Only perform this when the NNAPI model is finalized!
-  Status UpdateShape(const std::string& name, const Shape& new_shape) ORT_MUST_USE_RESULT;
-  Status UpdateDynamicDimensions() ORT_MUST_USE_RESULT;
+  common::Status UpdateShape(const std::string& name, const Shape& new_shape);
+  common::Status UpdateDynamicDimensions();
 
   void Clear();
 
  private:
-  Status ConvImpl(const std::string& input_name,
-                  const std::string& weight_name,
-                  const std::vector& onnx_pads,
-                  const std::vector& onnx_strides,
-                  const std::vector& onnx_dilations,
-                  bool nchw,
-                  const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status DepthwiseConvImpl(const std::string& input_name,
-                           const std::string& weight_name,
-                           const std::vector& onnx_pads,
-                           const std::vector& onnx_strides,
-                           const std::vector& onnx_dilations,
-                           bool nchw,
-                           const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status PoolImpl(const std::string& input_name,
-                  const std::vector& onnx_pads,
-                  const std::vector& onnx_strides,
-                  const std::vector& kernel_shape,
-                  bool nchw,
-                  const std::string& output_name) ORT_MUST_USE_RESULT;
-
-  Status ReshapeImpl(const std::string& input_name, const std::vector& shape, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status TransposeImpl(const std::string& input_name, const std::vector& perm, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status EltwiseImpl(const std::string& input1_name, const std::string& input2_name, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status IdentityImpl(const std::string& input_name, const std::string& output_name) ORT_MUST_USE_RESULT;
-  Status FCImpl(const std::string& input1_name, const std::string& input2_name, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status ConcatImpl(const std::vector& input_names, const int32_t axis, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status SqueezeImpl(const std::string& input_names, const std::vector& axes, const std::string& output_name)
-      ORT_MUST_USE_RESULT;
-  Status ResizeUsingScalesImpl(const std::string& input_name,
-                               const float scale_h, const float scale_w,
-                               bool nchw,
-                               const std::string& output_name) ORT_MUST_USE_RESULT;
-  Status ResizeUsingOutputSizesImpl(const std::string& input_name,
-                                    const uint32_t output_h, const uint32_t output_w,
-                                    bool nchw,
-                                    const std::string& output_name) ORT_MUST_USE_RESULT;
+  common::Status ConvImpl(const std::string& input_name,
+                          const std::string& weight_name,
+                          const std::vector& onnx_pads,
+                          const std::vector& onnx_strides,
+                          const std::vector& onnx_dilations,
+                          bool nchw,
+                          const std::string& output_name);
+
+  common::Status DepthwiseConvImpl(const std::string& input_name,
+                                   const std::string& weight_name,
+                                   const std::vector& onnx_pads,
+                                   const std::vector& onnx_strides,
+                                   const std::vector& onnx_dilations,
+                                   bool nchw,
+                                   const std::string& output_name);
+
+  common::Status PoolImpl(const std::string& input_name,
+                          const std::vector& onnx_pads,
+                          const std::vector& onnx_strides,
+                          const std::vector& kernel_shape,
+                          bool nchw,
+                          const std::string& output_name);
+
+  common::Status ReshapeImpl(const std::string& input_name, const std::vector& shape, const std::string& output_name);
+  common::Status TransposeImpl(const std::string& input_name, const std::vector& perm, const std::string& output_name);
+  common::Status EltwiseImpl(const std::string& input1_name, const std::string& input2_name, const std::string& output_name);
+  common::Status IdentityImpl(const std::string& input_name, const std::string& output_name);
+  common::Status FCImpl(const std::string& input1_name, const std::string& input2_name, const std::string& output_name);
+  common::Status ConcatImpl(const std::vector& input_names, const int32_t axis, const std::string& output_name);
+  common::Status SqueezeImpl(const std::string& input_names, const std::vector& axes, const std::string& output_name);
+  common::Status ResizeUsingScalesImpl(const std::string& input_name,
+                                       const float scale_h, const float scale_w,
+                                       bool nchw,
+                                       const std::string& output_name);
+  common::Status ResizeUsingOutputSizesImpl(const std::string& input_name,
+                                            const uint32_t output_h, const uint32_t output_w,
+                                            bool nchw,
+                                            const std::string& output_name);
 
   std::unordered_map shape_map_;
-  std::vector> shape_ops_;
+  std::vector> shape_ops_;
 };
 
 }  // namespace nnapi
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.cc
index 7a2036252ae8f..887384e6bd3bc 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.cc
@@ -1,9 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include
-
 #include "model.h"
+
+#include "core/common/logging/logging.h"
 #include "core/providers/common.h"
 #include "core/providers/nnapi/nnapi_builtin/builders/helper.h"
 #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
index 8ce72538affc4..6326e60cf9797 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
@@ -103,7 +103,7 @@ class Model {
   // this output may need special handling
   bool IsScalarOutput(const std::string& output_name) const;
 
-  Status PrepareForExecution(std::unique_ptr& execution) ORT_MUST_USE_RESULT;
+  common::Status PrepareForExecution(std::unique_ptr& execution);
 
  private:
   const NnApi* nnapi_{nullptr};
@@ -143,7 +143,7 @@ class Model {
 
   void AddScalarOutput(const std::string& output_name);
 
-  void SetShaper(const Shaper shaper) { shaper_ = shaper; }
+  void SetShaper(const Shaper& shaper) { shaper_ = shaper; }
 
   int32_t GetNNAPIFeatureLevel() const;
 };
@@ -172,17 +172,16 @@ class Execution {
 
   // Set the input/output data buffers
   // These need to be called before calling Predict()
-  Status SetInputBuffers(const std::vector& inputs) ORT_MUST_USE_RESULT;
-  Status SetOutputBuffers(const std::vector& outputs) ORT_MUST_USE_RESULT;
+  common::Status SetInputBuffers(const std::vector& inputs);
+  common::Status SetOutputBuffers(const std::vector& outputs);
 
   // Execute the NNAPI model
   // if there is dynamic output shape, will output the actual output shapes
-  Status Predict(const std::vector& dynamic_outputs, std::vector& dynamic_output_shapes)
-      ORT_MUST_USE_RESULT;
+  common::Status Predict(const std::vector& dynamic_outputs, std::vector& dynamic_output_shapes);
 
  private:
-  Status SetInputBuffer(const int32_t index, const InputBuffer& input) ORT_MUST_USE_RESULT;
-  Status SetOutputBuffer(const int32_t index, const OutputBuffer& output) ORT_MUST_USE_RESULT;
+  common::Status SetInputBuffer(const int32_t index, const InputBuffer& input);
+  common::Status SetOutputBuffer(const int32_t index, const OutputBuffer& output);
 
   const NnApi* nnapi_{nullptr};
   ANeuralNetworksExecution* execution_;
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
index 150db8f593e11..85a0cf3ad13fc 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
@@ -187,14 +187,6 @@ NnapiExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view
 }
 
 #ifdef __ANDROID__
-static Status GetOutputBuffer(Ort::CustomOpApi& ort,
-                              OrtKernelContext* context,
-                              const nnapi::Model& model,
-                              const std::string& output_name,
-                              const std::vector& output_shape,
-                              const android::nn::wrapper::Type output_type,
-                              void** output_buffer) ORT_MUST_USE_RESULT;
-
 static Status GetOutputBuffer(Ort::CustomOpApi& ort,
                               OrtKernelContext* context,
                               const nnapi::Model& model,
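A minimal illustration of the per-tensor u8s8 weight handling described above in AddInitializerInNewLayout / AddInitializerTransposed ("byte ^ 0x80 == byte + 128"): flipping the top bit of each int8 weight byte reinterprets it as uint8 with the zero point shifted by 128 while the scale stays unchanged. This is an editorial sketch for clarity only, not part of the patch.

#include <cassert>
#include <cstdint>

int main() {
  const int8_t w = -3;                                     // a quantized int8 weight value
  const uint8_t flipped = static_cast<uint8_t>(w) ^ 0x80;  // flip the sign bit: int8 -> uint8
  assert(flipped == static_cast<int>(w) + 128);            // -3 maps to 125, zero point moves by +128
  return 0;
}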
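The sweep above drops the per-declaration ORT_MUST_USE_RESULT annotations in favor of plain common::Status return types. That remains warning-safe only if the status type itself carries a nodiscard attribute, which this patch assumes but does not show. The stand-in class below (not onnxruntime's real common::Status) sketches how marking the type once makes every function returning it warn when a caller ignores the result.

#include <string>

class [[nodiscard]] Status {  // stand-in for a nodiscard-annotated status type
 public:
  static Status OK() { return Status{}; }
  bool IsOK() const { return message_.empty(); }

 private:
  std::string message_;
};

Status DoWork() { return Status::OK(); }

int main() {
  DoWork();             // compiler warning: discarded return value of nodiscard type
  Status s = DoWork();  // fine: the result is consumed
  return s.IsOK() ? 0 : 1;
}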
From e075eaac338cf10b073be13d9c6c39847bfacee7 Mon Sep 17 00:00:00 2001
From: Guoyu Wang
Date: Wed, 12 Jan 2022 17:02:12 -0800
Subject: [PATCH 21/23] Simplify FindActivation

---
 .../nnapi_builtin/builders/model_builder.cc  | 20 ++++++++++++++++---
 .../nnapi_builtin/builders/model_builder.h   |  5 +++--
 .../nnapi_builtin/builders/op_builder.cc     | 10 +++++-----
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
index 56be9aa755ff7..12beec0f9419c 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@@ -627,12 +627,26 @@ Status ModelBuilder::Compile(std::unique_ptr& model) {
   return Status::OK();
 }
 
-int32_t ModelBuilder::FindActivation(const NodeUnit& node_unit, const NodeArg& output) {
+int32_t ModelBuilder::FindActivation(const NodeUnit& node_unit) {
   int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE;
-  if (node_unit.GetOutputNodes().size() != 1)
+  const auto& output_nodes = node_unit.GetOutputNodes();
+  if (node_unit.GetOutputNodes().size() != 1) {
+    LOGS_DEFAULT(VERBOSE) << "FindActivation does not support, NodeUnit [" << node_unit.Name()
+                          << "] type [" << node_unit.OpType()
+                          << "], with " << output_nodes.size() << " output nodes";
     return fuse_code;
+  }
+
+  const auto& outputs = node_unit.Outputs();
+  if (outputs.size() != 1) {
+    LOGS_DEFAULT(VERBOSE) << "FindActivation does not support, NodeUnit [" << node_unit.Name()
+                          << "] type [" << node_unit.OpType()
+                          << "], with " << outputs.size() << " outputs";
+    return fuse_code;
+  }
 
-  const auto& output_node = *node_unit.GetOutputNodes()[0];
+  const NodeArg& output = outputs[0].node_arg;
+  const auto& output_node = *output_nodes[0];
 
   // TODO, add support of activation fusion for quantized node group (qdq or qlinear)
   // We do not support activation fusion for quantized operators for now
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
index 0fbeec7dc5d98..2269c986f60ea 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
@@ -50,8 +50,9 @@ class ModelBuilder {
                                   const std::vector& types,
                                   const std::vector& is_nhwc_vec);
 
-  // Find if an output has a fuseable activation (Relu)
-  int32_t FindActivation(const NodeUnit& node_unit, const NodeArg& output);
+  // Find if the given node_unit has a fuseable activation (Relu/Relu1/Relu6)
+  // For now we only support node_unit with a single output
+  int32_t FindActivation(const NodeUnit& node_unit);
 
   // Add an NNAPI scalar operand
   common::Status AddOperandFromScalar(bool value, uint32_t& index);
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index 9689f72156eba..acab410be1462 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -736,7 +736,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 
   int32_t fuse_code = ANEURALNETWORKS_FUSED_NONE;
   if (add_activation) {
-    fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg);
+    fuse_code = model_builder.FindActivation(node_unit);
   }
 
   return AddBinaryOperator(op_code, model_builder,
@@ -1079,7 +1079,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
                                         output_is_nhwc));
 
   // Add
-  int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg);
+  int32_t fuse_code = model_builder.FindActivation(node_unit);
   ORT_RETURN_IF_ERROR(AddBinaryOperator(ANEURALNETWORKS_ADD,
                                         model_builder,
                                         tensor_imm_product_name, tensor_b_name,
@@ -1192,7 +1192,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
     }
   }
 
-  int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg);
+  int32_t fuse_code = model_builder.FindActivation(node_unit);
 
   // Get output scale and zero point if this is QLinearAveragePool
   // Otherwise we will use the scale and zero point of the input
@@ -1474,7 +1474,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
     }
   }
 
-  int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg);
+  int32_t fuse_code = model_builder.FindActivation(node_unit);
   ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code);
 
   if (model_builder.GetNNAPIFeatureLevel() > ANEURALNETWORKS_FEATURE_LEVEL_2) {
@@ -1785,7 +1785,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
   input_indices.push_back(operand_indices.at(input1));  // A
   input_indices.push_back(input_2_idx);                 // B
   input_indices.push_back(bias_idx);                    // C
-  int32_t fuse_code = model_builder.FindActivation(node_unit, node_unit.Outputs()[0].node_arg);
+  int32_t fuse_code = model_builder.FindActivation(node_unit);
   ADD_SCALAR_OPERAND(model_builder, input_indices, fuse_code);
 
   ORT_RETURN_IF_ERROR(shaper.FC(input1, input2, output));
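For reference, the fuse code that FindActivation hands back to the op builders corresponds to NNAPI's FuseCode values (ANEURALNETWORKS_FUSED_NONE/RELU/RELU1/RELU6). The helper below is a hypothetical, simplified mapping from the downstream activation's op type to those codes, with the constants inlined so the snippet is self-contained; the real lookup presumably works off the activation node list collected by PreprocessActivations, and "Relu1"/"Relu6" are shorthand for Clip nodes with [-1, 1]/[0, 6] bounds.

#include <cstdint>
#include <string>

// Values mirror NNAPI's FuseCode enum (NeuralNetworks.h).
constexpr int32_t kFusedNone = 0;   // ANEURALNETWORKS_FUSED_NONE
constexpr int32_t kFusedRelu = 1;   // ANEURALNETWORKS_FUSED_RELU
constexpr int32_t kFusedRelu1 = 2;  // ANEURALNETWORKS_FUSED_RELU1
constexpr int32_t kFusedRelu6 = 3;  // ANEURALNETWORKS_FUSED_RELU6

// Hypothetical helper: map the single downstream activation's op type to a fuse code.
int32_t FuseCodeFromActivation(const std::string& op_type) {
  if (op_type == "Relu") return kFusedRelu;
  if (op_type == "Relu1") return kFusedRelu1;
  if (op_type == "Relu6") return kFusedRelu6;
  return kFusedNone;
}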
From 613035cce12aa65ce007a036d583ebc753890fdd Mon Sep 17 00:00:00 2001
From: Guoyu Wang
Date: Fri, 14 Jan 2022 16:24:56 -0800
Subject: [PATCH 22/23] address CR comments

---
 .../nnapi/nnapi_builtin/builders/helper.cc               | 6 +++---
 .../nnapi/nnapi_builtin/builders/model_builder.cc        | 6 ++----
 .../nnapi/nnapi_builtin/builders/op_builder.cc           | 5 ++---
 .../nnapi/nnapi_builtin/builders/op_support_checker.cc   | 4 ++--
 onnxruntime/core/providers/shared/node_unit/node_unit.h  | 2 +-
 5 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
index a4097d6716026..af93017649e78 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -153,9 +153,9 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
   const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
   for (const auto idx : indices) {
     if (idx >= io_defs.size()) {
-      LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationScales, "
-                            << (is_input ? "Input" : "Output") << " index,  " << idx
-                            << " >= size, " << io_defs.size();
+      LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index,  " << idx
+                            << " >= size, " << io_defs.size()
+                            << " of NodeUnit: " << node_unit.Name();
       return false;
     }
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
index 12beec0f9419c..fe6eade431770 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@@ -173,16 +173,14 @@ void ModelBuilder::GetAllQuantizedOpInputs() {
     auto qlinear_op_type = GetQLinearOpType(node_unit->GetNode());
 
     // Not a qlinear op
+    // TODO, add handling for QDQ NodeUnit
     if (qlinear_op_type == QLinearOpType::Unknown)
       continue;
 
     const auto add_quantized_input =
         [&all_quantized_op_inputs = all_quantized_op_inputs_](const NodeUnit& node_unit, size_t input_idx) {
           const auto& input_name = node_unit.Inputs()[input_idx].node_arg.Name();
-          if (Contains(all_quantized_op_inputs, input_name))
-            all_quantized_op_inputs.at(input_name).push_back(&node_unit);
-          else
-            all_quantized_op_inputs.emplace(input_name, std::vector{&node_unit});
+          all_quantized_op_inputs[input_name].push_back(&node_unit);
         };
 
     // All qlinear ops EXCEPT QuantizeLinear has quantized input
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index acab410be1462..bff260cb6741c 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -791,7 +791,7 @@ class TransposeOpBuilder : public BaseOpBuilder {
 Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
   auto& shaper(model_builder.GetShaper());
 
-  auto input = node_unit.Inputs()[0].node_arg.Name();
+  const auto& input = node_unit.Inputs()[0].node_arg.Name();
   const auto& output = node_unit.Outputs()[0].node_arg.Name();
   NodeAttrHelper helper(node_unit);
   std::vector perm = helper.Get("perm", std::vector());
@@ -1672,7 +1672,6 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
     AddQuantizationScaleAndZeroPointToSkip(model_builder, *node_unit.Outputs()[0].quant_param);  // y_scale, y_zp
   } else {
     const auto& op = node_unit.OpType();
-    const auto& inputs = node_unit.Inputs();
     if (op == "MatMul") {
       model_builder.AddInitializerToSkip(inputs[1].node_arg.Name());
     } else if (op == "Gemm") {
@@ -1956,7 +1955,7 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   output_is_nhwc = model_builder.IsOperandNHWC(input0);
 
   for (size_t i = 0; i < node_input_size; i++) {
-    auto input = inputs[i].node_arg.Name();
+    const auto& input = inputs[i].node_arg.Name();
     input_indices.push_back(operand_indices.at(input));
     input_names.push_back(input);
   }
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
index bc5a326bfe0c5..9bd25abc62a77 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@@ -46,7 +46,7 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node
       return true;
 
     if (!input.quant_param)
-      return false;
+      continue;
 
     if (is_ext_initializer(input.quant_param->scale))
       return true;
@@ -314,7 +314,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
   if (op_is_qlinear) {
     // For QLinearAdd, we only support uint8 output now
     int32_t output_type;
-    if (!GetType(inputs[0].node_arg, output_type))
+    if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
       return false;
 
     if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/providers/shared/node_unit/node_unit.h
index 73fcca032a013..bea1e476a72da 100644
--- a/onnxruntime/core/providers/shared/node_unit/node_unit.h
+++ b/onnxruntime/core/providers/shared/node_unit/node_unit.h
@@ -25,7 +25,7 @@ struct NodeGroup;
 // If the optional quant_param is present, then this is a quantized input,
 // otherwise this is a regular input
 struct NodeUnitIODef {
-  // The quantization parmeter, scale is manadatory, and zero_point is optional
+  // The quantization parameter, scale is mandatory, and zero_point is optional
   struct QuantParam {
     const NodeArg& scale;
    const NodeArg* zero_point{nullptr};
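One of the review comments addressed above replaces the Contains()/emplace() branch in GetAllQuantizedOpInputs with the single statement all_quantized_op_inputs[input_name].push_back(&node_unit). The snippet below (a self-contained illustration, not the provider code) shows why that is equivalent: std::unordered_map::operator[] value-initializes the mapped vector the first time a key is seen, so the push_back works for both new and existing keys.

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::vector<int>> quantized_inputs;
  quantized_inputs["scale_0"].push_back(1);  // key absent: an empty vector is created first
  quantized_inputs["scale_0"].push_back(2);  // key present: appends to the existing vector
  assert(quantized_inputs.at("scale_0").size() == 2);
  return 0;
}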
From e0191e59d542f1d379da4ed2a3cb095745fad5fc Mon Sep 17 00:00:00 2001
From: Guoyu Wang
Date: Fri, 14 Jan 2022 17:44:18 -0800
Subject: [PATCH 23/23] address CR comments

---
 .../builders/op_support_checker.cc            |  6 ++---
 .../providers/shared/node_unit/node_unit.cc   | 27 +++++++++----------
 .../providers/shared/node_unit/node_unit.h    |  6 ++---
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
index 9bd25abc62a77..75eab4c837d00 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@@ -1268,10 +1268,10 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init
   if (!GetShape(inputs[0].node_arg, input_shape))
     return false;
 
-  const auto input_dim = input_shape.size();
-  if (input_dim > 4 || input_dim == 0) {
+  const auto input_rank = input_shape.size();
+  if (input_rank > 4 || input_rank == 0) {
     LOGS_DEFAULT(VERBOSE) << "Squeeze only supports 1-4d shape, input is "
-                          << input_dim << "d shape";
+                          << input_rank << "d shape";
     return false;
   }
 
diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.cc b/onnxruntime/core/providers/shared/node_unit/node_unit.cc
index 3c5a829a4d497..d443fe858f36b 100644
--- a/onnxruntime/core/providers/shared/node_unit/node_unit.cc
+++ b/onnxruntime/core/providers/shared/node_unit/node_unit.cc
@@ -82,27 +82,24 @@ bool IsVariadicQLinearOp(QLinearOpType type) {
 }  // namespace
 
 NodeUnit::NodeUnit(const Node& node)
-    : nodes_{&node},
-      output_nodes_{&node},
-      node_(node),
+    : output_nodes_{&node},
+      target_node_(node),
       type_(Type::SingleNode) {
   InitForNode();
 }
 
-const std::string& NodeUnit::Domain() const noexcept { return node_.Domain(); }
-const std::string& NodeUnit::OpType() const noexcept { return node_.OpType(); }
-const std::string& NodeUnit::Name() const noexcept { return node_.Name(); }
-int NodeUnit::SinceVersion() const noexcept { return node_.SinceVersion(); }
-NodeIndex NodeUnit::Index() const noexcept { return node_.Index(); }
-const Path& NodeUnit::ModelPath() const noexcept { return node_.ModelPath(); }
-ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return node_.GetExecutionProviderType(); }
+const std::string& NodeUnit::Domain() const noexcept { return target_node_.Domain(); }
+const std::string& NodeUnit::OpType() const noexcept { return target_node_.OpType(); }
+const std::string& NodeUnit::Name() const noexcept { return target_node_.Name(); }
+int NodeUnit::SinceVersion() const noexcept { return target_node_.SinceVersion(); }
+NodeIndex NodeUnit::Index() const noexcept { return target_node_.Index(); }
+const Path& NodeUnit::ModelPath() const noexcept { return target_node_.ModelPath(); }
+ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target_node_.GetExecutionProviderType(); }
 
 void NodeUnit::InitForNode() {
-  const auto& input_defs = node_.InputDefs();
-  const auto& output_defs = node_.OutputDefs();
-  // The 1st step is to hookup the NodeUnit with the NNAPI builder interface
-  // So we are not handling quantization here now
-  auto qlinear_type = GetQLinearOpType(node_);
+  const auto& input_defs = target_node_.InputDefs();
+  const auto& output_defs = target_node_.OutputDefs();
+  auto qlinear_type = GetQLinearOpType(target_node_);
   if (qlinear_type == QLinearOpType::Unknown ||
       IsVariadicQLinearOp(qlinear_type)) {  // TODO, add variadic support
     // Not a Qlinear op, add all inputs / outputs
diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/providers/shared/node_unit/node_unit.h
index bea1e476a72da..e109703c9316f 100644
--- a/onnxruntime/core/providers/shared/node_unit/node_unit.h
+++ b/onnxruntime/core/providers/shared/node_unit/node_unit.h
@@ -63,17 +63,15 @@ class NodeUnit {
   const Path& ModelPath() const noexcept;
   ProviderType GetExecutionProviderType() const noexcept;
 
-  const Node& GetNode() const noexcept { return node_; }
+  const Node& GetNode() const noexcept { return target_node_; }
   const std::vector GetOutputNodes() const noexcept { return output_nodes_; }
-  const std::vector GetAllNodes() const noexcept { return nodes_; }
 
  private:
   std::vector inputs_;
   std::vector outputs_;
 
-  const std::vector nodes_;         // all nodes in this NodeUnit
   const std::vector output_nodes_;  // all the nodes producing outputs for this NodeUnit
-  const Node& node_;                // target Node
+  const Node& target_node_;
 
   Type type_;
 
   void InitForNode();  // Initializing for single Node