diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index 5b958c686d007..f492a4cbbcb4b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -61,6 +61,8 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) { return QuantizedOpType::QLinearMatMul; else if (op_type == "QLinearAdd") return QuantizedOpType::QLinearAdd; + else if (op_type == "QLinearMul") + return QuantizedOpType::QLinearMul; else if (op_type == "QLinearSigmoid") return QuantizedOpType::QLinearSigmoid; else if (op_type == "QLinearAveragePool") @@ -72,6 +74,10 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) { return QuantizedOpType::QDQResize; else if (op_type == "AveragePool") return QuantizedOpType::QDQAveragePool; + else if (op_type == "Add") + return QuantizedOpType::QDQAdd; + else if (op_type == "Mul") + return QuantizedOpType::QDQMul; } else { // throw? // Do we want to throw here? seems got neglected last time @@ -114,25 +120,13 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type) { bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) { return quant_op_type == QuantizedOpType::QLinearMatMul || quant_op_type == QuantizedOpType::QLinearAdd || + quant_op_type == QuantizedOpType::QLinearMul || + quant_op_type == QuantizedOpType::QDQAdd || + quant_op_type == QuantizedOpType::QDQMul || IsQuantizedConv(quant_op_type); } -bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) { - int32_t input_type; - if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) - return false; - - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() - << "] Input type: [" << input_type - << "] is not supported for now"; - return false; - } - - return true; -} - -bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { +bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) { auto quant_op_type = GetQuantizedOpType(node_unit); int32_t a_input_type, b_input_type; if (!IsQuantizedBinaryOp(quant_op_type)) { @@ -146,16 +140,17 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { if (!GetType(inputs[1].node_arg, b_input_type)) return false; - // QlinearConv supports u8u8 or u8s8 - // QLinearMatMul/Add only support u8u8 - bool is_quant_conv = IsQuantizedConv(quant_op_type); + // QlinearConv/MatMul supports u8u8 or u8s8 + // QLinearAdd/QLinearMul only support u8u8 + bool is_quant_conv_or_matmul = IsQuantizedConv(quant_op_type) || (quant_op_type == QuantizedOpType::QLinearMatMul); + bool has_valid_qlinear_conv_weight = (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 || b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8); if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || - (!is_quant_conv && a_input_type != b_input_type) || - (is_quant_conv && !has_valid_qlinear_conv_weight)) { + (!is_quant_conv_or_matmul && a_input_type != b_input_type) || + (is_quant_conv_or_matmul && !has_valid_qlinear_conv_weight)) { LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] A Input type: [" << a_input_type << "] B Input type: [" << b_input_type @@ -166,182 +161,6 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) { return true; } -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, const OpSupportCheckParams& 
params, bool is_input) { - const auto& op_type = node_unit.OpType(); - auto quant_op_type = GetQuantizedOpType(node_unit); - bool is_quant_conv = IsQuantizedConv(quant_op_type); - bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); - for (const auto idx : indices) { - if (idx >= io_defs.size()) { - LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index, " << idx - << " >= size, " << io_defs.size() - << " of NodeUnit: " << node_unit.Name(); - return false; - } - - const auto& io_def = io_defs[idx]; - if (!io_def.quant_param.has_value()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " has no quant_param"; - return false; - } - - const auto scale_name = io_def.quant_param->scale.Name(); - - if (!Contains(initializers, scale_name)) { - LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; - return false; - } - - // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) - bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; - bool is_conv_matmul_u8s8_weight = false; - - if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - const auto& scale_tensor = *initializers.at(scale_name); - int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; - if (!is_conv_matmul_u8s8_weight) { - if (scales_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } else if (scales_dim != 1) { - // For u8s8 Qlinear[Conv/MatMul], we support - // 1. Per-tensor, the weight will be transformed to uint8 later - // 2. Per-channel, only from Android API level 29 - if (is_quant_matmul) { - LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; - return false; - } - - if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { - LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " - << "system NNAPI feature level: " << params.android_feature_level; - return false; - } - - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - if (weight_tensor.dims()[0] != scales_dim) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " scale dimension " << scales_dim; - return false; - } - } - } - - return true; -} - -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, bool is_input) { - const auto& op_type = node_unit.OpType(); - auto quant_op_type = GetQuantizedOpType(node_unit); - bool is_quant_conv = IsQuantizedConv(quant_op_type); - bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); - - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); - for (const auto idx : indices) { - if (idx >= io_defs.size()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, " - << (is_input ? 
"Input" : "Output") << " index, " << idx - << " >= size, " << io_defs.size(); - return false; - } - - const auto& io_def = io_defs[idx]; - if (!io_def.quant_param.has_value()) { - LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx - << " has no quant_param"; - return false; - } - - // zero point is optional here - if (!io_def.quant_param->zero_point) - return true; - - const auto& zero_point_name = io_def.quant_param->zero_point->Name(); - if (!Contains(initializers, zero_point_name)) { - LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; - return false; - } - - bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; - bool is_conv_matmul_u8s8_weight = false; - - if (is_conv_matmul_weight) { - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - const auto& zero_tensor = *initializers.at(zero_point_name); - int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; - - if (!is_conv_matmul_u8s8_weight) { - if (zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } else { - // For u8s8 Qlinear[Conv/MatMul], we support - // 1. Per-tensor, the weight will be transformed to uint8 later - // 2. Per-channel, only from Android API level 29 - if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " - << "actual zero point type: [" << zero_tensor.data_type() << "]"; - return false; - } - - if (zero_dim != 1) { - if (is_quant_matmul) { - LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; - return false; - } - } - - // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, - // or a tensor with same channel as weight, for NNAPI we only support it be - // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel - // quantization is 0 there is no input for it - const auto& weight_tensor = *initializers.at(io_def.node_arg.Name()); - if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " zero point dimension " << zero_dim; - return false; - } - - std::vector unpacked_tensor; - auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name - << ", error msg: " << status.ErrorMessage(); - return false; - } - - // Verify all onnx weight zero point(s) are 0(s) - const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); - for (size_t i = 0; i < unpacked_tensor.size(); i++) { - if (zero_points[i] != 0) { - LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " - << "zero_points[" << i << "] has value: " << zero_points[i]; - return false; - } - } - } - } - - return true; -} - common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, float& scale, int32_t& zero_point) { @@ -387,8 +206,8 @@ 
common::Status GetQuantizationScaleAndZeroPoint( common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, - float& scale, int32_t& zero_point, bool is_input) { - const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); + float& scale, int32_t& zero_point, IOKind io_kind) { + const auto& io_defs = io_kind == IOKind::Input ? node_unit.Inputs() : node_unit.Outputs(); for (const auto& io_def : io_defs) { if (io_def.node_arg.Name() == name) return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.ModelPath(), diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 2d99bce246184..73ea329d0b31b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -82,12 +82,14 @@ enum class QuantizedOpType : uint8_t { QLinearAdd, QLinearSigmoid, QLinearAveragePool, + QLinearMul, // Not yet supported - // QLinearMul, // QLinearReduceMean, QDQConv, QDQResize, QDQAveragePool, + QDQAdd, + QDQMul, // TODO, add other QDQ NodeUnit types }; @@ -97,6 +99,11 @@ enum class ConvType : uint8_t { Grouped, }; +enum class IOKind : uint8_t { + Input, + Output, +}; + QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit); // Return the type of the conv ops, @@ -113,18 +120,8 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type); // Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,... bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type); -// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool] -bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit); // Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add] -bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit); - -// Check if a qlinear op has valid scales for given indices -bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, const OpSupportCheckParams& params, bool is_input); - -// Check if a qlinear op has valid zero points for given indices -bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const std::vector& indices, bool is_input); +bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit); common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path, @@ -132,7 +129,7 @@ common::Status GetQuantizationScaleAndZeroPoint( common::Status GetQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name, - float& scale, int32_t& zero_point, bool is_input = true); + float& scale, int32_t& zero_point, IOKind io_kind = IOKind::Input); // Get Shape/Type of a NodeArg // TODO, move to shared_utils diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 0037e1a1d487d..3f95048335358 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -210,7 +210,7 @@ static Status GetInputDataType( // TODO, verify the scale and zero point match if there are multiple op using same input const auto* node_unit = 
all_quantized_op_inputs.at(name)[0]; ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint( - initializers, *node_unit, name, scale, zero_point, true /* is_input */)); + initializers, *node_unit, name, scale, zero_point, IOKind::Input)); break; } // case ONNX_NAMESPACE::TensorProto_DataType_INT8: diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 8a61e2a105127..47e8fe2eef749 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -452,7 +452,7 @@ static Status HandleAutoPad(const Shape& input_shape, } // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output) -// QLinearConv, QLinearMatmul, QLinearAdd +// QLinearConv, QLinearMatmul, QLinearAdd, QLinearMul // a, b are inputs, and y is output static Status GetBinaryOpQuantizationScaleAndZeroPoint( const InitializedTensorSet& initializers, const NodeUnit& node_unit, @@ -656,8 +656,11 @@ class BinaryOpBuilder : public BaseOpBuilder { }; /* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) { - // TODO, add support for QDQ NodeUnit - return node_unit.OpType() == "QLinearAdd"; + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QLinearAdd || + quant_type == QuantizedOpType::QLinearMul || + quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; } void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const { @@ -680,6 +683,7 @@ void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N "Mul", "Div", "QLinearAdd", + "QLinearMul", "Pow", }); } @@ -690,12 +694,12 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int32_t op_code; bool add_activation = true; - bool op_is_qlinear = op_type == "QLinearAdd"; - if (op_type == "Add" || op_is_qlinear) { + bool is_quant_op = IsQuantizedOp(node_unit); + if (op_type == "Add" || op_type == "QLinearAdd") { // Add/QLinearAdd/QDQAdd op_code = ANEURALNETWORKS_ADD; } else if (op_type == "Sub") { op_code = ANEURALNETWORKS_SUB; - } else if (op_type == "Mul") { + } else if (op_type == "Mul" || op_type == "QLinearMul") { // Mul/QLinearMul/QDQMul op_code = ANEURALNETWORKS_MUL; } else if (op_type == "Div") { op_code = ANEURALNETWORKS_DIV; @@ -721,7 +725,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const b_zero_point = 0, y_zero_point = 0; - if (op_is_qlinear) { + if (is_quant_op) { ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint( model_builder.GetInitializerTensors(), node_unit, a_scale, b_scale, y_scale, @@ -729,7 +733,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } // Verify if the scale and zero point matchs from onnx input and nnapi input match - if (op_is_qlinear) { + if (is_quant_op) { ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point)); ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point)); } @@ -2717,6 +2721,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder); NNAPI_EP_ADD_SHARED_OP_BUILDER("Pow", BinaryOpBuilder); NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder); + NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearMul", BinaryOpBuilder); 
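Reviewer note on the BinaryOpBuilder hunks above: QLinearAdd/QLinearMul and their QDQ counterparts are now classified once and share the same lowering path, with only the NNAPI op code differing. The standalone sketch below is illustrative only -- the enum, Classify and ToNnapiOpCode names are local stand-ins for the real GetQuantizedOpType/IsQuantizedOp logic, and the returned integers are placeholders for ANEURALNETWORKS_ADD / ANEURALNETWORKS_MUL, not their actual values.

    // Illustrative sketch of the classification/dispatch shape the patch adds.
    #include <cstdint>
    #include <cstdio>
    #include <string>

    enum class QuantizedBinaryKind : uint8_t { None, QLinearAdd, QLinearMul, QDQAdd, QDQMul };

    // In ORT, is_qdq_group would come from NodeUnit::UnitType() == Type::QDQGroup.
    QuantizedBinaryKind Classify(const std::string& op_type, bool is_qdq_group) {
      if (!is_qdq_group) {
        if (op_type == "QLinearAdd") return QuantizedBinaryKind::QLinearAdd;
        if (op_type == "QLinearMul") return QuantizedBinaryKind::QLinearMul;
      } else {
        if (op_type == "Add") return QuantizedBinaryKind::QDQAdd;
        if (op_type == "Mul") return QuantizedBinaryKind::QDQMul;
      }
      return QuantizedBinaryKind::None;
    }

    // Both Add variants map to one NNAPI op code and both Mul variants to another;
    // the real code returns ANEURALNETWORKS_ADD / ANEURALNETWORKS_MUL here.
    int32_t ToNnapiOpCode(QuantizedBinaryKind kind) {
      switch (kind) {
        case QuantizedBinaryKind::QLinearAdd:
        case QuantizedBinaryKind::QDQAdd: return 0;   // placeholder for ANEURALNETWORKS_ADD
        case QuantizedBinaryKind::QLinearMul:
        case QuantizedBinaryKind::QDQMul: return 1;   // placeholder for ANEURALNETWORKS_MUL
        default: return -1;                           // not a supported quantized binary op
      }
    }

    int main() {
      std::printf("%d\n", ToNnapiOpCode(Classify("Mul", /*is_qdq_group*/ true)));  // QDQMul path
      return 0;
    }
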
NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 950fb302f1b53..01b51fed399f6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -22,7 +22,21 @@ struct OpSupportCheckerRegistrations { std::unordered_map op_support_checker_map; }; -bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) { +template +void CreateSharedOpSupportCheckerImpl(const std::string& op_type, + OpSupportCheckerRegistrations& op_registrations, + const std::vector& op_types) { + // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations + if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend()) + return; + + op_registrations.support_checkers.push_back(std::make_unique()); + for (const auto& op : op_types) { + op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get()); + } +} + +static bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) { const auto is_ext_initializer = [&](const NodeArg& node_arg) { const auto& input_name(node_arg.Name()); @@ -58,18 +72,200 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node return false; } -template -void CreateSharedOpSupportCheckerImpl(const std::string& op_type, - OpSupportCheckerRegistrations& op_registrations, - const std::vector& op_types) { - // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations - if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend()) - return; +static bool IsQuantizationScaleSupported(const InitializedTensorSet& initializers, + const NodeUnitIODef& io_def, + const OpSupportCheckParams& params, + const std::string& op_type, + bool is_quant_matmul, + bool is_conv_matmul_u8s8_weight) { + const auto scale_name = io_def.quant_param->scale.Name(); + auto it = initializers.find(scale_name); + if (it == initializers.cend()) { + LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; + return false; + } + + const auto& scale_tensor = *it->second; + int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; + if (!is_conv_matmul_u8s8_weight) { + if (scales_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else if (scales_dim != 1) { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. 
Per-channel, only from Android API level 29 + if (is_quant_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } - op_registrations.support_checkers.push_back(std::make_unique()); - for (const auto& op : op_types) { - op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get()); + if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { + LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " + << "system NNAPI feature level: " << params.android_feature_level; + return false; + } + + Shape weight_shape; + if (!GetShape(io_def.node_arg, weight_shape)) + return false; + + if (weight_shape[0] != scales_dim) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_shape[0] + << " scale dimension " << scales_dim; + return false; + } + } + + return true; +} + +static bool IsQuantizationZeroPointSupported(const InitializedTensorSet& initializers, + const NodeUnitIODef& io_def, + const std::string& op_type, + const Path& model_path, + bool is_quant_matmul, + bool is_conv_matmul_u8s8_weight) { + // zero point is optional here + if (!io_def.quant_param->zero_point) + return true; + + const auto& zero_point_name = io_def.quant_param->zero_point->Name(); + if (!Contains(initializers, zero_point_name)) { + LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; + return false; + } + + const auto& zero_tensor = *initializers.at(zero_point_name); + int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; + + if (!is_conv_matmul_u8s8_weight) { + if (zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. 
Per-channel, only from Android API level 29 + if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " + << "actual zero point type: [" << zero_tensor.data_type() << "]"; + return false; + } + + if (zero_dim != 1) { + if (is_quant_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } + } + + // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, + // or a tensor with same channel as weight, for NNAPI we only support it be + // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel + // quantization is 0 there is no input for it + Shape weight_shape; + if (!GetShape(io_def.node_arg, weight_shape)) + return false; + + if (weight_shape[0] != zero_dim && zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_shape[0] + << " zero point dimension " << zero_dim; + return false; + } + + std::vector unpacked_tensor; + auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, model_path, unpacked_tensor); + if (!status.IsOK()) { + LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name + << ", error msg: " << status.ErrorMessage(); + return false; + } + + // Verify all onnx weight zero point(s) are 0(s) + const int8_t* zero_points = reinterpret_cast(unpacked_tensor.data()); + for (size_t i = 0; i < unpacked_tensor.size(); i++) { + if (zero_points[i] != 0) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " + << "zero_points[" << i << "] has value: " << zero_points[i]; + return false; + } + } } + + return true; +} + +// Check if the given quantized input(s) or output(s) is supported +static bool IsQuantizedIOSupported(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const std::vector& indices, const OpSupportCheckParams& params, IOKind io_kind) { + const auto& op_type = node_unit.OpType(); + auto quant_op_type = GetQuantizedOpType(node_unit); + + ORT_ENFORCE(quant_op_type != QuantizedOpType::Unknown, "[", op_type, "] is not a quantized op"); + + bool is_input = io_kind == IOKind::Input; + bool is_quant_conv = IsQuantizedConv(quant_op_type); + bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul); + const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs(); + + for (const auto idx : indices) { + if (idx >= io_defs.size()) { + LOGS_DEFAULT(VERBOSE) << (is_input ? 
"Input" : "Output") << " index, " << idx + << " >= size, " << io_defs.size() + << " of NodeUnit: " << node_unit.Name(); + return false; + } + + const auto& io_def = io_defs[idx]; + ORT_ENFORCE(io_def.quant_param.has_value(), "Input index, ", idx, " has no quant_param"); + + // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) + bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1; + bool is_conv_matmul_u8s8_weight = false; + + if (is_conv_matmul_weight) { + int32_t weight_type; + if (!GetType(io_def.node_arg, weight_type)) + return false; + is_conv_matmul_u8s8_weight = weight_type == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } + + int32_t input_type; + if (!GetType(io_def.node_arg, input_type)) + return false; + + // We only support u8 for most of the inputs and all outputs, with the exception for Quantized MatMul and Conv, + // which allows s8 weight (u8s8) + // TODO, add support of s8s8 + if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && + !(input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8 && is_conv_matmul_u8s8_weight)) { + LOGS_DEFAULT(VERBOSE) << op_type << "NodeUnit [" << node_unit.Name() + << "], type [" << op_type << "]'s " + << (is_input ? "Input" : "Output") << " index [" << idx + << "] has unsupported type [" << input_type << "]"; + return false; + } + + // Check scale and zero point + if (!IsQuantizationScaleSupported(initializers, io_def, params, op_type, + is_quant_matmul, is_conv_matmul_u8s8_weight)) { + return false; + } + + if (!IsQuantizationZeroPointSupported(initializers, io_def, op_type, node_unit.ModelPath(), + is_quant_matmul, is_conv_matmul_u8s8_weight)) { + return false; + } + } + + return true; } #pragma endregion helpers @@ -100,7 +296,9 @@ class BaseOpSupportChecker : public IOpSupportChecker { return ANEURALNETWORKS_FEATURE_LEVEL_1; } - virtual bool HasSupportedInputsImpl(const NodeUnit& node_unit) const; + virtual bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const; virtual int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const { return 1; } virtual int GetMaxSupportedOpSet(const NodeUnit& /* node_unit */) const { return 15; } @@ -112,7 +310,8 @@ class BaseOpSupportChecker : public IOpSupportChecker { private: bool HasSupportedOpSet(const NodeUnit& node_unit) const; - bool HasSupportedInputs(const NodeUnit& node_unit) const; + bool HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const; }; /* static */ void BaseOpSupportChecker::CreateSharedOpSupportChecker( @@ -138,7 +337,7 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer if (!IsNodeUnitTypeSupported(node_unit)) return false; - if (!HasSupportedInputs(node_unit)) + if (!HasSupportedInputOutputs(initializers, node_unit, params)) return false; // We do not support external initializers for now @@ -151,7 +350,8 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer return IsOpSupportedImpl(initializers, node_unit, params); } -bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const { +bool BaseOpSupportChecker::HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { // We do not support unknown(null) input shape auto has_supported_shape 
= [](const NodeArg& node_arg, const std::string& name, const std::string op_type) { const auto* shape_proto = node_arg.Shape(); @@ -185,10 +385,12 @@ bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const { return false; } } - return HasSupportedInputsImpl(node_unit); + return HasSupportedInputOutputsImpl(initializers, node_unit, params); } -bool BaseOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool BaseOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { // We only check the type of input 0 by default // specific op builder can override this const auto& input = node_unit.Inputs()[0].node_arg; @@ -245,8 +447,13 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker { const OpSupportCheckParams& params) const override; bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; + + bool IsNodeUnitTypeSupported(const NodeUnit& node_unit) const override; + static bool IsQuantizedOp(const NodeUnit& node_unit); }; /* static */ void BinaryOpSupportChecker::CreateSharedOpSupportChecker( @@ -259,10 +466,29 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker { "Mul", "Div", "QLinearAdd", + "QLinearMul", "Pow", }); } +bool BinaryOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const { + if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; + } + + return true; +} + +/* static */ bool BinaryOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QLinearAdd || + quant_type == QuantizedOpType::QLinearMul || + quant_type == QuantizedOpType::QDQAdd || + quant_type == QuantizedOpType::QDQMul; +} + int32_t BinaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel( const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const { const auto& op(node_unit.OpType()); @@ -281,21 +507,29 @@ int BinaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) cons const auto& op(node_unit.OpType()); // Add/Sub/Mul/Div/Pow opset 6- has broadcast attributes we do not support now - if (op != "QLinearAdd") + if (op != "QLinearAdd" && op != "QLinearMul") return 7; return 1; } -bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - bool is_qlinear_add = node_unit.OpType() == "QLinearAdd"; +bool BinaryOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { + bool is_quantized_op = IsQuantizedOp(node_unit); bool is_pow = node_unit.OpType() == "Pow"; - if (!is_qlinear_add && !is_pow) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + if (!is_quantized_op && !is_pow) + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); - if (is_qlinear_add) { - // QLinearAdd - if 
(!HasValidBinaryOpQuantizedInputs(node_unit)) + if (is_quantized_op) { + // QLinearAdd/QDQAdd/QLinearMul/QDQMul + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; } @@ -320,11 +554,10 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c return true; } -bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { +bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { const auto& op_type(node_unit.OpType()); const auto& inputs = node_unit.Inputs(); - bool op_is_qlinear = op_type == "QLinearAdd"; Shape input1_shape, input2_shape; if (!GetShape(inputs[0].node_arg, input1_shape) || !GetShape(inputs[1].node_arg, input2_shape)) @@ -339,32 +572,6 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi return false; } - if (op_is_qlinear) { - // For QLinearAdd, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } - return true; } @@ -382,7 +589,9 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker { return ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; }; bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, @@ -401,7 +610,9 @@ bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* return true; } -bool TransposeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool TransposeOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -561,8 +772,10 @@ class PoolOpSupportChecker : public BaseOpSupportChecker { return params.use_nchw ? 
ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; - bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override; + bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override; static bool IsQuantizedOp(const NodeUnit& node_unit); }; @@ -579,12 +792,21 @@ class PoolOpSupportChecker : public BaseOpSupportChecker { }); } +bool PoolOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const { + if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto quant_type = GetQuantizedOpType(node_unit); + return quant_type == QuantizedOpType::QDQAveragePool; + } + + return true; +} + /* static */ bool PoolOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) { return IsQuantizedPool(GetQuantizedOpType(node_unit)); } bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { + const OpSupportCheckParams& /* params */) const { const auto& op_name = node_unit.Name(); const auto& op_type = node_unit.OpType(); const auto& inputs = node_unit.Inputs(); @@ -601,7 +823,8 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } bool is_quant_pool = IsQuantizedOp(node_unit); - if (op_type == "AveragePool" || op_type == "MaxPool" || op_type == "QLinearAveragePool") { + bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool"; + if (is_average_pool || op_type == "MaxPool") { NodeAttrHelper helper(node_unit); const auto count_include_pad = helper.Get("count_include_pad", 0); @@ -642,20 +865,7 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } // We need to check if we have valid scales and zero points for QLinearAveragePool - if (is_quant_pool) { - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - + if (is_average_pool && is_quant_pool) { // NNAPI requires Quantized Average Pool has same scale and zero point for both input and output float input_scale = 0.0f; int32_t input_zp = 0; @@ -697,14 +907,23 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial return true; } -bool PoolOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - bool is_max_pool = node_unit.OpType() == "MaxPool"; +bool PoolOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { + const auto& op_type = node_unit.OpType(); bool is_quant_pool = IsQuantizedOp(node_unit); - if (!is_max_pool && !is_quant_pool) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + bool is_max_pool = op_type == "MaxPool"; + bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool"; + bool is_quant_average_pool = 
is_quant_pool && is_average_pool; + if (!is_max_pool && !is_quant_average_pool) + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); + + if (is_quant_average_pool) { + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; - if (is_quant_pool) { - return HasValidUnaryOpQuantizedInputs(node_unit); + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; } // is_max_pool @@ -742,7 +961,9 @@ class ConvOpSupportChecker : public BaseOpSupportChecker { return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } static bool IsQuantizedOp(const NodeUnit& node_unit); }; @@ -761,12 +982,20 @@ class ConvOpSupportChecker : public BaseOpSupportChecker { return IsQuantizedConv(GetQuantizedOpType(node_unit)); } -bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ConvOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { if (!IsQuantizedOp(node_unit)) - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); // QLinearConv only supports input of uint8 for now - if (!HasValidBinaryOpQuantizedInputs(node_unit)) + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; return true; @@ -813,34 +1042,10 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } if (is_quant_conv) { - // For QLinearConv, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - if (inputs.size() > 2 && !Contains(initializers, inputs[2].node_arg.Name())) { LOGS_DEFAULT(VERBOSE) << "Bias of QLinearConv must be known"; return false; } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; } return true; @@ -931,16 +1136,26 @@ class GemmOpSupportChecker : public BaseOpSupportChecker { private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool 
HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; }; -bool GemmOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool GemmOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { if (node_unit.OpType() != "QLinearMatMul") - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); // QLinearMatMul - if (!HasValidBinaryOpQuantizedInputs(node_unit)) + if (!HasValidBinaryOpQuantizedInputTypes(node_unit)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) return false; return true; @@ -1077,33 +1292,6 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial LOGS_DEFAULT(VERBOSE) << "B of MatMul must be known"; return false; } - - if (is_qlinear_matmul) { - // For QLinearMatMul, we only support uint8 output now - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << op_type - << "] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // All scale/zero points are initializer scalars - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } } else { LOGS_DEFAULT(VERBOSE) << "GemmOpSupportChecker, unknown op: " << op_type; } @@ -1127,7 +1315,9 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker { int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; int GetMinSupportedOpSet(const NodeUnit& node_unit) const override; @@ -1176,12 +1366,20 @@ int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit& return ANEURALNETWORKS_FEATURE_LEVEL_1; } -bool UnaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool UnaryOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { // We only need to override input check for QLinearSigmoid if (node_unit.OpType() != "QLinearSigmoid") - return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit); + return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params); - return HasValidUnaryOpQuantizedInputs(node_unit); + if 
(!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; + + return true; } // All ops except "Sin" opset 5- uses consumed_inputs attribute which is not supported for now @@ -1195,24 +1393,11 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const } /* static */ bool UnaryOpSupportChecker::IsQuantizedOpSupported( - const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) { + const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) { const auto& op_type = node_unit.OpType(); ORT_ENFORCE(op_type == "QLinearSigmoid"); - const auto& op_name = node_unit.Name(); - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - // NNAPI requires the scale be 1.f/256 and zero point to be 0 // See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180 float output_scale = 0.0f; @@ -1249,7 +1434,9 @@ class ConcatOpSupportChecker : public BaseOpSupportChecker { bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) const override; - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; }; bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, @@ -1268,7 +1455,9 @@ bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* in return true; } -bool ConcatOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ConcatOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -1331,37 +1520,17 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker { private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const override; - int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& /* params */) const override { return ANEURALNETWORKS_FEATURE_LEVEL_3; } -}; -bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() 
- << "] output type: [" << output_type - << "] is not supported for now"; - return false; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override { + return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output); } - - // For QuantizeLinear only output is quantized - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - - return true; -} +}; #pragma endregion @@ -1369,42 +1538,17 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker { private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const override; - int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */, const OpSupportCheckParams& /* params */) const override { return ANEURALNETWORKS_FEATURE_LEVEL_1; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; -}; -bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit, - const OpSupportCheckParams& params) const { - // For DequantizeLinear only input is quantized - // Check input scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - return true; -} - -bool DequantizeLinearOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { - int32_t input_type; - if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) - return false; - - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() - << "] Input type: [" << input_type - << "] is not supported for now"; - return false; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const override { + return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input); } - - return true; -} +}; #pragma endregion @@ -1480,7 +1624,9 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker { // We only support Resize opset 11+ here int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 11; } - bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override; + bool HasSupportedInputOutputsImpl( + const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit, + const OpSupportCheckParams& /* params */) const override; bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; } static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder }; @@ -1609,33 +1755,6 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi } } - if (IsQuantizedOp(node_unit)) { - // For QDQResize, we only support uint8 output now - // TODO, add int8 support to NNAPI, and maybe move all the output type check into a virtual function - // similar to HasSupportedInputsImpl - int32_t output_type; - if (!GetType(node_unit.Outputs()[0].node_arg, 
output_type)) - return false; - - if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << "[Resize] output type: [" << output_type - << "] is not supported for now"; - return false; - } - - // Check input scales and ZPs - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */)) - return false; - - // Check output scale and ZP - if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */)) - return false; - if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */)) - return false; - } - return true; } @@ -1653,7 +1772,9 @@ int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit& return ANEURALNETWORKS_FEATURE_LEVEL_2; } -bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const { +bool ResizeOpSupportChecker::HasSupportedInputOutputsImpl( + const InitializedTensorSet& initializers, const NodeUnit& node_unit, + const OpSupportCheckParams& params) const { int32_t input_type; if (!GetType(node_unit.Inputs()[0].node_arg, input_type)) return false; @@ -1666,6 +1787,14 @@ bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c return false; } + if (IsQuantizedOp(node_unit)) { + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input)) + return false; + + if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output)) + return false; + } + return true; } @@ -1870,6 +1999,7 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() { NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker); + NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearMul", BinaryOpSupportChecker); NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker); } diff --git a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc index 8dbc58c12ce74..c63d79806e777 100644 --- a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc @@ -43,61 +43,87 @@ void RunQLinearMathTestFromFloat( const quantization::Params& a_params, const std::vector& b, const std::vector& b_shape_origin, const quantization::Params& b_params, - const quantization::Params& c_params, - bool input_b_is_initializer = false, - bool all_initializer_scale_zero_point = false) { - size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size()); - std::vector a_shape = PrefixingDims(a_shape_origin, number_dims); - std::vector b_shape = PrefixingDims(b_shape_origin, number_dims); - // calc broadcasting shaped - std::vector c_shape(number_dims, 1); - for (size_t axis = 0; axis < number_dims; ++axis) { - if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) { - ORT_THROW("Shapes can not be broadcasted"); + const quantization::Params& c_params) { + const auto run_test = [&](bool input_b_is_initializer, + bool all_initializer_scale_zero_point) { + size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size()); + std::vector a_shape = PrefixingDims(a_shape_origin, number_dims); + std::vector b_shape = PrefixingDims(b_shape_origin, number_dims); + // calc broadcasting 
shaped + std::vector c_shape(number_dims, 1); + for (size_t axis = 0; axis < number_dims; ++axis) { + if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) { + ORT_THROW("Shapes can not be broadcasted"); + } + c_shape[axis] = std::max(a_shape[axis], b_shape[axis]); } - c_shape[axis] = std::max(a_shape[axis], b_shape[axis]); - } - std::vector a_strides, b_strides, c_strides; - auto c_size = CalcStrides(c_shape, c_strides, false); - auto a_size = CalcStrides(a_shape, a_strides, true); - auto b_size = CalcStrides(b_shape, b_strides, true); - if (a_size != static_cast(a.size()) || b_size != static_cast(b.size())) { - ORT_THROW("Input size not match input shape!"); - } - constexpr int qmax = std::numeric_limits::max(); - constexpr int qmin = std::numeric_limits::min(); - - OpTester test(op_name, 1, onnxruntime::kMSDomain); - std::vector a_quantized = QuantizeTestVector(a, a_params); - test.template AddInput("A", a_shape_origin, a_quantized); - test.AddInput("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point); - - std::vector b_quantized = QuantizeTestVector(b, b_params); - test.template AddInput("B", b_shape_origin, b_quantized, input_b_is_initializer); - test.AddInput("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point); - - test.AddInput("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point); - test.template AddInput("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point); - std::vector c(c_size); - for (int64_t offset = 0; offset < c_size; ++offset) { - int64_t remain = offset, a_offset = 0, b_offset = 0; - for (size_t axis = 0; axis < number_dims; ++axis) { - int64_t index = remain / c_strides[axis]; - remain = remain % c_strides[axis]; - a_offset += index * a_strides[axis]; - b_offset += index * b_strides[axis]; + std::vector a_strides, b_strides, c_strides; + auto c_size = CalcStrides(c_shape, c_strides, false); + auto a_size = CalcStrides(a_shape, a_strides, true); + auto b_size = CalcStrides(b_shape, b_strides, true); + if (a_size != static_cast(a.size()) || b_size != static_cast(b.size())) { + ORT_THROW("Input size not match input shape!"); + } + constexpr int qmax = std::numeric_limits::max(); + constexpr int qmin = std::numeric_limits::min(); + + OpTester test(op_name, 1, onnxruntime::kMSDomain); + std::vector a_quantized = QuantizeTestVector(a, a_params); + test.template AddInput("A", a_shape_origin, a_quantized); + test.AddInput("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point); + + std::vector b_quantized = QuantizeTestVector(b, b_params); + test.template AddInput("B", b_shape_origin, b_quantized, input_b_is_initializer); + test.AddInput("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point); + + test.AddInput("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point); + test.template AddInput("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point); + std::vector c(c_size); + for (int64_t offset = 0; offset < c_size; ++offset) { + int64_t remain = offset, a_offset = 0, b_offset = 0; + for (size_t axis = 0; axis < number_dims; 
++axis) { + int64_t index = remain / c_strides[axis]; + remain = remain % c_strides[axis]; + a_offset += index * a_strides[axis]; + b_offset += index * b_strides[axis]; + } + + float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params); + float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params); + c[offset] = clampi(static_cast(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax); } - float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params); - float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params); - c[offset] = clampi(static_cast(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax); - } - test.template AddOutput("C", c_shape, c); + float abs_error = 0.0f; + + // For quantized models, NNAPI's rounding is different from the CPU provider's + // Sometimes the result is within +/-1 of the CPU provider's result + // For ONNX, we use rounding to nearest, ties to even. + // For NNAPI, it uses std::round, which is HALF_AWAY_FROM_ZERO, see + // https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp + // Use 1 as abs_error, which is the smallest possible difference for uint8_t + // + // NOTE: for now the tolerance will only apply if NNAPI is actually used, + // if for any reason the execution falls back to CPU, we still expect an exact match + // See 'void Check(...)' in onnxruntime/test/providers/provider_test_utils.cc +#ifdef USE_NNAPI + abs_error = 1.0f; +#endif + + test.template AddOutput("C", c_shape, c, false /* sort_output */, 0.0f /* rel_error */, abs_error); + + test.Run(); + }; + + run_test(false /* input_b_is_initializer */, false /* all_initializer_scale_zero_point */); - test.Run(); + // NNAPI will require all the scales and zero points to be initializers + run_test(false /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */); + + // We also want to test the case where input B is an initializer + run_test(true /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */); } // total 32 + 31 elements to cover all path @@ -145,22 +171,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorFull) { A, {63}, A_params, B, {63}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {63}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {63}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) { @@ -180,22 +190,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) { A, {3, 3, 7}, A_params, B, {3, 1, 7}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 3, 7}, A_params, - B, {3, 1, 7}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 3, 7}, A_params, - B, {3, 1, 7}, B_params, - C_params, - true /* 
input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) { @@ -212,22 +206,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) { B, {1}, B_params, A, {63}, A_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {1}, B_params, - A, {63}, A_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {1}, B_params, - A, {63}, A_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) { @@ -244,22 +222,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) { B, {3, 1, 1}, B_params, A, {3, 7, 3}, A_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {3, 1, 1}, B_params, - A, {3, 7, 3}, A_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - B, {3, 1, 1}, B_params, - A, {3, 7, 3}, A_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) { @@ -276,22 +238,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) { A, {63}, A_params, B, {1}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {1}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {63}, A_params, - B, {1}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) { @@ -308,22 +254,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) { A, {3, 7, 3}, A_params, B, {1, 1, 3}, B_params, C_params); - - // NNAPI will require all the scales and zero points be initializers - // We also want to test the case input B is an initializer - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 7, 3}, A_params, - B, {1, 1, 3}, B_params, - C_params, - false /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); - - RunQLinearMathTestFromFloat("QLinearAdd", add_function, - A, {3, 7, 3}, A_params, - B, {1, 1, 3}, B_params, - C_params, - true /* input_b_is_initializer */, - true /* all_initializer_scale_zero_point */); } TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) { diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 7a054c9a64352..fbbb0b8af1028 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -81,10 +81,27 @@ GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector& input_shape, c template GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector& input_shape) { return [input_shape](ModelTestBuilder& builder) { + +#ifdef USE_NNAPI // NNAPI require consistent 
scales/ZPs for DQ -> Pool -> Q + float dq_scale = 0.0038f; + float pool_output_scale = 0.0038f; + float q_scale = 0.0038f; + InputType dq_zp = std::numeric_limits::max() / 2; + InputType pool_output_zp = std::numeric_limits::max() / 2; + InputType q_zp = std::numeric_limits::max() / 2; +#else + float dq_scale = 0.0035f; + float pool_output_scale = 0.0038f; + float q_scale = 0.0039f; + InputType dq_zp = 7; + InputType pool_output_zp = std::numeric_limits::max() / 2; + InputType q_zp = std::numeric_limits::max() / 2; +#endif + auto* input_arg = builder.MakeInput(input_shape, -1.f, 1.f); auto* output_arg = builder.MakeOutput(); // add QDQ + AveragePool - auto* dq_output = AddQDQNodePair(builder, input_arg, .0035f, 7); + auto* dq_output = AddQDQNodePair(builder, input_arg, dq_scale, dq_zp); auto* averagepool_output = builder.MakeIntermediate(); Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output}); std::vector pads((input_shape.size() - 2) * 2, 1); @@ -95,12 +112,12 @@ GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector& input_s // add QDQ output auto* q_output = builder.MakeIntermediate(); builder.AddQuantizeLinearNode(averagepool_output, - .0038f, - std::numeric_limits::max() / 2, + pool_output_scale, + pool_output_zp, q_output); builder.AddDequantizeLinearNode(q_output, - .0039f, - std::numeric_limits::max() / 2, + q_scale, + q_zp, output_arg); }; } @@ -110,5 +127,65 @@ GetQDQTestCaseFn BuildQDQResizeTestCase(const std::vector& input_shape, const std::string& mode = "nearest", const std::string& coordinate_transformation_mode = "half_pixel"); +template +GetQDQTestCaseFn BuildBinaryOpTestCase(const std::vector& input_shape, + const std::string& op_type) { + return [input_shape, op_type](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput(input_shape, -1.f, 1.f); + auto* input2_arg = builder.MakeInput(input_shape, -1.f, 1.f); + auto* output_arg = builder.MakeOutput(); + +#ifdef USE_NNAPI // NNAPI requires consistent scales for DQ -> bin_op_input and bin_op_output -> Q + float q_scale = 0.008f; + float op_input_scale = 0.008f; + float op_output_scale = 0.0076f; + float dq_scale = 0.0076f; +#else + float q_scale = 0.008f; + float op_input_scale = 0.0079f; + float op_output_scale = 0.0076f; + float dq_scale = 0.0078f; +#endif + + // add QDQ 1 + auto* q1_output = builder.MakeIntermediate(); + auto* dq1_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(input1_arg, + q_scale, + std::numeric_limits::max() / 2, + q1_output); + builder.AddDequantizeLinearNode(q1_output, + op_input_scale, + std::numeric_limits::max() / 2, + dq1_output); + + // add QDQ 2 + auto* q2_output = builder.MakeIntermediate(); + auto* dq2_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(input2_arg, + q_scale, + std::numeric_limits::max() / 2, + q2_output); + builder.AddDequantizeLinearNode(q2_output, + op_input_scale, + std::numeric_limits::max() / 2, + dq2_output); + + // add binary operator + auto* binary_op_output = builder.MakeIntermediate(); + builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output}); + + // add QDQ output + auto* q3_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(binary_op_output, + op_output_scale, + std::numeric_limits::max() / 2, + q3_output); + builder.AddDequantizeLinearNode(q3_output, + dq_scale, + std::numeric_limits::max() / 2, + output_arg); + }; +} } // namespace test } // namespace onnxruntime \ No newline at end of file diff --git 
a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 7ea62a2817100..73b07e8adbd6b 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -39,7 +39,7 @@ namespace test { template void QDQTransformerConvTests() { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { - auto check_conv_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value && std::is_same::value && @@ -57,7 +57,7 @@ void QDQTransformerConvTests() { }; TransformerTester(BuildQDQConvTestCase(input_shape, weights_shape), - check_conv_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -136,7 +136,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { builder.AddQuantizeLinearNode(reshape_output, .0039f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -146,7 +146,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset_version); @@ -197,7 +197,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -206,7 +206,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({1, 12, 37}, {32, 12, 5}); @@ -217,7 +217,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { template void QDQTransformerAveragePoolTests() { auto test_case = [&](const std::vector& input_shape) { - auto check_averagepool_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value) { EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -233,7 +233,7 @@ void QDQTransformerAveragePoolTests() { }; TransformerTester(BuildQDQAveragePoolTestCase(input_shape), - check_averagepool_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -266,52 +266,7 @@ TEST(QDQTransformerTests, AveragePool_U8S8) { template void QDQTransformerBinaryOpTests(const std::string& op_type) { auto test_case = [&](const std::vector& input_shape) { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input1_arg = builder.MakeInput(input_shape, -1.f, 1.f); - auto* input2_arg = builder.MakeInput(input_shape, -1.f, 1.f); - auto* output_arg = builder.MakeOutput(); - - // add QDQ 1 - auto* q1_output = builder.MakeIntermediate(); - auto* dq1_output = builder.MakeIntermediate(); - 
builder.AddQuantizeLinearNode(input1_arg, - .004f, - std::numeric_limits::max() / 2, - q1_output); - builder.AddDequantizeLinearNode(q1_output, - .0039f, - std::numeric_limits::max() / 2, - dq1_output); - - // add QDQ 2 - auto* q2_output = builder.MakeIntermediate(); - auto* dq2_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(input2_arg, - .004f, - std::numeric_limits::max() / 2, - q2_output); - builder.AddDequantizeLinearNode(q2_output, - .0039f, - std::numeric_limits::max() / 2, - dq2_output); - - // add binary operator - auto* binary_op_output = builder.MakeIntermediate(); - builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output}); - - // add QDQ output - auto* q3_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(binary_op_output, - .0038f, - std::numeric_limits::max() / 2, - q3_output); - builder.AddDequantizeLinearNode(q3_output, - .0039f, - std::numeric_limits::max() / 2, - output_arg); - }; - - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (std::is_same::value && std::is_same::value) { @@ -327,8 +282,8 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) { } }; - TransformerTester(build_test_case, - check_binary_op_graph, + TransformerTester(BuildBinaryOpTestCase(input_shape, op_type), + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -426,7 +381,7 @@ void QDQTransformerMatMulTests(bool has_output_q) { } }; - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (has_output_q) { if constexpr (std::is_same::value && @@ -459,7 +414,7 @@ void QDQTransformerMatMulTests(bool has_output_q) { }; TransformerTester(build_test_case, - check_binary_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -696,14 +651,14 @@ TEST(QDQTransformerTests, Gather) { builder.AddQuantizeLinearNode(gather_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Gather"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {24, 12}); @@ -728,14 +683,14 @@ TEST(QDQTransformerTests, Transpose) { builder.AddQuantizeLinearNode(transpose_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Transpose"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({2, 13, 12, 37}, {0, 3, 1, 2}); @@ -760,13 +715,13 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) { 
builder.AddQuantizeLinearNode(transpose_output, .003f, 1, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QuantizeLinear"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({2, 13, 12, 37}, {0, 3, 1, 2}); @@ -775,7 +730,7 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) { TEST(QDQTransformerTests, Resize) { auto test_case = [&](const std::vector& input1_shape, const std::vector& sizes_shape) { - auto check_resize_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); @@ -783,7 +738,7 @@ TEST(QDQTransformerTests, Resize) { }; TransformerTester(BuildQDQResizeTestCase(input1_shape, sizes_shape), - check_resize_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -828,7 +783,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) { builder.AddQuantizeLinearNode(resize_output, .003f, 1, output_arg); }; - auto check_qdq_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["Concat"], 1); @@ -836,7 +791,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_qdq_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -867,7 +822,7 @@ TEST(QDQTransformerTests, ResizeReshape) { builder.AddNode("Reshape", {qdq_resize_output, reshape_shape}, {output_arg}); }; - auto check_qdq_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Resize"], 1); EXPECT_EQ(op_to_count["Reshape"], 1); @@ -875,7 +830,7 @@ TEST(QDQTransformerTests, ResizeReshape) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_qdq_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -905,13 +860,13 @@ TEST(QDQTransformerTests, ArgMax) { argmax_node.AddAttribute("select_last_index", static_cast(select_last_index)); }; - auto check_argmax_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["ArgMax"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_argmax_graph, + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, /* opset_version */ 13); @@ -939,14 +894,14 @@ TEST(QDQTransformerTests, QLinearMatMul) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = 
CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearMatMul"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 2); EXPECT_EQ(op_to_count["DequantizeLinear"], 0); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -970,7 +925,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MatMul"], 1); EXPECT_EQ(op_to_count["QLinearMatMul"], 0); @@ -978,7 +933,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) { EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -1006,7 +961,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) { builder.AddQuantizeLinearNode(matmul_output, .0039f, 135, output_arg); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MatMul"], 1); EXPECT_EQ(op_to_count["QLinearMatMul"], 0); @@ -1014,7 +969,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) { EXPECT_EQ(op_to_count["DequantizeLinear"], 2); }; - TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({12, 37}, {37, 12}); @@ -1043,7 +998,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) { builder.AddNode("MatMul", {dq_output_1, dq_output_2}, {output_arg}); }; - auto check_matmul_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], 0); @@ -1051,7 +1006,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) { }; TransformerTester(build_test_case, - check_matmul_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1086,7 +1041,7 @@ TEST(QDQTransformerTests, ConvRelu) { builder.AddQuantizeLinearNode(relu_output, .0039f, is_zp_zero ? 
0 : 1, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (is_zp_zero) { EXPECT_EQ(op_to_count["QLinearConv"], 1); @@ -1104,7 +1059,7 @@ TEST(QDQTransformerTests, ConvRelu) { } }; - TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2); + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; test_case({1, 12, 37}, {32, 12, 5}, true); @@ -1150,7 +1105,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) { builder.AddDequantizeLinearNode(q_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -1160,7 +1115,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1213,7 +1168,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1); @@ -1223,7 +1178,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1277,7 +1232,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Conv"], 1); EXPECT_EQ(op_to_count["QLinearConv"], 0); @@ -1288,7 +1243,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1325,7 +1280,7 @@ void QDQTransformerLeakyReluTests() { output_arg); }; - auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if constexpr (std::is_same::value) { EXPECT_EQ(op_to_count["com.microsoft.QLinearLeakyRelu"], 1); @@ -1341,7 +1296,7 @@ void QDQTransformerLeakyReluTests() { }; TransformerTester(build_test_case, - check_binary_op_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1401,7 +1356,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["Transpose"], 1); @@ -1410,7 +1365,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + 
check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1461,7 +1416,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -1472,7 +1427,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1512,7 +1467,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["Transpose"], 1); @@ -1521,7 +1476,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1572,7 +1527,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { } }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QLinearConv"], 1); EXPECT_EQ(op_to_count["MaxPool"], 1); @@ -1583,7 +1538,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1704,7 +1659,7 @@ TEST(QDQTransformerTests, Concat) { } }; - auto check_mp_reshape_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) { + auto check_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); if (has_input_float || has_input_int8 || has_output_int8) { EXPECT_EQ(op_to_count["com.microsoft.QLinearConcat"], 0); @@ -1716,7 +1671,7 @@ TEST(QDQTransformerTests, Concat) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2, 12 /*opset_version*/, @@ -1763,7 +1718,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) { builder.AddNode("Reshape", {maxpool_output, reshape_shape}, {output_arg}); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["MaxPool"], 1); EXPECT_EQ(op_to_count["Reshape"], 1); @@ -1773,7 +1728,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1799,7 +1754,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) { builder.AddQuantizeLinearNode(reshape_output, same_scale ? .004f : .0039f, same_zp ? 
129 : 128, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["Reshape"], 1); EXPECT_EQ(op_to_count["QuantizeLinear"], same_scale && same_zp ? 1 : 2); @@ -1807,7 +1762,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1833,7 +1788,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) { builder.AddQuantizeLinearNode(transpose_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); EXPECT_EQ(graph_viewer.GetNode(node_topology_list[0])->OpType(), "QuantizeLinear"); @@ -1841,7 +1796,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1866,7 +1821,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) { transpose_node.AddAttribute("perm", perms); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -1875,7 +1830,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1902,7 +1857,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) { transpose_node.AddAttribute("perm", perms); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); GraphViewer graph_viewer(session.GetGraph()); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -1911,7 +1866,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) { }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; @@ -1935,14 +1890,14 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) { builder.AddQuantizeLinearNode(dq_output, .0035f, 135, output_arg); }; - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { + auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); EXPECT_EQ(op_to_count["QuantizeLinear"], 1); EXPECT_EQ(op_to_count["DequantizeLinear"], 1); }; TransformerTester(build_test_case, - check_mp_reshape_graph, + check_graph, TransformerLevel::Level1, TransformerLevel::Level2); }; diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 4316ff7a51d9e..834f0a7378275 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -271,7 +271,9 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) { << "No node should be taken by the NNAPI EP"; } 
-static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* test_description) { +static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, + const char* test_description, + const EPVerificationParams& params = EPVerificationParams()) { onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger()); Graph& graph = model.MainGraph(); ModelTestBuilder helper(graph); @@ -286,7 +288,7 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* #if defined(__ANDROID__) RunAndVerifyOutputsWithEP(model_data, "NnapiExecutionProviderTest.TestQDQModel", std::make_unique(0), - helper.feeds_); + helper.feeds_, params); #else // test load only SessionOptions so; @@ -306,7 +308,8 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) { uint8_t /* OutputType */>( {1, 1, 5, 5} /*input_shape*/, {1, 1, 3, 3} /*weights_shape*/), - "nnapi_qdq_test_graph_conv"); + "nnapi_qdq_test_graph_conv", + {true /* verify_entire_graph_use_ep */}); } TEST(NnapiExecutionProviderTest, TestQDQResize) { @@ -316,14 +319,44 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) { {1, 3, 32, 32} /* sizes_data */, "linear" /* mode */, "asymmetric" /* coordinate_transformation_mode */), - "nnapi_qdq_test_graph_resize"); + "nnapi_qdq_test_graph_resize", + {true /* verify_entire_graph_use_ep */}); } TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { + // NNAPI uses different rounding, which may cause a ~1% difference in the result RunQDQModelTest(BuildQDQAveragePoolTestCase( {1, 3, 32, 32} /* input_shape */), - "nnapi_qdq_test_graph_averagepool"); + "nnapi_qdq_test_graph_averagepool", + { + true /* verify_entire_graph_use_ep */, + 1e-2f /* fp32_abs_err */, + }); +} + +TEST(NnapiExecutionProviderTest, TestQDQAdd) { + RunQDQModelTest(BuildBinaryOpTestCase( + {1, 23, 13, 13} /* input_shape */, + "Add" /* op_type */), + "nnapi_qdq_test_graph_add", + {true /* verify_entire_graph_use_ep */}); +} + +TEST(NnapiExecutionProviderTest, TestQDQMul) { + // NNAPI uses different rounding, which may cause a ~1% difference in the result + RunQDQModelTest(BuildBinaryOpTestCase( + {1, 23, 13, 13} /* input_shape */, + "Mul" /* op_type */), + "nnapi_qdq_test_graph_mul", + { + true /* verify_entire_graph_use_ep */, + 1e-2f /* fp32_abs_err */, + }); } #endif // !(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h index 60d2a3e8caba2..50859d826fa2d 100644 --- a/onnxruntime/test/util/include/test_utils.h +++ b/onnxruntime/test/util/include/test_utils.h @@ -15,6 +15,18 @@ class Graph; namespace test { +// struct to hold some verification params for RunAndVerifyOutputsWithEP +struct EPVerificationParams { + // Verify that the entire graph is taken by the EP. + // If this is set to false, verify that at least one node is assigned to 'execution_provider'. + bool verify_entire_graph_use_ep{false}; + + // Some EPs may use different rounding than the ORT CPU EP, which may cause a bigger abs error than + // the default of 1e-5f, especially for scenarios such as [Q -> Quantized op -> DQ]. + // Set this only if necessary. + float fp32_abs_err = 1e-5f; +}; + // return number of nodes in the Graph and any subgraphs that are assigned to the specified execution provider int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type); @@ -23,13 +35,14 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type); void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id, 
std::unique_ptr execution_provider, - const NameMLValMap& feeds); + const NameMLValMap& feeds, + const EPVerificationParams& params = EPVerificationParams()); // helper function that takes in model_data -// used in nnapi qdq model tests void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds); + const NameMLValMap& feeds, + const EPVerificationParams& params = EPVerificationParams()); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc index 220b1342f12d9..b069b08810cb8 100644 --- a/onnxruntime/test/util/test_utils.cc +++ b/onnxruntime/test/util/test_utils.cc @@ -18,7 +18,8 @@ namespace onnxruntime { namespace test { static void VerifyOutputs(const std::vector& output_names, const std::vector& expected_fetches, - const std::vector& fetches) { + const std::vector& fetches, + const EPVerificationParams& params) { ASSERT_EQ(expected_fetches.size(), fetches.size()); for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) { @@ -40,10 +41,8 @@ static void VerifyOutputs(const std::vector& output_names, << " mismatch for " << output_names[i]; break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - constexpr float abs_err = 1e-5f; - EXPECT_THAT(ltensor.DataAsSpan(), - ::testing::Pointwise(::testing::FloatNear(abs_err), rtensor.DataAsSpan())); + ::testing::Pointwise(::testing::FloatNear(params.fp32_abs_err), rtensor.DataAsSpan())); break; } default: @@ -72,16 +71,18 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type) { void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds) { + const NameMLValMap& feeds, + const EPVerificationParams& params) { // read raw data from model provided by the model_path std::ifstream stream(model_path, std::ios::in | std::ios::binary); std::string model_data((std::istreambuf_iterator(stream)), std::istreambuf_iterator()); - RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds); + RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds, params); } void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id, std::unique_ptr execution_provider, - const NameMLValMap& feeds) { + const NameMLValMap& feeds, + const EPVerificationParams& params) { SessionOptions so; so.session_logid = log_id; RunOptions run_options; @@ -122,12 +123,17 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id // make sure that some nodes are assigned to the EP, otherwise this test is pointless... const auto& graph2 = session_object2.GetGraph(); auto ep_nodes = CountAssignedNodes(graph2, provider_type); - ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; + if (params.verify_entire_graph_use_ep) { + // Verify the entire graph is assigned to the EP + ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type; + } else { + ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; + } // Run with EP and verify the result std::vector fetches; ASSERT_STATUS_OK(session_object2.Run(run_options, feeds, output_names, &fetches)); - VerifyOutputs(output_names, expected_fetches, fetches); + VerifyOutputs(output_names, expected_fetches, fetches, params); } #if !defined(DISABLE_SPARSE_TENSORS)
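
Note: the abs_error / fp32_abs_err tolerances introduced above follow from the rounding mismatch already described in the test comments: the CPU reference path in the tests rounds with std::nearbyintf (ties to even), while NNAPI's Quantize implementation uses std::round (half away from zero). The standalone sketch below is illustrative only and is not part of this change; the values are made up to show where the two rounding modes disagree by exactly one unit.

// Illustrative only: why quantized NNAPI outputs can differ from the CPU EP by +/-1.
#include <cfenv>
#include <cmath>
#include <iostream>

int main() {
  std::fesetround(FE_TONEAREST);  // round-half-to-even, as used by std::nearbyintf on the CPU reference path
  for (float v : {0.5f, 1.5f, 2.5f}) {
    const int ties_to_even = static_cast<int>(std::nearbyintf(v));  // CPU reference style
    const int half_away = static_cast<int>(std::round(v));          // NNAPI style
    std::cout << v << " -> ties_to_even: " << ties_to_even
              << ", half_away_from_zero: " << half_away << '\n';
  }
  // 0.5 and 2.5 round differently (0 vs 1, 2 vs 3), so a uint8 result can be off by one,
  // which is why the tests above allow abs_error = 1 (or ~1% for float outputs).
  return 0;
}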
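The reference computation in RunQLinearMathTestFromFloat above follows the usual dequantize -> compute -> requantize pattern: dequantize A and B with their own scale/zero point, apply the float operation, divide by the output scale, round, add the output zero point, and clamp to the uint8 range. The minimal per-element sketch below mirrors that pattern only; all scales, zero points, and values are made up for illustration and are not taken from the tests.

// Minimal per-element QLinearAdd-style reference: dequantize -> add -> requantize.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const float a_scale = 0.02f, b_scale = 0.03f, c_scale = 0.05f;
  const int a_zp = 128, b_zp = 100, c_zp = 128;

  const uint8_t a_q = 200, b_q = 90;
  const float a = (a_q - a_zp) * a_scale;  // dequantize A
  const float b = (b_q - b_zp) * b_scale;  // dequantize B

  // Requantize the float result into C's scale/zero point and clamp to the uint8 range.
  const int rounded = static_cast<int>(std::nearbyintf((a + b) / c_scale)) + c_zp;
  const uint8_t c_q = static_cast<uint8_t>(std::clamp(rounded, 0, 255));

  std::cout << "quantized result: " << static_cast<int>(c_q) << '\n';
  return 0;
}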