[NNAPI QDQ] Add QDQAdd/Mul, update to NNAPI QDQ handling, update some test settings #10483

Merged 7 commits on Feb 9, 2022
Changes from 1 commit
210 changes: 13 additions & 197 deletions onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -72,6 +72,10 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
return QuantizedOpType::QDQResize;
else if (op_type == "AveragePool")
return QuantizedOpType::QDQAveragePool;
else if (op_type == "Add")
return QuantizedOpType::QDQAdd;
else if (op_type == "Mul")
return QuantizedOpType::QDQMul;
} else {
// throw?
// Do we want to throw here? This seems to have been overlooked previously
@@ -114,25 +118,12 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type) {
bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) {
return quant_op_type == QuantizedOpType::QLinearMatMul ||
quant_op_type == QuantizedOpType::QLinearAdd ||
quant_op_type == QuantizedOpType::QDQAdd ||
quant_op_type == QuantizedOpType::QDQMul ||
IsQuantizedConv(quant_op_type);
}

bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) {
int32_t input_type;
if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
return false;

if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
<< "] Input type: [" << input_type
<< "] is not supported for now";
return false;
}

return true;
}

bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) {
auto quant_op_type = GetQuantizedOpType(node_unit);
int32_t a_input_type, b_input_type;
if (!IsQuantizedBinaryOp(quant_op_type)) {
@@ -146,16 +137,17 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
if (!GetType(inputs[1].node_arg, b_input_type))
return false;

// QlinearConv supports u8u8 or u8s8
// QLinearMatMul/Add only support u8u8
bool is_quant_conv = IsQuantizedConv(quant_op_type);
// QLinearConv/MatMul supports u8u8 or u8s8
// QLinearAdd/QDQAdd/QDQMul only support u8u8
bool is_quant_conv_or_matmul = IsQuantizedConv(quant_op_type) || (quant_op_type == QuantizedOpType::QLinearMatMul);

bool has_valid_qlinear_conv_weight =
(b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);

if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
(!is_quant_conv && a_input_type != b_input_type) ||
(is_quant_conv && !has_valid_qlinear_conv_weight)) {
(!is_quant_conv_or_matmul && a_input_type != b_input_type) ||
(is_quant_conv_or_matmul && !has_valid_qlinear_conv_weight)) {
LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
<< "] A Input type: [" << a_input_type
<< "] B Input type: [" << b_input_type
@@ -166,182 +158,6 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
return true;
}

bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input) {
const auto& op_type = node_unit.OpType();
auto quant_op_type = GetQuantizedOpType(node_unit);
bool is_quant_conv = IsQuantizedConv(quant_op_type);
bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
for (const auto idx : indices) {
if (idx >= io_defs.size()) {
LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index, " << idx
<< " >= size, " << io_defs.size()
<< " of NodeUnit: " << node_unit.Name();
return false;
}

const auto& io_def = io_defs[idx];
if (!io_def.quant_param.has_value()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx
<< " has no quant_param";
return false;
}

const auto scale_name = io_def.quant_param->scale.Name();

if (!Contains(initializers, scale_name)) {
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
return false;
}

// If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
bool is_conv_matmul_u8s8_weight = false;

if (is_conv_matmul_weight) {
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}

const auto& scale_tensor = *initializers.at(scale_name);
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
if (!is_conv_matmul_u8s8_weight) {
if (scales_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
} else if (scales_dim != 1) {
// For u8s8 Qlinear[Conv/MatMul], we support
// 1. Per-tensor, the weight will be transformed to uint8 later
// 2. Per-channel, only from Android API level 29
if (is_quant_matmul) {
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
return false;
}

if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) {
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
<< "system NNAPI feature level: " << params.android_feature_level;
return false;
}

const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
if (weight_tensor.dims()[0] != scales_dim) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " scale dimension " << scales_dim;
return false;
}
}
}

return true;
}

bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, bool is_input) {
const auto& op_type = node_unit.OpType();
auto quant_op_type = GetQuantizedOpType(node_unit);
bool is_quant_conv = IsQuantizedConv(quant_op_type);
bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);

const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
for (const auto idx : indices) {
if (idx >= io_defs.size()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, "
<< (is_input ? "Input" : "Output") << " index, " << idx
<< " >= size, " << io_defs.size();
return false;
}

const auto& io_def = io_defs[idx];
if (!io_def.quant_param.has_value()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx
<< " has no quant_param";
return false;
}

// zero point is optional here
if (!io_def.quant_param->zero_point)
return true;

const auto& zero_point_name = io_def.quant_param->zero_point->Name();
if (!Contains(initializers, zero_point_name)) {
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
return false;
}

bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
bool is_conv_matmul_u8s8_weight = false;

if (is_conv_matmul_weight) {
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}

const auto& zero_tensor = *initializers.at(zero_point_name);
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];

if (!is_conv_matmul_u8s8_weight) {
if (zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
} else {
// For u8s8 Qlinear[Conv/MatMul], we support
// 1. Per-tensor, the weight will be transformed to uint8 later
// 2. Per-channel, only from Android API level 29
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
return false;
}

if (zero_dim != 1) {
if (is_quant_matmul) {
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
return false;
}
}

// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
// or a tensor with same channel as weight, for NNAPI we only support it be
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
// quantization is 0 there is no input for it
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " zero point dimension " << zero_dim;
return false;
}

std::vector<uint8_t> unpacked_tensor;
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor);
if (!status.IsOK()) {
LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
<< ", error msg: " << status.ErrorMessage();
return false;
}

// Verify all onnx weight zero point(s) are 0(s)
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.data());
for (size_t i = 0; i < unpacked_tensor.size(); i++) {
if (zero_points[i] != 0) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, "
<< "zero_points[" << i << "] has value: " << zero_points[i];
return false;
}
}
}
}

return true;
}

common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
float& scale, int32_t& zero_point) {
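For readers skimming the helper.cc change above: the following is a minimal, self-contained sketch (not part of the PR) of the input-type combinations the updated HasValidBinaryOpQuantizedInputTypes accepts. The DType enum and function name are hypothetical stand-ins; the real check compares ONNX TensorProto data types on a NodeUnit.

```cpp
#include <cstdint>

// Hypothetical stand-in for the ONNX tensor element types involved.
enum class DType { UINT8, INT8, OTHER };

// conv_or_matmul mirrors is_quant_conv_or_matmul above
// (QLinearConv/QDQConv/QLinearMatMul); everything else that reaches this
// check (QLinearAdd/QDQAdd/QDQMul) must be u8u8.
bool AcceptsBinaryOpInputTypes(bool conv_or_matmul, DType a, DType b) {
  if (a != DType::UINT8)
    return false;                                   // input A must always be uint8
  if (conv_or_matmul)
    return b == DType::UINT8 || b == DType::INT8;   // weight/B may be uint8 or int8 (u8u8 or u8s8)
  return b == DType::UINT8;                         // Add/Mul: both inputs uint8 (u8u8)
}
```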
14 changes: 3 additions & 11 deletions onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@@ -88,6 +88,8 @@ enum class QuantizedOpType : uint8_t {
QDQConv,
QDQResize,
QDQAveragePool,
QDQAdd,
QDQMul,
// TODO, add other QDQ NodeUnit types
};

@@ -113,18 +115,8 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type);
// Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,...
bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type);

// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool]
bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit);
// Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add]
bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit);

// Check if a qlinear op has valid scales for given indices
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input);

// Check if a qlinear op has valid zero points for given indices
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, bool is_input);
bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit);

common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
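To make the helper.h additions concrete, here is a reduced, hypothetical mirror of how the new QDQAdd/QDQMul classification behaves. The real GetQuantizedOpType inspects a NodeUnit (single node vs. QDQ group) rather than plain strings, and the enum below only lists the members relevant to this change.

```cpp
#include <string>

// Hypothetical subset of the QuantizedOpType enum extended in helper.h.
enum class QuantizedOpType { Unknown, QLinearAdd, QDQAdd, QDQMul };

// Sketch of the added branches: a QDQ group is classified by its target op type.
QuantizedOpType ClassifyQuantizedOp(bool is_qdq_group, const std::string& op_type) {
  if (is_qdq_group) {
    if (op_type == "Add") return QuantizedOpType::QDQAdd;
    if (op_type == "Mul") return QuantizedOpType::QDQMul;
  } else if (op_type == "QLinearAdd") {
    return QuantizedOpType::QLinearAdd;
  }
  return QuantizedOpType::Unknown;
}
```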
@@ -656,8 +656,10 @@ class BinaryOpBuilder : public BaseOpBuilder {
};

/* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
// TODO, add support for QDQ NodeUnit
return node_unit.OpType() == "QLinearAdd";
const auto quant_type = GetQuantizedOpType(node_unit);
return quant_type == QuantizedOpType::QLinearAdd ||
quant_type == QuantizedOpType::QDQAdd ||
quant_type == QuantizedOpType::QDQMul;
}

void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@@ -690,12 +692,12 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const

int32_t op_code;
bool add_activation = true;
bool op_is_qlinear = op_type == "QLinearAdd";
if (op_type == "Add" || op_is_qlinear) {
bool is_quant_op = IsQuantizedOp(node_unit);
if (op_type == "Add" || is_quant_op) { // Add/QLinearAdd/QDQAdd
op_code = ANEURALNETWORKS_ADD;
} else if (op_type == "Sub") {
op_code = ANEURALNETWORKS_SUB;
} else if (op_type == "Mul") {
} else if (op_type == "Mul" || is_quant_op) { // Mul/QDQMul
op_code = ANEURALNETWORKS_MUL;
} else if (op_type == "Div") {
op_code = ANEURALNETWORKS_DIV;
@@ -721,15 +723,15 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
b_zero_point = 0,
y_zero_point = 0;

if (op_is_qlinear) {
if (is_quant_op) {
ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(
model_builder.GetInitializerTensors(), node_unit,
a_scale, b_scale, y_scale,
a_zero_point, b_zero_point, y_zero_point));
}

// Verify that the scale and zero point from the ONNX input match those of the NNAPI input
if (op_is_qlinear) {
if (is_quant_op) {
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point));
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point));
}
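For context on the scale/zero-point verification in AddToModelBuilderImpl above: ONNX QDQ tensors and NNAPI asymmetric quantized uint8 tensors use the same affine mapping, which is why the scale and zero point read from the ONNX initializers must equal the values set on the corresponding NNAPI operands. A minimal illustration with hypothetical numbers (not taken from the PR):

```cpp
#include <cstdint>
#include <iostream>

// real_value = scale * (quantized_value - zero_point)
float Dequantize(uint8_t q, float scale, int32_t zero_point) {
  return scale * (static_cast<int32_t>(q) - zero_point);
}

int main() {
  // Hypothetical per-tensor parameters for one input of a QDQ Add/Mul.
  const float scale = 0.0078125f;  // 1/128
  const int32_t zero_point = 128;
  std::cout << Dequantize(200, scale, zero_point) << "\n";  // prints 0.5625
  return 0;
}
```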