diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 864fb8674ab..4350a51d860 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -31,6 +31,7 @@
 from neural_compressor.utils.utility import Statistics
 from neural_compressor.experimental.data.dataloaders.base_dataloader import BaseDataLoader
 from neural_compressor.conf.dotdict import deep_get
+from neural_compressor.adaptor.ox_utils.util import split_shared_bias
 import math
 
 onnx = LazyImport("onnx")
@@ -465,6 +466,7 @@ def _pre_optimize(self, model, level=1):
             if self.graph_optimization.gemm2matmul else tmp_model
         model.model = self._rename_node(model.model)
         model = self._revert_fusedconv(model)
+        model = split_shared_bias(model)
         model.topological_sort()
         self.pre_optimized_model = model
 
diff --git a/neural_compressor/adaptor/ox_utils/operators/split.py b/neural_compressor/adaptor/ox_utils/operators/split.py
index 34397561d3c..aa31f6d0e27 100644
--- a/neural_compressor/adaptor/ox_utils/operators/split.py
+++ b/neural_compressor/adaptor/ox_utils/operators/split.py
@@ -21,6 +21,28 @@
                                                      attribute_to_kwarg
 from .base_operator import QuantOperatorBase
 from neural_compressor.adaptor.ox_utils.util import QuantizedValue
+from .qdq_base_operator import QDQOperatorBase
+
+class QDQSplit(QDQOperatorBase):
+    '''
+    Operator class registered for 'Split' nodes in ox_utils/registry.py.
+    '''
+    def __init__(self, onnx_quantizer, onnx_node):
+        super().__init__(onnx_quantizer, onnx_node)
+
+    def quantize(self):
+        '''
+        Quantize input 0 of the node and, when allowed, its outputs.
+        The node is renamed with a "_quant" suffix afterwards.
+        '''
+        node = self.node
+        self.quantizer.quantize_inputs(node, [0])
+        # Outputs are skipped only when output Q/DQ is disabled AND the
+        # quantizer runs in 'qdq' mode. The mode string must be compared:
+        # comparing the quantizer object itself to 'qdq' is always unequal.
+        if not self.disable_qdq_for_node_output or self.quantizer.mode != 'qdq':
+            self.quantizer.quantize_outputs(node, direct_int8=True)
+        node.name = node.name + "_quant"
 
 class QSplit(QuantOperatorBase):
     def __init__(self, onnx_quantizer, onnx_node):
@@ -30,7 +52,7 @@ def convert(self):
         node = self.node
         parent = self.quantizer.model.get_parents(node)[0]
         children = self.quantizer.model.get_children(node)
-        if len(children) == 0:
+        if parent.op_type != 'DequantizeLinear' or len(children) == 0:
             return
         kwargs = {}
         for attribute in node.attribute:
diff --git a/neural_compressor/adaptor/ox_utils/quantizer.py b/neural_compressor/adaptor/ox_utils/quantizer.py
index b2ef4c69e5e..20edc6f7b32 100644
--- a/neural_compressor/adaptor/ox_utils/quantizer.py
+++ b/neural_compressor/adaptor/ox_utils/quantizer.py
@@ -442,6 +442,14 @@ def quantize_inputs(self, node, indices=None,
                         [tensor_name + "_dequantized"])
                     self.replace_input.append([node, tensor_name, dequant_node.output[0]])
                     self.new_nodes.extend([qlinear_node, dequant_node])
+                    # record this weight's quantized names; an existing entry is kept
+                    quantized_value = QuantizedValue(weight.name, q_weight_name,
+                                                     scale_name,
+                                                     zp_name,
+                                                     QuantizedValueType.Initializer,
+                                                     None, dtype)
+                    if weight.name not in self.quantized_value_map:
+                        self.quantized_value_map[weight.name] = quantized_value
                 else:
                     weight = self._get_quantized_weight(initializer, dtype, scheme)
                     self._update_weight(weight)
@@ -557,13 +565,12 @@ def quantize_bias_tensor(self, node):
                 [bias_name + '_dequantized'], bias_name + '_DequantizeLinear')
             self.new_nodes.append(dequant_node)
             self.replace_input.append([find_by_name(node.name, self.model.nodes()),
-                                       bias_name, dequant_node.output[0]])
+                                       bias_name, bias_name + '_dequantized'])
 
     def quantize_bias(self, bias_name, input_name, weight_name, new_node_list=[]):
         '''
         Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
         '''
-
         # get scale for weight
         weight_scale_initializer = find_by_name(weight_name + '_scale', self.model.initializer())
         weight_scale = self.tensor_proto_to_array(weight_scale_initializer)
@@ -624,8 +631,6 @@ def quantize_bias(self, bias_name, input_name, weight_name, new_node_list=[]):
                                          quantized_bias_zp_name,
                                          QuantizedValueType.Initializer,
                                          None, onnx_proto.TensorProto.INT32)
-        if bias_name not in self.quantized_value_map:
-            self.quantized_value_map[bias_name] = quantized_value
         return quantized_bias_name, quantized_value
 
     def _dynamic_quantize_bias(self, input_name, weight_scale_name, \
@@ -701,40 +706,50 @@ def quantize_weight_per_channel(self, weight_name, weight_qType, scheme, channel
         if initializer is None:
             raise ValueError("{} is not an initializer", weight_name)
 
-        weights = self.tensor_proto_to_array(initializer)
-        channel_count = weights.shape[channel_axis]
-        rmin_list = []
-        rmax_list = []
-        zero_point_list = []
-        scale_list = []
-        quantized_per_channel_data_list = []
-        for i in range(channel_count):
-            per_channel_data = weights.take(i, channel_axis)
-            rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(
-                per_channel_data.flatten().tolist(), _get_qrange_for_qType(weight_qType,
-                self.reduce_range), weight_qType, scheme)
-            rmin_list.append(rmin)
-            rmax_list.append(rmax)
-            zero_point_list.append(zero_point)
-            scale_list.append(scale)
-            quantized_per_channel_data_list.append(quantized_per_channel_data)
-
-        # combine per_channel_data into one
-        reshape_dims = list(weights.shape)  # deep copy
-        reshape_dims[channel_axis] = 1  # only one per channel for reshape
-        quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
-        for i in range(1, len(quantized_per_channel_data_list)):
-            channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
-            quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
-
-        weight = QuantizedInitializer(initializer.name, initializer, rmin_list, rmax_list,
-                                      zero_point_list, scale_list,
-                                      weights,
-                                      quantized_weights.flatten().tolist(),
-                                      channel_axis, weight_qType)
-
-        self._update_weight(weight)
-        return (weight.name + "_quantized", weight.name + "_zero_point", weight.name + "_scale")
+        # quantize only once per initializer; later calls reuse the recorded names
+        if initializer.name not in self.quantized_value_map:
+            weights = self.tensor_proto_to_array(initializer)
+            channel_count = weights.shape[channel_axis]
+            rmin_list = []
+            rmax_list = []
+            zero_point_list = []
+            scale_list = []
+            quantized_per_channel_data_list = []
+            for i in range(channel_count):
+                per_channel_data = weights.take(i, channel_axis)
+                rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(
+                    per_channel_data.flatten().tolist(), _get_qrange_for_qType(weight_qType,
+                    self.reduce_range), weight_qType, scheme)
+                rmin_list.append(rmin)
+                rmax_list.append(rmax)
+                zero_point_list.append(zero_point)
+                scale_list.append(scale)
+                quantized_per_channel_data_list.append(quantized_per_channel_data)
+
+            # combine per_channel_data into one
+            reshape_dims = list(weights.shape)  # deep copy
+            reshape_dims[channel_axis] = 1  # only one per channel for reshape
+            quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
+            for i in range(1, len(quantized_per_channel_data_list)):
+                channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
+                quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
+
+            weight = QuantizedInitializer(initializer.name, initializer, rmin_list, rmax_list,
+                                          zero_point_list, scale_list,
+                                          weights,
+                                          quantized_weights.flatten().tolist(),
+                                          channel_axis, weight_qType)
+
+            self._update_weight(weight)
+            quantized_value = QuantizedValue(weight.name, weight.name + "_quantized",
+                                             weight.name + "_scale",
+                                             weight.name + "_zero_point",
+                                             QuantizedValueType.Initializer,
+                                             None, weight_qType)
+            self.quantized_value_map[weight.name] = quantized_value
+
+        return (initializer.name + "_quantized", initializer.name + "_zero_point",
+                initializer.name + "_scale")
 
     def _update_weight(self, weight):
         '''
diff --git a/neural_compressor/adaptor/ox_utils/registry.py b/neural_compressor/adaptor/ox_utils/registry.py
index 241bcc8d6ce..da14051279a 100644
--- a/neural_compressor/adaptor/ox_utils/registry.py
+++ b/neural_compressor/adaptor/ox_utils/registry.py
@@ -28,7 +28,7 @@
 from .operators.maxpool import QMaxPool, QDQMaxPool
 from .operators.gavgpool import QGlobalAveragePool
 from .operators.lstm import LSTMQuant, QDQLSTM
-from .operators.split import QSplit
+from .operators.split import QSplit, QDQSplit
 from .operators.concat import QLinearConcat, QDQConcat
 from .operators.pad import QPad, QDQPad
 from .operators.pooling import QLinearPool, QDQPool
@@ -92,7 +92,7 @@
     "AveragePool": QDQPool,
     "Unsqueeze" : QDQDirect8BitOp,
     "Concat": QDQConcat,
-    "Split": QDQDirect8BitOp
+    "Split": QDQSplit
 }
 
 CastRegistry = {
diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py
index 609ee11c812..1da59059749 100644
--- a/neural_compressor/adaptor/ox_utils/util.py
+++ b/neural_compressor/adaptor/ox_utils/util.py
@@ -60,6 +60,45 @@ def _get_qrange_for_qType(qType, reduce_range=False):
     else:
         raise ValueError('unsupported quantization data type')
 
+def split_shared_bias(model):
+    '''
+    Give Conv/FusedConv nodes that share a bias initializer their own copy of it.
+
+    For every initializer consumed by more than one node, each consumer after the
+    first that is a Conv/FusedConv using it as input[2] is re-pointed to a new
+    initializer named '<bias name>_nc_split_<node name>'.
+
+    Args:
+        model: model wrapper to process; it is modified in place.
+
+    Returns:
+        The same model object.
+    '''
+    # collect the initializer names once instead of rebuilding the list per input
+    initializer_names = set(i.name for i in model.model.graph.initializer)
+    for input_name, node_list in model.input_name_to_nodes.items():
+        if len(node_list) <= 1 or input_name not in initializer_names:
+            continue
+        initializer = model.get_initializer(input_name)
+        for node in node_list[1:]:
+            if node.op_type not in ['Conv', 'FusedConv']:
+                continue
+            # a Conv without bias has only two inputs, so check the length before
+            # reading input[2]
+            if len(node.input) > 2 and node.input[2] == input_name:
+                new_input_name = node.input[2] + '_nc_split_' + node.name
+                # NOTE(review): only raw_data is copied; confirm the bias is never
+                # stored in the typed fields (e.g. float_data) of the TensorProto
+                new_input = helper.make_tensor(
+                    new_input_name,
+                    initializer.data_type,
+                    initializer.dims,
+                    initializer.raw_data,
+                    True)
+                model.add_initializer(new_input)
+                node.input[2] = new_input_name
+    return model
+
 def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4): # pragma: no cover
     '''
     Convert float32 numpy array to float16 without changing sign or finiteness.