Fix quant issue caused by tensors having same name (#1618)
Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 authored Feb 27, 2024
1 parent 14b7b0a · commit 0a20f30
Showing 3 changed files with 127 additions and 53 deletions.
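For readers skimming the diff: the underlying bug is that the quantizer named Q/DQ nodes and quantized initializers after the consuming tensor, so an initializer consumed by several nodes (for example a bias that also feeds Add and Div nodes) could produce colliding names, or be removed while another node still referenced it. The fix suffixes the initializer name with the quantization dtype whenever get_initializer_share_num reports more than one consumer, and only removes the original float bias when it is not shared. A minimal sketch of the naming rule in plain Python (the standalone function, the sample names, and the dtype value 3 are illustrative, not part of the patch):

# Illustrative sketch of the disambiguation rule used throughout quantizer.py.
# `share_num` stands in for self.model.get_initializer_share_num(weight_name);
# 3 is onnx.TensorProto.INT8 and is only an example value.
def disambiguate(weight_name: str, weight_qType: int, share_num: int) -> str:
    # A shared initializer gets a per-dtype suffix so each quantized copy, and
    # the QuantizeLinear/DequantizeLinear nodes built from it, are uniquely named.
    return "_".join([weight_name, str(weight_qType)]) if share_num > 1 else weight_name

print(disambiguate("conv_bias", 3, share_num=3))    # -> conv_bias_3
print(disambiguate("conv_weight", 3, share_num=1))  # -> conv_weight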
127 changes: 76 additions & 51 deletions neural_compressor/adaptor/ox_utils/quantizer.py
@@ -570,20 +570,21 @@ def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True,
if self.add_qdq_pair_to_weight and self.mode == "qdq":
weight = self._get_quantized_weight(initializer, dtype, scheme)
self._update_weight(weight)
node.input[idx] = weight.name
q_weight_name = weight.name + "_quantized"
zp_name = weight.name + "_zero_point"
scale_name = weight.name + "_scale"
qlinear_node = make_quant_node(
tensor_name + "_QuantizeLinear",
weight.name + "_QuantizeLinear",
[tensor_name, scale_name, zp_name],
[tensor_name + "_quantized"],
[weight.name + "_quantized"],
)
dequant_node = make_dquant_node(
tensor_name + "_DequantizeLinear",
[tensor_name + "_quantized", scale_name, zp_name],
[tensor_name + "_dequantized"],
weight.name + "_DequantizeLinear",
[weight.name + "_quantized", scale_name, zp_name],
[weight.name + "_dequantized"],
)
self.replace_input.append([node, tensor_name, dequant_node.output[0]])
self.replace_input.append([node, weight.name, dequant_node.output[0]])
self.new_nodes.extend([qlinear_node, dequant_node])
quantized_value = QuantizedValue(
weight.name, q_weight_name, scale_name, zp_name, QuantizedValueType.Initializer, None, dtype
@@ -593,17 +594,18 @@ def quantize_inputs(self, node, indices=None, initializer_use_weight_qType=True,
else:
weight = self._get_quantized_weight(initializer, dtype, scheme)
self._update_weight(weight)
node.input[idx] = weight.name
q_weight_name = weight.name + "_quantized"
zp_name = weight.name + "_zero_point"
scale_name = weight.name + "_scale"

inputs = [q_weight_name, scale_name, zp_name]
output_name = tensor_name + "_DequantizeLinear"
dequant_node = onnx.helper.make_node(
"DequantizeLinear", inputs, [tensor_name + "_dequantized"], tensor_name + "_DequantizeLinear"
"DequantizeLinear", inputs, [weight.name + "_dequantized"], weight.name + "_DequantizeLinear"
)
self.new_nodes.append(dequant_node)
self.replace_input.append([node, tensor_name, dequant_node.output[0]])
self.replace_input.append([node, weight.name, dequant_node.output[0]])
quantized_value = QuantizedValue(
weight.name, q_weight_name, scale_name, zp_name, QuantizedValueType.Initializer, None, dtype
)
@@ -721,7 +723,8 @@ def quantize_bias_tensor(self, node):
if len(beta_attribute):
beta = onnx.helper.get_attribute_value(beta_attribute[0])
_, quant_value = self.quantize_bias(bias_name, input_name, weight_name, beta)
self.model.remove_initializer(find_by_name(bias_name, self.model.initializer()))
if self.model.get_initializer_share_num(bias_name) == 1:
self.model.remove_initializer(find_by_name(bias_name, self.model.initializer()))
inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
axis = None
if find_by_name(weight_name + "_DequantizeLinear", self.new_nodes):
@@ -855,79 +858,96 @@ def quantize_weights_per_channel(self, node, indices, weight_qType, scheme, axis
self.quantize_inputs(node, indices)
return

for idx, weight_name in enumerate(node.input):
for idx, inp in enumerate(node.input):
if idx not in indices:
continue

if self.add_qdq_pair_to_weight and self.mode == "qdq":
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, weight_qType, scheme, axis)
q_name, zp_name, scale_name = self.quantize_weight_per_channel(inp, weight_qType, scheme, axis)
weight_name = (
("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp
)
qlinear_node = make_quant_node(
weight_name + "_QuantizeLinear", [weight_name, scale_name, zp_name], [weight_name + "_quantized"]
weight_name + "_QuantizeLinear",
[inp, scale_name, zp_name],
[q_name],
axis,
)
dequant_node = make_dquant_node(
weight_name + "_DequantizeLinear",
[weight_name + "_quantized", scale_name, zp_name],
[q_name, scale_name, zp_name],
[weight_name + "_dequantized"],
axis,
)
node.input[idx] = weight_name
self.replace_input.append([node, weight_name, dequant_node.output[0]])
self.new_nodes.extend([qlinear_node, dequant_node])
else:
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, weight_qType, scheme, axis)
inputs = [q_name, scale_name, zp_name]
q_name, zp_name, scale_name = self.quantize_weight_per_channel(inp, weight_qType, scheme, axis)
weight_name = (
("_").join([inp, str(weight_qType)]) if self.model.get_initializer_share_num(inp) > 1 else inp
)
dequant_node = make_dquant_node(
weight_name + "_DequantizeLinear",
[q_name, scale_name, zp_name],
[weight_name + "_dequantized"],
axis,
)
self.new_nodes.append(dequant_node)
node.input[idx] = weight_name

# Replace weight_name with output of DequantizeLinear
self.replace_input.append([node, weight_name, dequant_node.output[0]])

def quantize_weight_per_channel(self, weight_name, weight_qType, scheme, channel_axis):
"""Quantize weight per-channel."""
name = (
("_").join([weight_name, str(weight_qType)])
if self.model.get_initializer_share_num(weight_name) > 1
else weight_name
)
if name in self.quantized_value_map:
return (name + "_quantized", name + "_zero_point", name + "_scale")

initializer = find_by_name(weight_name, self.model.initializer())
if initializer is None:
raise ValueError("{} is not an initializer", weight_name)

if initializer.name not in self.quantized_value_map:
weights = (
self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path))
if self.model.model_path is not None
else self.tensor_proto_to_array(initializer)
)
rmin, rmax, zero_point, scale, quantized_weights = quantize_data_per_channel(
weights, channel_axis, _get_qrange_for_qType(weight_qType, self.reduce_range), weight_qType, scheme
)
weights = (
self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path))
if self.model.model_path is not None
else self.tensor_proto_to_array(initializer)
)
rmin, rmax, zero_point, scale, quantized_weights = quantize_data_per_channel(
weights, channel_axis, _get_qrange_for_qType(weight_qType, self.reduce_range), weight_qType, scheme
)

weight = QuantizedInitializer(
initializer.name,
initializer,
rmin,
rmax,
zero_point,
scale,
weights,
quantized_weights.flatten().tolist(),
channel_axis,
weight_qType,
)
weight = QuantizedInitializer(
name,
initializer,
rmin,
rmax,
zero_point,
scale,
weights,
quantized_weights.flatten().tolist(),
channel_axis,
weight_qType,
)

self._update_weight(weight)
quantized_value = QuantizedValue(
weight.name,
weight.name + "_quantized",
weight.name + "_scale",
weight.name + "_zero_point",
QuantizedValueType.Initializer,
None,
weight_qType,
)
self.quantized_value_map[weight.name] = quantized_value
self._update_weight(weight)
quantized_value = QuantizedValue(
weight.name,
weight.name + "_quantized",
weight.name + "_scale",
weight.name + "_zero_point",
QuantizedValueType.Initializer,
None,
weight_qType,
)
self.quantized_value_map[weight.name] = quantized_value

return (initializer.name + "_quantized", initializer.name + "_zero_point", initializer.name + "_scale")
return (weight.name + "_quantized", weight.name + "_zero_point", weight.name + "_scale")

def _update_weight(self, weight):
"""Update weight.
@@ -1018,8 +1038,13 @@ def _get_quantization_params(self, param_name):

def _get_quantized_weight(self, initializer, qType, scheme):
"""Get quantized weight."""
if initializer.name in self.quantized_value_map:
return self.quantized_value_map[initializer.name]
name = (
("_").join([initializer.name, str(qType)])
if self.model.get_initializer_share_num(initializer.name) > 1
else initializer.name
)
if name in self.quantized_value_map:
return self.quantized_value_map[name]
weights_data = (
self.tensor_proto_to_array(initializer, os.path.dirname(self.model.model_path))
if self.model.model_path is not None
@@ -1029,7 +1054,7 @@ def _get_quantized_weight(self, initializer, qType, scheme):
weights_data.flatten().tolist(), _get_qrange_for_qType(qType, self.reduce_range), qType, scheme
)
weight = QuantizedInitializer(
initializer.name,
name,
initializer,
[rmin],
[rmax],
7 changes: 5 additions & 2 deletions neural_compressor/adaptor/ox_utils/util.py
@@ -132,9 +132,12 @@ class QuantType(Enum): # pragma: no cover
QUInt8 = 1


def make_quant_node(name, inputs, outputs):
def make_quant_node(name, inputs, outputs, axis=None):
"""Make a QuantizeLinear node."""
return helper.make_node("QuantizeLinear", inputs, outputs, name)
if axis is not None:
return helper.make_node("QuantizeLinear", inputs, outputs, name, axis=axis)
else:
return helper.make_node("QuantizeLinear", inputs, outputs, name)


def make_dquant_node(name, inputs, outputs, axis=None):
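To illustrate the extended helper: with the optional axis, quantizer.py can now emit a per-channel QuantizeLinear/DequantizeLinear pair keyed on the disambiguated weight name, roughly as quantize_weights_per_channel does above. A sketch under assumed names and values (inp, the dtype 3, share_num, and axis 0 are all illustrative; the axis attribute on QuantizeLinear requires opset 13 or newer):

from neural_compressor.adaptor.ox_utils.util import make_dquant_node, make_quant_node

inp = "conv_weight"   # original (possibly shared) initializer name -- assumed
weight_qType = 3      # onnx.TensorProto.INT8, illustrative
share_num = 2         # pretend two nodes consume this initializer

# Same suffixing rule as in quantizer.py: only shared initializers are renamed.
weight_name = "_".join([inp, str(weight_qType)]) if share_num > 1 else inp
q_name = weight_name + "_quantized"
scale_name = weight_name + "_scale"
zp_name = weight_name + "_zero_point"

# QuantizeLinear now carries the per-channel axis attribute as well.
qlinear_node = make_quant_node(
    weight_name + "_QuantizeLinear", [inp, scale_name, zp_name], [q_name], axis=0
)
dequant_node = make_dquant_node(
    weight_name + "_DequantizeLinear", [q_name, scale_name, zp_name], [weight_name + "_dequantized"], 0
)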
46 changes: 46 additions & 0 deletions test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
@@ -668,6 +668,33 @@ def build_gemm_model():
return model


def build_model_share_init():
initializers = []
input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 224, 224])
conv_weight_initializer = numpy_helper.from_array(
np.random.randint(-1, 2, [222, 3, 3, 3]).astype(np.float32), name="conv_weight"
)
conv_bias_initializer = numpy_helper.from_array(np.random.randint(1, 2, [222]).astype(np.float32), name="conv_bias")
conv_node = helper.make_node("Conv", ["input", "conv_weight", "conv_bias"], ["conv_output"], name="conv")

add_node = helper.make_node("Add", ["conv_bias", "conv_output"], ["add_output"], name="add")

div_node = helper.make_node("Div", ["add_output", "conv_bias"], ["div_output"], name="div")

output = helper.make_tensor_value_info("div_output", TensorProto.FLOAT, [1, 222, 222, 222])
initializers = [conv_weight_initializer, conv_bias_initializer]
graph = helper.make_graph(
[conv_node, add_node, div_node],
"test",
[input],
[output],
initializer=initializers,
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
model.ir_version = 7
return model


def build_benchmark():
seq = """
from neural_compressor.experimental import Benchmark
@@ -864,6 +891,7 @@ def setUpClass(self):
self.gemm_model = build_gemm_model()
self.conv_model2 = build_conv_model2()
self.conv_model3 = build_conv_model3()
self.shared_init_model = build_model_share_init()
export_onnx_nlp_model(self.distilbert_model, self.distilbert_export_path, 14)
export_onnx_nlp_model(self.albert_model, self.albert_export_path, 14)
self.distilbert_model = onnx.load(self.distilbert_export_path)
@@ -1809,6 +1837,24 @@ def test_cuda_ep_env_set(self):
# check TENSORRT is not loaded if backend is not onnxrt_trt_ep
self.assertEqual(os.environ.get("ORT_TENSORRT_UNAVAILABLE"), "1")

def test_model_share_init(self):
config = PostTrainingQuantConfig(approach="static")
q_model = quantization.fit(self.shared_init_model, config, calib_dataloader=self.cv_dataloader)
self.assertNotEqual(q_model, None)
ort.InferenceSession(q_model.model.SerializeToString(), providers=ort.get_available_providers())

config = PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit(self.shared_init_model, config, calib_dataloader=self.cv_dataloader)
self.assertNotEqual(q_model, None)
ort.InferenceSession(q_model.model.SerializeToString(), providers=ort.get_available_providers())

config = PostTrainingQuantConfig(
approach="static", quant_format="QDQ", recipes={"add_qdq_pair_to_weight": True}
)
q_model = quantization.fit(self.shared_init_model, config, calib_dataloader=self.cv_dataloader)
self.assertNotEqual(q_model, None)
ort.InferenceSession(q_model.model.SerializeToString(), providers=ort.get_available_providers())


if __name__ == "__main__":
unittest.main()
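Beyond loading the quantized model into an InferenceSession, a quick way to confirm the fix took effect is to assert that no duplicate node names remain after quantizing the shared-initializer model. A short sketch, assuming q_model is the object returned by quantization.fit in test_model_share_init above:

import collections

# q_model.model is the quantized onnx ModelProto (the test serializes it via
# SerializeToString()); every node name should now be unique even though
# conv_bias feeds three different nodes.
node_names = [node.name for node in q_model.model.graph.node]
duplicates = [name for name, count in collections.Counter(node_names).items() if count > 1]
assert not duplicates, f"duplicate node names after quantization: {duplicates}"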
