From ca8d4459d44c55817b61178b4b573eb645b6c4ec Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Wed, 31 Jan 2024 10:38:01 -0800
Subject: [PATCH] Add contrib Q/DQ ops to symbolic shape inference tool
 (#19340)

### Description
Adds type/shape inferencing support for MSFT domain QuantizeLinear and
DequantizeLinear operators to symbolic_shape_infer.py


### Motivation and Context
Need a way to infer the types and shapes of Q/DQ ops in models that use
the MSFT domain versions (e.g., int16 quantization).
---
 .../python/tools/symbolic_shape_infer.py      |  27 +++
 ...untime_test_python_symbolic_shape_infer.py | 202 ++++++++++++++++++
 2 files changed, 229 insertions(+)

diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py
index ef4c4ae906243..9823e8264e17b 100755
--- a/onnxruntime/python/tools/symbolic_shape_infer.py
+++ b/onnxruntime/python/tools/symbolic_shape_infer.py
@@ -197,6 +197,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
             "BiasGelu": self._infer_BiasGelu,
             "BiasSplitGelu": self._infer_BiasSplitGelu,
             "DecoderMaskedMultiHeadAttention": self._infer_DecoderMaskedMultiHeadAttention,
+            "DequantizeLinear": self._infer_DequantizeLinear,
             "EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
             "FastGelu": self._infer_FastGelu,
             "GatedRelativePositionBias": self._infer_GatedRelativePositionBias,
@@ -212,6 +213,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
             "PackedAttention": self._infer_PackedAttention,
             "PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention,
             "PythonOp": self._infer_PythonOp,
+            "QuantizeLinear": self._infer_QuantizeLinear,
             "QuickGelu": self._infer_FastGelu,
             "RelativePositionBias": self._infer_RelativePositionBias,
             "RemovePadding": self._infer_RemovePadding,
@@ -457,6 +459,8 @@ def _onnx_infer_single_node(self, node):
             "GemmFastGelu",
             "LayerNormalization",
             "LongformerAttention",
+            "DequantizeLinear",
+            "QuantizeLinear",
             "RelativePositionBias",
             "RemovePadding",
             "RestorePadding",
@@ -979,6 +983,29 @@ def _infer_NhwcConv(self, node):  # noqa: N802
             )
         )
 
+    def _infer_DequantizeLinear(self, node):  # noqa: N802
+        # Get the output data type from the scale input (index 1, required).
+        output_dtype = self.known_vi_[node.input[1]].type.tensor_type.elem_type
+
+        # Get the output shape from the first input.
+        output_shape = self._get_shape(node, 0)
+
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))
+
+    def _infer_QuantizeLinear(self, node):  # noqa: N802
+        # Get the output data type from the zero-point input (index 2, optional).
+        # Otherwise, default to uint8
+        output_dtype = onnx.TensorProto.UINT8
+        if len(node.input) > 2 and node.input[2]:
+            output_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type
+
+        # Get the output shape from the first input.
+        output_shape = self._get_shape(node, 0)
+
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))
+
     def _infer_Einsum(self, node):  # noqa: N802
         # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275
         equation = get_attribute(node, "equation")
diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
index 67db411ddc246..eca1430448e8e 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
@@ -392,6 +392,208 @@ def test_div_precision(self):
         self.assertEqual(len(output_dims), 1)
         self.assertEqual(output_dims[0].dim_value, 512)
 
+    def test_quantize_linear(self):
+        """
+        Test ONNX QuantizeLinear op.
+        Check that the output shape is propagated from the first input and that the output data
+        type comes from the zero-point input.
+        """
+        initializers = [
+            helper.make_tensor(
+                "scale",
+                TensorProto.FLOAT,
+                [],
+                [1.0],
+            ),
+            helper.make_tensor(
+                "zero_point",
+                TensorProto.INT8,
+                [],
+                [16],
+            ),
+        ]
+
+        nodes = [
+            helper.make_node(
+                "QuantizeLinear",
+                inputs=[
+                    "input_f32",
+                    "scale",
+                    "zero_point",
+                ],
+                outputs=["output_s8"],
+            ),
+        ]
+
+        inputs = [
+            helper.make_tensor_value_info("input_f32", TensorProto.FLOAT, ["b", 2, 3, 4]),
+        ]
+
+        outputs = [
+            helper.make_tensor_value_info("output_s8", TensorProto.UNDEFINED, None),
+        ]
+
+        graph = helper.make_graph(nodes, "QuantizeLinear_Test", inputs, outputs, initializers)
+        model = helper.make_model(graph)
+
+        inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
+
+        expected_shapes = [
+            helper.make_tensor_value_info("output_s8", TensorProto.INT8, ["b", 2, 3, 4]),
+        ]
+        self._check_shapes(graph, inferred.graph, expected_shapes)
+
+    def test_quantize_linear_ms_domain(self):
+        """
+        Test QuantizeLinear op ('com.microsoft' domain).
+        Check that the output shape is propagated from the first input and that the output data
+        type comes from the zero-point input.
+        """
+        initializers = [
+            helper.make_tensor(
+                "scale",
+                TensorProto.FLOAT,
+                [],
+                [1.0],
+            ),
+            helper.make_tensor(
+                "zero_point",
+                TensorProto.UINT16,
+                [],
+                [16],
+            ),
+        ]
+
+        nodes = [
+            helper.make_node(
+                "QuantizeLinear",
+                inputs=[
+                    "input_f32",
+                    "scale",
+                    "zero_point",
+                ],
+                outputs=["output_u16"],
+                domain="com.microsoft",
+            ),
+        ]
+
+        inputs = [
+            helper.make_tensor_value_info("input_f32", TensorProto.FLOAT, ["b", 2, 3, 4]),
+        ]
+
+        outputs = [
+            helper.make_tensor_value_info("output_u16", TensorProto.UNDEFINED, None),
+        ]
+
+        graph = helper.make_graph(nodes, "QuantizeLinear_MSDomain_Test", inputs, outputs, initializers)
+        model = helper.make_model(graph)
+
+        inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
+
+        expected_shapes = [
+            helper.make_tensor_value_info("output_u16", TensorProto.UINT16, ["b", 2, 3, 4]),
+        ]
+        self._check_shapes(graph, inferred.graph, expected_shapes)
+
+    def test_quantize_linear_no_zp_input(self):
+        """
+        Test QuantizeLinear op ('com.microsoft' domain).
+        Check that the output shape is propagated from the first input.
+        The zero-point input is missing, so the output data type should default to uint8.
+        """
+        initializers = [
+            helper.make_tensor(
+                "scale",
+                TensorProto.FLOAT,
+                [],
+                [1.0],
+            ),
+        ]
+
+        nodes = [
+            helper.make_node(
+                "QuantizeLinear",
+                inputs=[
+                    "input_f32",
+                    "scale",
+                ],
+                outputs=["output_u8"],
+                domain="com.microsoft",
+            ),
+        ]
+
+        inputs = [
+            helper.make_tensor_value_info("input_f32", TensorProto.FLOAT, ["b", 2, 3, 4]),
+        ]
+
+        outputs = [
+            helper.make_tensor_value_info("output_u8", TensorProto.UNDEFINED, None),
+        ]
+
+        graph = helper.make_graph(nodes, "QuantizeLinear_NoZP_Test", inputs, outputs, initializers)
+        model = helper.make_model(graph)
+
+        inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
+
+        # Check that the output shape is propagated from the first input and that the
+        # output data type comes from the zero-point input.
+        expected_shapes = [
+            helper.make_tensor_value_info("output_u8", TensorProto.UINT8, ["b", 2, 3, 4]),
+        ]
+        self._check_shapes(graph, inferred.graph, expected_shapes)
+
+    def test_dequantize_linear_ms_domain(self):
+        """
+        Test DequantizeLinear operator ('com.microsoft' domain).
+        Check that the output shape is propagated from the first input and that the output data
+        type comes from the scale input.
+        """
+        initializers = [
+            helper.make_tensor(
+                "scale",
+                TensorProto.FLOAT,
+                [],
+                [1.0],
+            ),
+            helper.make_tensor(
+                "zero_point",
+                TensorProto.UINT16,
+                [],
+                [16],
+            ),
+        ]
+
+        nodes = [
+            helper.make_node(
+                "DequantizeLinear",
+                inputs=[
+                    "input_u16",
+                    "scale",
+                    "zero_point",
+                ],
+                outputs=["output_f32"],
+                domain="com.microsoft",
+            ),
+        ]
+
+        inputs = [
+            helper.make_tensor_value_info("input_u16", TensorProto.UINT16, ["b", 2, 3, 4]),
+        ]
+
+        outputs = [
+            helper.make_tensor_value_info("output_f32", TensorProto.UNDEFINED, None),
+        ]
+
+        graph = helper.make_graph(nodes, "DequantizeLinear_MSDomain_Test", inputs, outputs, initializers)
+        model = helper.make_model(graph)
+
+        inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True)
+
+        expected_shapes = [
+            helper.make_tensor_value_info("output_f32", TensorProto.FLOAT, ["b", 2, 3, 4]),
+        ]
+        self._check_shapes(graph, inferred.graph, expected_shapes)
+
 
 class TestSymbolicShapeInferenceForSlice(unittest.TestCase):
     def check_slice_of_concat(self, input_dims, start, end, step, expected_output_dim):