From 9b529388bf3a6589e2a25cd4c6391c11d63b2b93 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 21 Sep 2022 18:56:40 +0800 Subject: [PATCH] Fix transformer_lt_mlperf accuracy drop issue (#1248) --- .../tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py index fd025995737..6c1f4ea8a6b 100644 --- a/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py +++ b/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_matmul.py @@ -32,6 +32,17 @@ def __init__(self, **kwargs): self.sorted_patterns = sorted(self.patterns, key=lambda i: len(i), reverse=True) + # TODO Remove this when TFDO supports output_quantization_mode 'MIN_FIRST' + # Root cause of the transformer_lt_mlperf model accuracy drop: + # MatMul + Relu fusion ==> the output quantization mode can only be set to 'SCALED', + # if the input_quantization_mode of the next _QuantizedMatMul is set to 'MIN_FIRST'. + # The mismatch will cause the accuracy drop. + if not self.performance_only: + if ['Dequantize', 'MatMul', 'Relu', 'QuantizeV2'] in self.sorted_patterns: + self.sorted_patterns.remove(['Dequantize', 'MatMul', 'Relu', 'QuantizeV2']) + if ['Dequantize', 'MatMul', 'BiasAdd', 'Relu', 'QuantizeV2'] in self.sorted_patterns: + self.sorted_patterns.remove(['Dequantize', 'MatMul', 'BiasAdd', 'Relu', 'QuantizeV2']) + self.exclude_matmul_nodes = [] self.fusion_op_type = set(fusion[1] for fusion in self.patterns) self.fusion_mapping = {