
Support SmoothQuant for ORT static quantization #16288

Merged: 26 commits, Jul 27, 2023
Commits:
ab3d43f: Support SmoothQuant (mengniwang95, Jun 6, 2023)
8eb3520: add ut and dependence (mengniwang95, Jun 7, 2023)
c1ccdd5: fix python format (mengniwang95, Jun 8, 2023)
7b5e7f9: fix python format (mengniwang95, Jun 8, 2023)
e385a30: Fix dependency and model (mengniwang95, Jun 14, 2023)
5094bb4: fix python format (mengniwang95, Jun 14, 2023)
636ffd5: fix python format (mengniwang95, Jun 15, 2023)
13adeab: enhance ut (mengniwang95, Jun 20, 2023)
d7bc884: update requirements (mengniwang95, Jul 10, 2023)
ebced60: Update ThirdPartyNotices.txt (mengniwang95, Jul 10, 2023)
4a3da03: Update requirements.txt (mengniwang95, Jul 15, 2023)
0c5e242: Update requirements.txt (mengniwang95, Jul 15, 2023)
9af6db2: Update test_quantize_static.py (mengniwang95, Jul 15, 2023)
4aa01d1: Update test_quantize_static.py (mengniwang95, Jul 15, 2023)
8aa2886: Merge pull request #1 from microsoft/main (mengniwang95, Jul 17, 2023)
a4e1d92: Update quantize.py (mengniwang95, Jul 17, 2023)
25fcc9a: Update quantize.py (mengniwang95, Jul 17, 2023)
93fc0f6: Update quantize.py (mengniwang95, Jul 17, 2023)
d5a30c7: Update quantize.py (mengniwang95, Jul 17, 2023)
b76cc62: Merge pull request #2 from microsoft/main (mengniwang95, Jul 20, 2023)
b2d7a07: Update Dockerfile.arm64 (mengniwang95, Jul 20, 2023)
1677511: Update Dockerfile.arm64 (mengniwang95, Jul 20, 2023)
cec5086: Update Dockerfile.arm64 (mengniwang95, Jul 22, 2023)
ec1ab87: Update requirements.txt (mengniwang95, Jul 22, 2023)
56bb3e3: Merge pull request #3 from microsoft/main (mengniwang95, Jul 25, 2023)
20307c2: Update test_quantize_static.py (mengniwang95, Jul 25, 2023)
50 changes: 50 additions & 0 deletions onnxruntime/python/tools/quantization/quantize.py
@@ -144,6 +144,16 @@ def __init__(
a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert
any quantization nodes associated with biases.
This extra option is only effective when quant_format is QuantFormat.QDQ.
SmoothQuant = True/False :
Default is False. If enabled, the SmoothQuant algorithm will be applied before quantization to do
fake input-channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
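The SmoothQuantAlpha option described above balances quantization difficulty between activations and weights. A minimal sketch of the underlying identity in plain Python (the numbers and channel values are illustrative, not from this PR):

```python
# SmoothQuant rests on the identity Y = X @ W = (X / s) @ (s * W), applied
# per input channel j with s_j = max|X_j|**alpha / max|W_j|**(1 - alpha).
def smooth_scales(x_absmax, w_absmax, alpha=0.5):
    # alpha plays the role of the SmoothQuantAlpha option.
    return [xa**alpha / wa**(1 - alpha) for xa, wa in zip(x_absmax, w_absmax)]

# A channel with a large activation outlier (100.0) gets scale 10.0 at
# alpha=0.5, shrinking the activation range and growing the weight instead.
scales = smooth_scales([100.0, 1.0], [1.0, 1.0], alpha=0.5)

# The layer output is unchanged by the rescaling (one output column shown):
x = [100.0, 1.0]
w = [0.5, 2.0]
y = sum(a * b for a, b in zip(x, w))
x_s = [a / s for a, s in zip(x, scales)]  # smoothed activations
w_s = [b * s for b, s in zip(w, scales)]  # scale folded into weights
y_s = sum(a * b for a, b in zip(x_s, w_s))  # same output as y
```

A larger alpha pushes more of the outlier's range into the weights, which is why the docstring suggests raising it for models with more significant activation outliers.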
@@ -330,6 +340,16 @@ def quantize_static(
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
SmoothQuant = True/False :
Default is False. If enabled, the SmoothQuant algorithm will be applied before quantization to do
fake input-channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
"""

extra_options = extra_options or {}
@@ -362,6 +382,36 @@ def quantize_static(
        key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
    }

    if extra_options.get("SmoothQuant", False):
        import importlib

        try:
            importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
        except Exception as e:
            logging.error(f"{e}.")
            raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e

        import copy

        from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant

        from .quant_utils import save_and_reload_model

        def inc_dataloader():
            data_reader = copy.deepcopy(calibration_data_reader)
            for data in data_reader:
                yield data, None

        orig_nodes = [i.name for i in model.graph.node]
        dataloader = inc_dataloader()
        sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
        del dataloader
        model = sq.transform(
            extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)
        ).model
        nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes])
        model = save_and_reload_model(model)

    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
        calibrator = create_calibrator(
            model,
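The name-diff bookkeeping in the hunk above keeps the Mul ops that SmoothQuant inserts in floating point. A toy illustration of that logic, with hypothetical node names:

```python
# Nodes present only after the SmoothQuant transform (the inserted Mul ops)
# are appended to nodes_to_exclude so the quantizer leaves them in float.
orig_nodes = ["conv_0", "matmul_0"]  # names before the transform (hypothetical)
post_sq_nodes = ["conv_0", "sq_mul_0", "matmul_0"]  # "sq_mul_0" inserted by SmoothQuant
nodes_to_exclude = []
nodes_to_exclude.extend(n for n in post_sq_nodes if n not in orig_nodes)
```

This matters when SmoothQuantFolding cannot fold a scale into the previous op: the leftover Mul would otherwise be quantized like any other node.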
13 changes: 13 additions & 0 deletions onnxruntime/test/python/quantization/test_quantize_static.py
@@ -98,6 +98,19 @@ def test_static_quant_config(self):
        check_model_correctness(self, self._model_fp32_path, quant_model_path, data_reader.get_next())
        data_reader.rewind()

    def test_smooth_quant(self):
        data_reader = InputFeedsNegOneZeroOne(10, {"input": [1, self._channel_size, 1, 3]})
        quant_config = StaticQuantConfig(data_reader, extra_options={"SmoothQuant": True})
        quant_model_path = str(Path(self._tmp_model_dir.name) / "quant.config.onnx")
        quantize(self._model_fp32_path, quant_model_path, quant_config)

        data_reader.rewind()
        check_model_correctness(self, self._model_fp32_path, quant_model_path, data_reader.get_next())
        data_reader.rewind()

        model = onnx.load(quant_model_path)
        self.assertIn("Mul", [i.op_type for i in model.graph.node])


if __name__ == "__main__":
    unittest.main()
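The test above drives everything through extra_options; the SmoothQuant knobs resolve with dict.get against the defaults documented in quantize.py, which can be sketched as (option values here are illustrative):

```python
# Resolution of the SmoothQuant knobs from extra_options, mirroring the
# documented defaults: SmoothQuant=False, SmoothQuantAlpha=0.5,
# SmoothQuantFolding=True.
extra_options = {"SmoothQuant": True, "SmoothQuantAlpha": 0.8}

enabled = extra_options.get("SmoothQuant", False)  # True: explicitly enabled
alpha = extra_options.get("SmoothQuantAlpha", 0.5)  # 0.8: user override
folding = extra_options.get("SmoothQuantFolding", True)  # True: default applies
```

Unset keys silently fall back to their defaults, so a config that only sets "SmoothQuant": True gets alpha 0.5 and folding enabled.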
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -18,3 +18,4 @@ scipy
sympy
wheel
setuptools>=41.4.0
neural-compressor