From c1fb2e47798358a13ad571d8127195526e211b7b Mon Sep 17 00:00:00 2001 From: violetch24 Date: Wed, 17 Jul 2024 23:06:18 -0700 Subject: [PATCH 1/4] add docstring for static quant and smooth quant Signed-off-by: violetch24 --- .../algorithms/smooth_quant/save_load.py | 2 +- .../algorithms/smooth_quant/smooth_quant.py | 58 +++++++++++++------ .../algorithms/static_quant/save_load.py | 14 +++++ .../algorithms/static_quant/static_quant.py | 22 +++---- .../torch/quantization/config.py | 39 ++++++++++++- 5 files changed, 103 insertions(+), 32 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/save_load.py b/neural_compressor/torch/algorithms/smooth_quant/save_load.py index 7c5613a15bb..505c2c6e407 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/save_load.py +++ b/neural_compressor/torch/algorithms/smooth_quant/save_load.py @@ -32,7 +32,7 @@ def recover_model_from_json(model, json_file_path, example_inputs): # pragma: n example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. Returns: - (object): quantized model + model (object): quantized model """ from torch.ao.quantization.observer import MinMaxObserver diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index fdfb51640ac..9212a5ee4bd 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -61,9 +61,9 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): """Prepares a given model for quantization. Args: - model: A float model to be quantized. - example_inputs: Used to trace torch model. - inplace: Whether to carry out model transformations in-place. Defaults to True. + model (torch.nn.Module): raw fp32 model or prepared model. + example_inputs (tensor/tuple/dict): used to trace torch model. + inplace (bool, optional): whether to carry out model transformations in-place. Defaults to True. Returns: A prepared model. @@ -128,9 +128,9 @@ def convert(self, model, example_inputs, inplace=True, *args, **kwargs): """Converts a prepared model to a quantized model. Args: - model: The prepared model to be converted. - example_inputs: Used to trace torch model. - inplace: Whether to carry out model transformations in-place. Defaults to True. + model (QuantizationInterceptionModule): the prepared model to be converted. + example_inputs (tensor/tuple/dict): used to trace torch model. + inplace (bool, optional): whether to carry out model transformations in-place. Defaults to True. Returns: A quantized model. @@ -153,14 +153,14 @@ def convert(self, model, example_inputs, inplace=True, *args, **kwargs): return model def quantize(self, model, tune_cfg, run_fn, example_inputs, inplace=True, *args, **kwargs): - """Execute the quantize process on the specified model. + """Executes the quantize process on the specified model. Args: - model: a float model to be quantized. - tune_cfg: quantization config for ops. - run_fn: a calibration function for calibrating the model. - example_inputs: used to trace torch model. - inplace: whether to carry out model transformations in-place. + model (torch.nn.Module): raw fp32 model or prepared model. + tune_cfg (OrderedDict): quantization config for ops. + run_fn (Callable): a calibration function for calibrating the model. + example_inputs (tensor/tuple/dict): used to trace torch model. 
+            inplace (bool, optional): whether to carry out model transformations in-place. Defaults to True.
 
         Returns:
             A quantized model.
@@ -255,6 +255,22 @@ def quantize(self, model, tune_cfg, run_fn, example_inputs, inplace=True, *args,
 def qdq_quantize(
     model, tune_cfg, run_fn, example_inputs, inplace, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, sq
 ):
+    """Executes the smooth quantize process.
+
+    Args:
+        model (torch.nn.Module): raw fp32 model or prepared model.
+        tune_cfg (OrderedDict): quantization config for ops.
+        run_fn (Callable): a calibration function for calibrating the model.
+        example_inputs (tensor/tuple/dict): used to trace torch model.
+        inplace (bool): whether to carry out model transformations in-place.
+        cfgs (dict): the input configs.
+        op_infos_from_cfgs (dict): op infos retrieved from configs.
+        output_tensor_id_op_name (dict): dictionary of output tensor op names.
+        sq (TorchSmoothQuant): TorchSmoothQuant instance containing sq infos.
+
+    Returns:
+        A quantized model.
+    """
     smoothquant_scale_info = sq.sq_scale_info
     sq_minmax_init = True if tune_cfg.get("act_algo", "kl") == "minmax" else False
@@ -325,6 +341,14 @@ def qdq_quantize(
 def _apply_pre_optimization(model, tune_cfg, sq, recover=False):
+    """Retrieves sq info and absorbs the scale into the layer at the output channel.
+
+    Args:
+        model (QuantizationInterceptionModule): a prepared model.
+        tune_cfg (OrderedDict): quantization config for ops.
+        sq (TorchSmoothQuant): TorchSmoothQuant instance containing sq infos.
+        recover (bool, optional): whether to recover the scale. Defaults to False.
+    """
     sq_max_info = {}
     if sq.record_max_info:
         sq_max_info = sq.max_value_info
@@ -354,13 +378,13 @@ def _apply_pre_optimization(model, tune_cfg, sq, recover=False):
 def _ipex_post_quant_process(model, example_inputs, use_bf16, inplace=False):
-    """Convert to a jit model.
+    """Converts to a jit model.
 
     Args:
-        model: a prepared model.
-        example_inputs: used to trace torch model.
-        use_bf16: whether to use bf16 for mixed precision.
-        inplace: whether to carry out model transformations in-place.
+        model (QuantizationInterceptionModule): a prepared model.
+        example_inputs (tensor/tuple/dict): used to trace torch model.
+        use_bf16 (bool): whether to use bf16 for mixed precision.
+        inplace (bool, optional): whether to carry out model transformations in-place. Defaults to False.
 
     Returns:
         A converted jit model.
diff --git a/neural_compressor/torch/algorithms/static_quant/save_load.py b/neural_compressor/torch/algorithms/static_quant/save_load.py
index 9a7808c17eb..27b5c3b9a4b 100644
--- a/neural_compressor/torch/algorithms/static_quant/save_load.py
+++ b/neural_compressor/torch/algorithms/static_quant/save_load.py
@@ -27,6 +27,12 @@ def save(model, output_dir="./saved_results"):
+    """Saves the quantized model to the output path.
+
+    Args:
+        model (RecursiveScriptModule): the quantized model to be saved.
+        output_dir (str, optional): output path to save the quantized model.
+    """
     if not os.path.exists(output_dir):
         os.mkdir(output_dir)
 
@@ -48,6 +54,14 @@ def save(model, output_dir="./saved_results"):
 def load(output_dir="./saved_results"):
+    """Loads the quantized model from the output path.
+
+    Args:
+        output_dir (str, optional): output path to load the quantized model.
+
+    Returns:
+        A quantized model.
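+
+    Example (an illustrative sketch; the path assumes a model previously saved with ``save``):
+
+        from neural_compressor.torch.algorithms.static_quant.save_load import load
+
+        quantized_model = load("./saved_results")  # a frozen TorchScript module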
+    """
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     model = torch.jit.load(qmodel_file_path)
     model = torch.jit.freeze(model.eval())
diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py
index 08dc5a1035f..bfeb96f409f 100644
--- a/neural_compressor/torch/algorithms/static_quant/static_quant.py
+++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py
@@ -64,9 +64,9 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs):
         """Prepares a given model for quantization.
 
         Args:
-            model: A float model to be quantized.
-            example_inputs: Used to trace torch model.
-            inplace: Whether to carry out model transformations in-place. Defaults to True.
+            model (torch.nn.Module): raw fp32 model or prepared model.
+            example_inputs (tensor/tuple/dict): used to trace torch model.
+            inplace (bool, optional): whether to carry out model transformations in-place. Defaults to True.
 
         Returns:
             A prepared model.
@@ -134,9 +134,9 @@ def convert(self, model, example_inputs, inplace=True, *args, **kwargs):
         """Converts a prepared model to a quantized model.
 
         Args:
-            model: The prepared model to be converted.
-            example_inputs: Used to trace torch model.
-            inplace: Whether to carry out model transformations in-place. Defaults to True.
+            model (QuantizationInterceptionModule): the prepared model to be converted.
+            example_inputs (tensor/tuple/dict): used to trace torch model.
+            inplace (bool, optional): whether to carry out model transformations in-place. Defaults to True.
 
         Returns:
             A quantized model.
@@ -170,13 +170,13 @@ def convert(self, model, example_inputs, inplace=True, *args, **kwargs):
 
 
 def _ipex_post_quant_process(model, example_inputs, use_bf16, inplace=False):
-    """Convert to a jit model.
+    """Converts to a jit model.
 
     Args:
-        model: a prepared model.
-        example_inputs: used to trace torch model.
-        use_bf16: whether to use bf16 for mixed precision.
-        inplace: whether to carry out model transformations in-place.
+        model (QuantizationInterceptionModule): a prepared model.
+        example_inputs (tensor/tuple/dict): used to trace torch model.
+        use_bf16 (bool): whether to use bf16 for mixed precision.
+        inplace (bool, optional): whether to carry out model transformations in-place. Defaults to False.
 
     Returns:
         A converted jit model.
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 2c43f1e59c1..be0b3226c66 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -1097,7 +1097,19 @@ def __init__(
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
         model_info: Optional[List[Tuple[str, Callable]]] = None,
     ):
-        """Init Static Quant Configs."""
+        """Init StaticQuant Config.
+
+        Args:
+            w_dtype (str): Data type for weights, default is "int8".
+            w_sym (bool): Whether to use symmetric quantization for weights, default is True.
+            w_granularity (str): Level of quantization granularity for weights, default is "per_channel".
+            w_algo (str): Quantization algorithm used to compute parameters for weights, default is "minmax".
+            act_dtype (str): Data type for activations, default is "uint8".
+            act_sym (bool): Whether to use symmetric quantization for activations, default is False.
+            act_granularity (str): Level of quantization granularity for activations, default is "per_tensor".
+            act_algo (str): Quantization algorithm used to compute parameters for activations, default is "minmax".
+            excluded_precisions (list): Precisions to be excluded, default is an empty list.
+        """
         super().__init__(white_list=white_list)
         self.w_dtype = w_dtype
         self.w_sym = w_sym
@@ -1205,7 +1217,7 @@ def __init__(
         act_dtype: str = "uint8",
         act_sym: bool = False,
         act_granularity: str = "per_tensor",
-        act_algo: str = "kl",
+        act_algo: str = "minmax",
         excluded_precisions: list = [],
         alpha: float = 0.5,
         folding: bool = False,
@@ -1220,7 +1232,28 @@ def __init__(
         auto_alpha_args: dict = None,
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
     ):
-        """Init SmoothQuant Configs."""
+        """Init SmoothQuant Config.
+
+        Args:
+            w_dtype (str): Data type for weights, default is "int8".
+            w_sym (bool): Whether to use symmetric quantization for weights, default is True.
+            w_granularity (str): Level of quantization granularity for weights, default is "per_channel".
+            w_algo (str): Quantization algorithm used to compute parameters for weights, default is "minmax".
+            act_dtype (str): Data type for activations, default is "uint8".
+            act_sym (bool): Whether to use symmetric quantization for activations, default is False.
+            act_granularity (str): Level of quantization granularity for activations, default is "per_tensor".
+            act_algo (str): Quantization algorithm used to compute parameters for activations, default is "minmax".
+            excluded_precisions (list): Precisions to be excluded, default is an empty list.
+            alpha (float): Value to balance input and weight quantization error, between 0 and 1, default is 0.5.
+            folding (bool): Whether to fold mul into the previous layer, default is False.
+            scale_sharing (bool): Whether to share the same scale for layers with the same input, default is False.
+            init_alpha (float): Value to get baseline quantization error for auto-tuning, default is 0.5.
+            alpha_min (float): Min value of the auto-tuning alpha search space, default is 0.0.
+            alpha_max (float): Max value of the auto-tuning alpha search space, default is 1.0.
+            alpha_step (float): Step size of the auto-tuning alpha search space, default is 0.1.
+            shared_criterion (str): Criterion for the input LayerNorm op of a transformer block, default is "max".
+            do_blockwise (bool): Whether to enable block-wise auto-tuning, default is False.
+        """
         super().__init__(white_list=white_list)
         self.w_dtype = w_dtype
         self.w_sym = w_sym
 
From efe25e137c3bde4ff8acff3f863113279c3a493c Mon Sep 17 00:00:00 2001
From: violetch24
Date: Thu, 18 Jul 2024 01:24:55 -0700
Subject: [PATCH 2/4] format fix

Signed-off-by: violetch24
---
 .../torch/algorithms/smooth_quant/__init__.py |  2 ++
 .../algorithms/smooth_quant/save_load.py      |  2 ++
 .../algorithms/smooth_quant/smooth_quant.py   |  8 +++--
 .../torch/algorithms/static_quant/__init__.py |  2 ++
 .../algorithms/static_quant/save_load.py      |  2 ++
 .../algorithms/static_quant/static_quant.py   |  4 +++
 .../torch/algorithms/static_quant/utility.py  | 29 +++++++++++++++++--
 7 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/neural_compressor/torch/algorithms/smooth_quant/__init__.py b/neural_compressor/torch/algorithms/smooth_quant/__init__.py
index bb420d9b673..7fb415d78ec 100644
--- a/neural_compressor/torch/algorithms/smooth_quant/__init__.py
+++ b/neural_compressor/torch/algorithms/smooth_quant/__init__.py
@@ -12,6 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""The SmoothQuant-related modules.""" + from .utility import * from .smooth_quant import SmoothQuantQuantizer diff --git a/neural_compressor/torch/algorithms/smooth_quant/save_load.py b/neural_compressor/torch/algorithms/smooth_quant/save_load.py index 505c2c6e407..6c469688321 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/save_load.py +++ b/neural_compressor/torch/algorithms/smooth_quant/save_load.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Save and load the quantized model.""" + # pylint:disable=import-error import torch diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index 9212a5ee4bd..ec3d528e087 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -14,6 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""The quantizer using SmoothQuant path.""" + import json import os @@ -263,9 +265,9 @@ def qdq_quantize( run_fn (Callable): a calibration function for calibrating the model. example_inputs (tensor/tuple/dict): used to trace torch model. inplace (bool): whether to carry out model transformations in-place. Defaults to True. - cfgs (dict): the input configs. - op_infos_from_cfgs (dict): op infos retrieved from configs. - output_tensor_id_op_name (dict): dictionary of output tensor op names. + cfgs (dict): configs loaded from ipex config path. + op_infos_from_cfgs (dict): dict containing configs that have been parsed for each op. + output_tensor_id_op_name (dict): dict containing op names corresponding to 'op_infos_from_cfgs'. sq (TorchSmoothQuant): TorchSmoothQuant class containing sq infos. Returns: diff --git a/neural_compressor/torch/algorithms/static_quant/__init__.py b/neural_compressor/torch/algorithms/static_quant/__init__.py index 46b4583fbba..965240ebbf1 100644 --- a/neural_compressor/torch/algorithms/static_quant/__init__.py +++ b/neural_compressor/torch/algorithms/static_quant/__init__.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""The StaticQuant-related modules.""" + from .utility import * from .static_quant import StaticQuantQuantizer diff --git a/neural_compressor/torch/algorithms/static_quant/save_load.py b/neural_compressor/torch/algorithms/static_quant/save_load.py index 27b5c3b9a4b..013e9c7eb51 100644 --- a/neural_compressor/torch/algorithms/static_quant/save_load.py +++ b/neural_compressor/torch/algorithms/static_quant/save_load.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+"""Save and load the quantized model.""" + # pylint:disable=import-error import json diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index bfeb96f409f..e713a1a97d6 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -14,6 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""The quantizer using StaticQuant path.""" + import json import os @@ -50,6 +52,8 @@ class StaticQuantQuantizer(Quantizer): + """The quantizer using Static Quant.""" + def __init__(self, quant_config: OrderedDict = {}): """Init a StaticQuantQuantizer object. diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index f4930a22ddd..a89bad3a91f 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Utility functions for Static quantization.""" + import copy import json @@ -70,6 +72,17 @@ def cfg_to_qconfig(tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name): # pragma: no cover + """Updates json file in ipex_config_path. + + Args: + tune_cfg (dict): dictionary of quantization configuration. + cfgs (dict): configs loaded from ipex config path. + op_infos_from_cfgs (dict): dict containing configs that have been parsed for each op. + output_tensor_ids_op_name (dict): dict containing op names corresponding to 'op_infos_from_cfgs'. + + Returns: + user_cfg (dict): quantization configuration for ops. + """ assert cfgs is not None, "No configure for IPEX int8 model..." op_infos = copy.deepcopy(op_infos_from_cfgs) cfgs, user_cfg = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name) @@ -164,6 +177,14 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ def generate_xpu_qconfig(tune_cfg): # pragma: no cover + """Generates qconfig for quantiztaion on xpu device. + + Args: + tune_cfg (dict): dictionary of quantization configuration. + + Returns: + qconfig (dict): quantization configuration for ops. + """ # qconfig observer & config constants for ipex-xpu from torch.ao.quantization import HistogramObserver, MinMaxObserver, QConfig @@ -305,9 +326,12 @@ def get_quantizable_ops_recursively(model, example_inputs): # pragma: no cover Args: model (object): input model example_inputs (dict|list|tuple|torch.Tensor): used to trace torch model. + Returns: quantizable_ops (list): list of tuples of op_name and op_type. - cfgs (dict): dict of configuration + cfgs (dict): dict of configuration. + op_infos_from_cfgs (dict): dict containing configs that have been parsed for each op. + output_tensor_ids_op_name (dict): dict containing op names corresponding to 'op_infos_from_cfgs'. """ quantizable_ops = [] op_name_info = [] @@ -438,6 +462,7 @@ def dump_model_op_stats(user_cfg): Args: user_cfg (dict): quantization config + Returns: None """ @@ -493,7 +518,6 @@ def parse_cfgs(cfgs): # pragma: no cover Args: cfgs (dict): the input configs. - Returns: ops_name (list): list of op names. 
tune_cfg (dict): dictionary of quantization configuration. @@ -625,7 +649,6 @@ def detect_block(self) -> Dict[str, List[List[str]]]: """Traverse the model definition and return the attention blocks and ffn blocks. Returns: - blocks: A dict include the detected attention blocks and ffn blocks. """ # Step 1: Traverse model definition and record the op position From 0b480bea96dd61955d303d45416eb31bd292ba0d Mon Sep 17 00:00:00 2001 From: violetch24 Date: Thu, 18 Jul 2024 21:04:05 -0700 Subject: [PATCH 3/4] update scan path Signed-off-by: violetch24 --- .../scripts/codeScan/pydocstyle/scan_path.txt | 2 + .../algorithms/smooth_quant/smooth_quant.py | 2 + .../torch/algorithms/smooth_quant/utility.py | 773 ++++++++++++++---- 3 files changed, 638 insertions(+), 139 deletions(-) diff --git a/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt b/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt index b524f1f61db..95c72f09fa0 100644 --- a/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt +++ b/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt @@ -15,3 +15,5 @@ /neural-compressor/neural_compressor/strategy /neural-compressor/neural_compressor/training.py /neural-compressor/neural_compressor/utils +/neural-compressor/neural_compressor/torch/algorithms/static_quant/ +/neural-compressor/neural_compressor/torch/algorithms/smooth_quant/ \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index ec3d528e087..ad8d552768c 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -51,6 +51,8 @@ class SmoothQuantQuantizer(Quantizer): + """SmoothQuantQuantizer Class.""" + def __init__(self, quant_config: OrderedDict = {}): # pragma: no cover """Init a SmoothQuantQuantizer object. diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index b8a5b9669ff..a5f6aadec65 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Utility functions for Smooth quantization.""" + import copy import json @@ -53,7 +55,9 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl Returns: quantizable_ops (list): list of tuples of op_name and op_type. - cfgs (dict): dict of configuration + cfgs (dict): dict of configuration. + op_infos_from_cfgs (dict): op infos from configs. + output_tensor_ids_op_name (dict): dictionary of output tensor op names. """ quantizable_ops = [] # group ops by position for transform-based model @@ -173,6 +177,9 @@ def check_cfg_and_qconfig( cfgs (dict): the input configs. op_infos_from_cfgs (dict): op infos from configs. output_tensor_ids_op_name (dict): dictionary of output tensor op names. + alpha (float): Value to balance input and weight quantization error, + between 0 and 1, default is 0.5. + smooth_quant (bool, optional): whether to use smooth quant. Returns: cfgs (dict). 
@@ -243,6 +250,20 @@ check_cfg_and_qconfig
 def cfg_to_qconfig(
     tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha=0.5, smooth_quant=True
 ):  # pragma: no cover
+    """Checks configs and updates the quantization configs.
+
+    Args:
+        tune_cfg (dict): quantization configuration for ops.
+        cfgs (dict): configs loaded from ipex config path.
+        op_infos_from_cfgs (dict): dict containing configs that have been parsed for each op.
+        output_tensor_id_op_name (dict): dict containing op names corresponding to 'op_infos_from_cfgs'.
+        alpha (float): Value to balance input and weight quantization error,
+            between 0 and 1, default is 0.5.
+        smooth_quant (bool, optional): whether to use smooth quant.
+
+    Returns:
+        cfgs (dict): updated configs.
+    """
     assert cfgs is not None, "No configure for IPEX int8 model..."
     op_infos = copy.deepcopy(op_infos_from_cfgs)
     cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, alpha, smooth_quant)
@@ -255,7 +276,8 @@ def dump_model_op_stats(user_cfg):
     """This is a function to dump quantizable ops of model to user.
 
     Args:
-        user_cfg (dict): quantization config
+        user_cfg (dict): quantization config.
+
     Returns:
         None
     """
@@ -305,6 +327,16 @@ def dump_model_op_stats(user_cfg):
 
 
 def get_parent(node, all_parents=False):  # pragma: no cover
+    """Get the parent node(s) of a given node.
+
+    Args:
+        node (Node): The node whose parent(s) are to be retrieved.
+        all_parents (bool, optional): Whether to return all parents or just the first one. Defaults to False.
+
+    Returns:
+        Node or list: The parent node if `all_parents` is False, otherwise a list of all parent nodes.
+            Returns None if no parents are found.
+    """
     if node.inputs() is None:
         return None
     elif len(list(node.inputs())) == 0:
@@ -403,6 +435,15 @@ def update_sq_scale(ipex_config_path, smoothquant_scale_info):  # pragma: no cov
 
 
 def enough_memo_store_scale(device, need_space):  # pragma: no cover
+    """Check if there is enough memory available to store a specified amount of data.
+
+    Args:
+        device (str): The device type ('cuda' for GPU or 'cpu' for CPU).
+        need_space (int): The amount of memory needed, in bytes.
+
+    Returns:
+        bool: True if there is enough memory available, False otherwise.
+    """
     if device == "cuda":  # pragma: no cover
         current_gpu_index = torch.cuda.current_device()
         total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory
@@ -416,6 +457,18 @@ def enough_memo_store_scale(device, need_space):  # pragma: no cover
 
 
 def move_input_to_device(input, device=torch.device("cpu")):  # pragma: no cover
+    """Move the input data to the specified device.
+
+    Args:
+        input (dict, list, tuple, or torch.Tensor): The input data to be moved.
+            Can be a dictionary, list, tuple, or a tensor.
+        device (torch.device, optional): The device to which the input should be moved.
+            Defaults to CPU.
+
+    Returns:
+        The input data moved to the specified device,
+        with the same type as the input (dict, list, tuple, or tensor).
+    """
     if isinstance(input, dict) or isinstance(input, UserDict):
         tmp_input = {}
         for k, inp in input.items():
@@ -433,6 +486,21 @@ def move_input_to_device(input, device=torch.device("cpu")):  # pragma: no cover
 
 
 def forward_wrapper(model, input, device=torch.device("cpu")):  # pragma: no cover
+    """Apply the model to the input data on the specified device.
+
+    Args:
+        model (torch.nn.Module): The model to be applied.
+        input (dict, list, tuple, or zip): The input data to be fed to the model.
+            Can be a dictionary, list, tuple, or a zip of arguments and keyword arguments.
+ device (torch.device, optional): The device on which the model and input should be located. + Defaults to CPU. + + Returns: + The output of the model after applying it to the input data. + + Raises: + Exception: Logs warnings if there are issues with moving the model or input to the device. + """ try: model = model.to(device) input = move_input_to_device(input, device) @@ -455,6 +523,21 @@ def forward_wrapper(model, input, device=torch.device("cpu")): # pragma: no cov def model_forward(model, dataloader, iters, device): # pragma: no cover + """Run the model on data from the dataloader for a specified number of iterations. + + Args: + model (torch.nn.Module): The model to be used for forward passes. + dataloader (DataLoader): The dataloader providing the input data and labels. + iters (int): The maximum number of iterations to run. + If -1, run until the dataloader is exhausted. + device (torch.device): The device on which the model and data are located. + + Returns: + None + + Raises: + Exception: Handles exceptions during the forward pass and retries if needed. + """ try: cnt = 0 for idx, (input, label) in enumerate(dataloader): @@ -472,6 +555,18 @@ def model_forward(model, dataloader, iters, device): # pragma: no cover def build_captured_dataloader(model, run_fn, calib_num=None): + """Build a dataloader that captures input data and keyword arguments used in forward passes of the model. + + Args: + model (torch.nn.Module): The model whose inputs will be captured. + run_fn (function): A function to run the model, which will use the InputCaptureModule to collect inputs. + calib_num (int, optional): The number of inputs to capture for calibration. If None, capture all inputs. + + Returns: + torch.nn.Module: The original model. + CapturedDataloader: A dataloader with the captured inputs and keyword arguments. + """ + class CapturedDataloader: def __init__(self, args_list, kwargs_list) -> None: self.args_list = args_list @@ -509,6 +604,18 @@ def forward(self, *args, **kwargs): def cal_scale(input_max_abs, weights, alpha, weight_max_lb=1e-5): # pragma: no cover + """Calculate the scaling factor for weights based on the input max values and weight magnitudes. + + Args: + input_max_abs (Tensor): The maximum absolute values of the inputs. + weights (list of Tensor): The list of weight tensors to be concatenated and processed. + alpha (float): A parameter to balance the scaling between inputs and weights. + weight_max_lb (float, optional): The lower bound for weight magnitudes to avoid division by zero. + Defaults to 1e-5. + + Returns: + Tensor: The calculated scaling factors for the weights. + """ weights = torch.cat(weights, dim=0) weight_max = torch.max(torch.abs(weights), dim=0)[0] weight_max = torch.clip(weight_max, weight_max_lb) @@ -521,6 +628,19 @@ def cal_scale(input_max_abs, weights, alpha, weight_max_lb=1e-5): # pragma: no def model_forward_per_sample(model, sample, device): # pragma: no cover + """Perform a forward pass of the model on a single sample. + + Args: + model (torch.nn.Module): The model to be applied. + sample (Tensor or tuple): The input sample or a tuple of inputs to be passed to the model. + device (torch.device): The device on which the model and input sample are located. + + Returns: + Tensor: The output of the model after applying it to the sample. + + Raises: + Exception: Handles exceptions during the forward pass and retries if needed. 
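+
+    Example (an illustrative sketch; the input shape is hypothetical):
+
+        sample = torch.randn(1, 3, 224, 224)
+        output = model_forward_per_sample(model, sample, torch.device("cpu"))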
+ """ try: output = forward_wrapper(model, sample, device) return output @@ -531,6 +651,22 @@ def model_forward_per_sample(model, sample, device): # pragma: no cover def quant_dequant_w_v1(m, num_bits=8, scheme="sym"): # pragma: no cover + """Quantize and dequantize the weights of a layer. + + Args: + m (torch.nn.Module): The layer whose weights are to be quantized and dequantized. + Supports torch.nn.Linear and torch.nn.Conv2d. + num_bits (int, optional): The number of bits for quantization. + Defaults to 8. + scheme (str, optional): The quantization scheme to use. + Can be "sym" for symmetric or "asym" for asymmetric quantization. Defaults to "sym". + + Returns: + Tensor: The quantized and dequantized weights of the layer. + + Raises: + Warning: Logs a warning if the layer type is not supported. + """ eps = torch.finfo(torch.float32).eps if isinstance(m, torch.nn.Linear): x = m.weight @@ -589,6 +725,22 @@ def quant_dequant_w_v1(m, num_bits=8, scheme="sym"): # pragma: no cover def quant_dequant_x_v1(x, min_x=None, max_x=None, num_bits=8): # pragma: no cover + """Quantize and dequantize a tensor. + + Args: + x (Tensor): The input tensor to be quantized and dequantized. + min_x (Tensor, optional): The minimum value of the input tensor. + If None, it will be computed from x. Defaults to None. + max_x (Tensor, optional): The maximum value of the input tensor. + If None, it will be computed from x. Defaults to None. + num_bits (int, optional): The number of bits for quantization. Defaults to 8. + + Returns: + Tensor: The quantized and dequantized tensor. + + Raises: + None: No specific exceptions are raised, but input values are clipped to avoid invalid operations. + """ eps = torch.finfo(torch.float32).eps q_min, q_max = 0, 2.0**num_bits - 1.0 if max_x is None or min_x is None: @@ -605,10 +757,15 @@ def quant_dequant_x_v1(x, min_x=None, max_x=None, num_bits=8): # pragma: no cov def reshape_scale_as_weight(layer, scale): # pragma: no cover - """Reshape the scale for weight input channel, depthwise output channel - :param layer: torch module - :param scale: orig scale - :return: reshaped scale.""" + """Reshape the scale for weight input channel, depthwise output channel. + + Args: + layer (torch.nn.Module): Torch module. + scale (Tensor): Original scale. + + Returns: + Tensor: Reshaped scale. + """ if hasattr(layer, "orig_layer"): layer = layer.orig_layer if isinstance(layer, torch.nn.Conv2d) and layer.groups > 1: ##only depthwise conv could hit here @@ -624,9 +781,14 @@ def reshape_scale_as_weight(layer, scale): # pragma: no cover def reshape_in_channel_to_last(layer_name, model): # pragma: no cover - """Move the input channel to the last dim - :param layer_name: Layer name - :return: The reshaped weight.""" + """Move the input channel to the last dimension. + + Args: + layer_name (str): Layer name. + + Returns: + Tensor: The reshaped weight. + """ layer = get_module(model, layer_name) if layer.__class__.__name__ == "WrapperLayer": layer = layer.orig_layer @@ -639,11 +801,14 @@ def reshape_in_channel_to_last(layer_name, model): # pragma: no cover def reshape_scale_as_input(layer, scale): # pragma: no cover - """Reshape the scale for input feature in channel - :param layer: + """Reshape the scale for input feature in channel. + + Args: + layer (torch.nn.Module): Torch module. + scale (Tensor): Original scale. - :param scale: - :return: + Returns: + Tensor: Reshaped scale. 
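+
+    Example (an illustrative sketch; the layer type and channel count are hypothetical):
+
+        linear = torch.nn.Linear(64, 32)
+        scale = torch.rand(64)  # one scale per input channel
+        reshaped = reshape_scale_as_input(linear, scale)  # broadcastable over the layer input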
""" if hasattr(layer, "orig_layer"): layer = layer.orig_layer @@ -660,9 +825,10 @@ def reshape_scale_as_input(layer, scale): # pragma: no cover def register_autotune(name): # pragma: no cover - """Class decorator to register a smoothquant auto-tune subclass. + """Class decorator to register a SmoothQuant auto-tune subclass. - :return: the class of register + Returns: + type: The class of register. """ def register(auto_tune): @@ -673,7 +839,17 @@ def register(auto_tune): class Calibration: # pragma: no cover + """Calibration class.""" + def __init__(self, model, dataloder=None, q_func=None, device="cpu"): + """Initialize the Calibration class. + + Args: + model (torch.nn.Module): The model to be calibrated. + dataloder (DataLoader, optional): DataLoader providing the calibration data. Defaults to None. + q_func (Callable, optional): A function for quantization. Defaults to None. + device (str, optional): The device to perform calibration on. Defaults to "cpu". + """ self.model = model self.dataloader = dataloder self.q_func = q_func @@ -681,9 +857,14 @@ def __init__(self, model, dataloder=None, q_func=None, device="cpu"): @torch.no_grad() def _save_input_pc_hook(self, name): - """A forward hook to save input max of a module - :param name: the module name - :return: A hook function.""" + """A forward hook to save input max of a module. + + Args: + name (str): The module name. + + Returns: + function: A hook function. + """ def save_input_hook(module, inputs, outputs): input = inputs[0] @@ -703,9 +884,13 @@ def save_input_hook(module, inputs, outputs): @torch.no_grad() def _add_min_max_observer(self, modules): - """ - :param modules: the modules which the observer will insert to - :return: + """Insert observers into the given modules. + + Args: + modules (list): The modules to which the observer will be inserted. + + Returns: + None """ self.hook_handles = [] for key in modules.keys(): @@ -715,17 +900,25 @@ def _add_min_max_observer(self, modules): @torch.no_grad() def _remove_observer(self): - """Remove the observer from the model - :return:""" + """Remove the observer from the model. + + Returns: + None + """ for hook_handle in self.hook_handles: hook_handle.remove() @torch.no_grad() def _dump_min_max(self, calib_iter=100): - """Dump min max per channel information, the min max value will be saved in input_maxes attribute - :param calibration_method: only support min_max currently - :param calib_iter: Sample size for calibration - :return:""" + """Dump min-max per channel information; the min-max values will be saved in the input_maxes attribute. + + Args: + calibration_method (str): Only supports 'min_max' currently. + calib_iter (int): Sample size for calibration. + + Returns: + None + """ logger.info("Calibrating...") if self.q_func: self.q_func(self.model) @@ -735,10 +928,15 @@ def _dump_min_max(self, calib_iter=100): @torch.no_grad() def calibrate(self, calib_iter, op_types=[torch.nn.Conv2d, torch.nn.Linear]): ##TODO transformers.conv1d - """ - :param absorb_to_layer: A dict,key is the absorb layer, val is a list of the to be smoothed layer - :param calib_iter: Data size for calibration - :return: A dict that saved the layer name and the channel-wise max value info + """Process the absorb layer and smooth layers, then return the channel-wise max value info. + + Args: + absorb_to_layer (dict): A dictionary where keys are absorb layers and values are lists + of layers to be smoothed. + calib_iter (int): Data size for calibration. 
+ + Returns: + dict: A dictionary containing the layer names and channel-wise max value information. """ ##hook all the module self.input_mins = {} @@ -757,7 +955,19 @@ def calibrate(self, calib_iter, op_types=[torch.nn.Conv2d, torch.nn.Linear]): # class GraphTrace: # pragma: no cover + """GraphTrace Class.""" + def __init__(self): + """Initialize the GraphTrace class with supported operations and layers. + + Attributes: + supported_torch_module_to_aten (dict): A mapping from PyTorch module names + to their corresponding ATen operation names. + skip_ops_to_find_absorb (list of str): A list of ATen operations that should + be skipped when searching for operations to absorb. + could_absorb_layers (list of str): A list of ATen operations that are eligible + for absorption during graph tracing. + """ self.supported_torch_module_to_aten = { "Linear": "aten::linear", "Conv2d": "aten::_convolution", @@ -786,6 +996,19 @@ def __init__(self): ] ##TODO,support more norm def trace(self, model, dummy_input): + """Trace and freeze a model using TorchScript, handling various input formats and devices. + + Args: + model (torch.nn.Module): The model to be traced and frozen. + dummy_input (Tensor, dict, or tuple): A dummy input or a dictionary of inputs + for tracing the model. + + Returns: + torch.jit.ScriptModule or None: The traced and frozen model, or None if tracing failed. + + Raises: + Exception: Logs warnings if tracing or freezing the model fails. + """ traced_model = None optimize_numerics = False orig_device = str(next(model.parameters()).device) @@ -816,6 +1039,17 @@ def trace(self, model, dummy_input): return traced_model def get_nodes(self, traced_model, op_types=["Linear"]): + """Extract nodes of specified types from a traced model's computation graph. + + Args: + traced_model (torch.jit.ScriptModule): The traced and frozen model. + op_types (list or str, optional): The types of operations to extract nodes for. + Defaults to ["Linear"]. + + Returns: + list of tuple: A list of tuples where each tuple contains a node + and its operation type. + """ if isinstance(op_types, str): op_types = [op_types] nodes = [] @@ -828,6 +1062,14 @@ def get_nodes(self, traced_model, op_types=["Linear"]): return nodes def get_prev_absorb_layer(self, nodes): + """Find previous layers that can be absorbed based on the given nodes. + + Args: + nodes (list): A list of nodes for which to find absorbable previous layers. + + Returns: + list: A list of previous layers that can be absorbed, or None if no suitable layer is found. + """ prev_absorb_layer = [] for node in nodes: parent = get_parent(node) @@ -856,6 +1098,14 @@ def get_prev_absorb_layer(self, nodes): return prev_absorb_layer def skip_op_absorb_helper(self, parent_node): + """Helper function to determine if a node should be skipped for absorption based on its outputs. + + Args: + parent_node (torch.jit.Node): The node to evaluate for absorption suitability. + + Returns: + bool: True if the node can be absorbed, False otherwise. + """ for val_user in list(parent_node.outputs())[0].uses(): next_node = val_user.user if next_node.kind() == "aten::size": @@ -871,6 +1121,14 @@ def skip_op_absorb_helper(self, parent_node): return True def mapping_torch_module_to_aten(self, op_types): + """Map specified torch module operation types to their corresponding ATen operation types. + + Args: + op_types (list of str): A list of operation types to be mapped from torch module to ATen. 
+ + Returns: + list: A list of unique ATen operation types corresponding to the provided torch module operation types. + """ res = [] for op in op_types: if op not in self.supported_torch_module_to_aten.keys(): @@ -881,10 +1139,13 @@ def mapping_torch_module_to_aten(self, op_types): return res def _check_valid_conv(self, module): - """Remove group conv except depthwise conv - :param module: + """Remove group convolution layers except depthwise convolution. + + Args: + module (torch.nn.Module): The module to process. - :return: + Returns: + None """ if not isinstance(module, torch.nn.Conv2d): return True @@ -896,6 +1157,19 @@ def _check_valid_conv(self, module): return True def get_absorb_to_layer(self, model, example_input, op_types, skip_unsupported_layers=True): + """Determine which layers in the model can be absorbed by other layers and map them accordingly. + + Args: + model (torch.nn.Module): The model to analyze for absorbable layers. + example_input (Tensor, dict, or tuple): Example input to trace the model. + op_types (list of str): List of operation types to be considered for absorption. + skip_unsupported_layers (bool, optional): Whether to exclude layers that are not supported. + Defaults to True. + + Returns: + absorb_to_layer (dict): A dictionary mapping absorbable layer names to the layers they can absorb. + no_absorb_layers (list): A list of layer names that could not be absorbed. + """ traced_model = self.trace(model, example_input) if traced_model is None: return None, None @@ -924,6 +1198,17 @@ def get_absorb_to_layer(self, model, example_input, op_types, skip_unsupported_l return absorb_to_layer, no_absorb_layers def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers): + """Filter out unsupported layers from the absorb-to-layer mapping based on model's layer types. + + Args: + model (torch.nn.Module): The model containing layers to be checked. + absorb_to_layer (dict): A dictionary mapping absorbable layer names to layers they can absorb. + no_absorb_layers (list): A list to collect names of layers that cannot be absorbed. + + Returns: + dict: A dictionary with only the supported layers, mapping absorbable layer names + to valid layers they can absorb. + """ res = {} for key in absorb_to_layer.keys(): absorb_layer = get_module(model, key) @@ -946,6 +1231,8 @@ def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers): @register_autotune("version1") class AutoAlpha: # pragma: no cover + """AutoAlpha Class.""" + def __init__( self, model, @@ -966,7 +1253,6 @@ def __init__( n_samples=32, ): """Initialize the AutoAlpha tuner with necessary parameters and components.""" - self.model = model.to("cpu") self.model.eval() self.dataloader = dataloader @@ -991,8 +1277,11 @@ def __init__( self.device = device def tune(self): - """The main entry of auto_alpha - :return: Optimal alpha values and scales based on user-defined recipes.""" + """The main entry of auto_alpha. + + Returns: + tuple: Optimal alpha values and scales based on user-defined recipes. + """ calib = Calibration(self.model, self.dataloader, self.q_func, self.device) calib_iter = 100 self.input_mins, self.input_maxes = calib.calibrate(calib_iter, self.op_types) @@ -1047,9 +1336,13 @@ def get_blocks(self): return block_names def _add_blockwise_observer(self, block_modules): - """ - :param block_modules: the block modules which the observer will insert to - :return: + """Insert observers into the block modules. 
+ + Args: + block_modules (list): The block modules to which the observer will be inserted. + + Returns: + None """ self.blockwise_hook_handles = [] for key in block_modules.keys(): @@ -1058,9 +1351,14 @@ def _add_blockwise_observer(self, block_modules): self.blockwise_hook_handles.append(hook_handle) def _save_blockwise_hook(self, name): - """A forward hook to save inputs/outputs of a block - :param name: the block name - :return: A hook function.""" + """A forward hook to save inputs and outputs of a block. + + Args: + name (str): The block name. + + Returns: + function: A hook function. + """ def save_blockwise_hook(module, inputs, outputs): self.block_inputs[name] = inputs[0] @@ -1102,8 +1400,11 @@ def _change_qdq_for_auto(self, enable=True): module.disable_quant() def _qdq_model_wrapper_for_auto(self, save_q_input=False): - """Wrapper all the module with qdq - :return:""" + """Wrap all the modules with QDQ (Quantize-Dequantize) operations. + + Returns: + None + """ module_names = self._get_all_hook_module_names() self.to_unwrap_module_names = module_names for name in module_names: @@ -1114,8 +1415,11 @@ def _qdq_model_wrapper_for_auto(self, save_q_input=False): set_module(self.model, name, new_module) def _qdq_model_unwrapper_for_auto(self): - """Unwrapper all the module with qdq - :return:""" + """Unwrap all the modules from QDQ (Quantize-Dequantize) operations. + + Returns: + None + """ module_names = self.to_unwrap_module_names for name in module_names: module = get_module(self.model, name) @@ -1124,11 +1428,16 @@ def _qdq_model_unwrapper_for_auto(self): set_module(self.model, name, module.orig_layer) def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): - """Cal the adjust scales - :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer - :param input_maxes: The channel-wise input max info for layers - :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float of a dict - :return:""" + """Calculate the adjustment scales. + + Args: + absorb_to_layer (dict): A dictionary mapping absorb layers to smooth quantized layers. + input_maxes (dict): The channel-wise input max information for layers. + alpha (float or dict): Alpha value to balance the quantization difficulty of activation and weight. + + Returns: + dict: A dictionary containing the calculated adjustment scales. + """ absorb_to_input_maxes = {} for key in absorb_to_layer.keys(): layer_name = absorb_to_layer[key][0] @@ -1173,12 +1482,17 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): return absorb_scales_info, weight_scales_info def _get_auto_loss(self, output, output_q, loss_type="abs", loss_alpha=1.0): - """Get the loss for auto tuning - :param output: Fp32 output for one layer - :param output_q: Quant output for one layer - :param loss_type: The type of loss - :param loss_alpha: Loss alpha i for mean scale error - :return: A tensor of the loss.""" + """Get the loss for auto-tuning. + + Args: + output (Tensor): FP32 output for one layer. + output_q (Tensor): Quantized output for one layer. + loss_type (str): The type of loss. + loss_alpha (float): Loss alpha value for mean scale error. + + Returns: + Tensor: A tensor containing the calculated loss. 
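+
+        Example (an illustrative sketch with made-up values):
+
+            fp32_out = torch.tensor([[1.0, 2.0]])
+            int8_out = torch.tensor([[1.1, 1.9]])
+            loss = self._get_auto_loss(fp32_out, int8_out)  # uses the default "abs" loss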
+ """ if len(output.shape) <= 2: max_value = torch.max(torch.abs(output)) else: @@ -1194,8 +1508,11 @@ def _get_auto_loss(self, output, output_q, loss_type="abs", loss_alpha=1.0): return torch.sum((output - output_q) ** 2) def _get_sq_layer_names(self): - """Get all the layers that could be smooth quanted - :return: All the sq layer names.""" + """Get all the layers that could be smooth quantized. + + Returns: + list: All the smooth quantization layer names. + """ ##TODO this may not fit for folding=False module_names = [] for key in self.absorb_to_layer: @@ -1203,9 +1520,10 @@ def _get_sq_layer_names(self): return module_names def _get_best_alpha(self, absorb_to_layer, loss_alphas, shared_criterion): - """Obtain the optimal alpha values based on shared criterion and loss values recorded in auto-tuning step. + """Obtain the optimal alpha values based on shared criteria and loss values recorded in the auto-tuning step. - :return: A dict of layerwise alpha values. + Returns: + dict: A dictionary of layerwise alpha values. """ def dict_to_list(dic): @@ -1250,7 +1568,8 @@ def dict_to_list(dic): def _get_one_batch_auto_loss(self, input, alpha_space, orig_best_alpha, input_maxes): """Calculate the losses for all alpha values given an input. - :return: A dict of op-wise loss values with respect to alpha values. + Returns: + dict: A dictionary of operation-wise loss values with respect to alpha values. """ self._change_qdq_for_auto(enable=False) module_names = self._get_sq_layer_names() @@ -1300,7 +1619,8 @@ def _get_one_batch_auto_loss(self, input, alpha_space, orig_best_alpha, input_ma def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha, input_maxes): """Calculate the losses for all alpha values given an input in blockwise tuning mode. - :return: A dict of blockwise-wise loss values with respect to alpha values. + Returns: + dict: A dictionary of blockwise loss values with respect to alpha values. """ self._change_qdq_for_auto(enable=False) module_names = self._get_sq_layer_names() @@ -1374,9 +1694,10 @@ def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha return loss_alphas def opwise_rank(self, loss_alphas, best_alphas): - """Rank the final losses of ops based on their ratio with respect to op output norm. + """Rank the final losses of operations based on their ratio with respect to operation output norm. - :return: + Returns: + dict: A dictionary of ranked operations with their loss ratios. """ max_op, max_ratio, max_key = "", 0, "" ratio_info = {} @@ -1410,7 +1731,8 @@ def opwise_rank(self, loss_alphas, best_alphas): def default_tune_setup(self): """Setup default auto-tune settings. - :return: A dict of op-wise loss values with respect to alpha values. + Returns: + dict: A dictionary of operation-wise loss values with respect to alpha values. """ round_num = max( # Initialize the alpha search space len(str(self.alpha_min).split(".")[1]), @@ -1574,12 +1896,13 @@ def _auto_tune_alpha_blockwise(self): class TorchSmoothQuant: # pragma: no cover - """Fake input channel quantization, for more details please refer to + """Fake input channel quantization. + + For more details please refer to: [1] SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models [2] SPIQ: Data-Free Per-Channel Static Input Quantization Currently, we only handle the layers whose smooth scale could be absorbed, we will support other layers later. 
-
     We only support inplace mode which means the model weights will be changed, you can call recover function to
     recover the weights if needed
     """
@@ -1594,10 +1917,16 @@ def __init__(
         scale_sharing=True,
         record_max_info=False,
     ):
-        """
-        :param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model
-        shares the same architecture as the model and could be traced by torch.jit. If not supplied, we use model
-        instead.
+        """Init TorchSmoothQuant Class.
+
+        Args:
+            model (torch.nn.Module): Torch model.
+            dataloader (DataLoader): Calibration dataloader.
+            traced_model (Optional[torch.jit.ScriptModule]): A specific model that shares the same
+                architecture as the model and could be traced by torch.jit. If not supplied,
+                the model will be used instead.
+
+        Returns:
+            None
         """
         self.model = model
         if not isinstance(self.model, torch.nn.Module):
@@ -1631,18 +1960,26 @@ def __init__(
         self.need_calibration = False
 
     def _get_device(self):
-        """Get the model device
-        :return:Model device."""
+        """Get the model device.
+
+        Returns:
+            tuple: The device and data type of the model parameters.
+        """
         for _, p in self.model.named_parameters():
             return p.data.device, p.data.dtype
 
     def _scale_layer_weight(self, layer_name, scale, alpha=0.5, input_minmax=None):  ##input channel
-        """Scale the layer weights at input channel, depthwise conv output channel
-        :param layer_name: The layer name
-        :param scale: The scale to be multiplied
-        :param alpha: alpha for SQLinearWrapper
-        :param input_minmax: input_minmax for SQLinearWrapper
-        :return:"""
+        """Scale the layer weights at input channel and depthwise convolution output channel.
+
+        Args:
+            layer_name (str): The layer name.
+            scale (Tensor): The scale to be multiplied.
+            alpha (float): Alpha value for SQLinearWrapper.
+            input_minmax (tuple): Input min and max values for SQLinearWrapper.
+
+        Returns:
+            Tensor: The applied scale.
+        """
         layer = get_module(self.model, layer_name)
         if self.insert_mul:
             layer = get_module(self.model, layer_name)
@@ -1658,11 +1995,16 @@ def _scale_layer_weight(self, layer_name, scale, alpha=0.5, input_minmax=None):
         return scale
 
     def _absorb_scales(self, layer_name, scale):  ##output channel
-        """Absorb the scale to the layer at output channel
-        :param layer_name: The module name
-        :param scale: The scale to be absorbed
-        :param alpha_key: The alpha passed to SQLinearWrapper
-        :return:"""
+        """Absorb the scale to the layer at the output channel.
+
+        Args:
+            layer_name (str): The module name.
+            scale (Tensor): The scale to be absorbed.
+
+        Returns:
+            None
+        """
         if self.insert_mul or not self.allow_absorb:
             return  # absorb is updated in SQLinearWrapper in def _scale_layer_weight
@@ -1722,6 +2064,17 @@ def _absorb_scales(self, layer_name, scale):  ##output channel
             layer.bias *= scale
 
     def _export_sq_info(self, absorb_to_layer, input_maxes, alpha=0.5):
+        """Export information required for SmoothQuant including scales and min/max values.
+
+        Args:
+            absorb_to_layer (dict): A dictionary mapping absorbable layer names to layers they can absorb.
+            input_maxes (dict): A dictionary mapping layer names to their channel-wise maximum values.
+            alpha (float or dict, optional): Alpha value(s) to balance the quantization difficulty
+                of activation and weight. Defaults to 0.5.
+ + Returns: + None + """ absorb_to_input_maxes = {} for key in absorb_to_layer.keys(): layer_name = absorb_to_layer[key][0] @@ -1770,11 +2123,16 @@ def _export_sq_info(self, absorb_to_layer, input_maxes, alpha=0.5): } def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): - """Cal the adjust scales - :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer - :param input_maxes: The channel-wise input max info for layers - :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float of a dict - :return:""" + """Calculate the adjustment scales. + + Args: + absorb_to_layer (dict): A dictionary mapping absorb layers to smooth quantized layers. + input_maxes (dict): The channel-wise input max information for layers. + alpha (float or dict): Alpha value to balance the quantization difficulty of activation and weight. + + Returns: + dict: A dictionary containing the calculated adjustment scales. + """ absorb_to_input_maxes = {} for key in absorb_to_layer.keys(): layer_name = absorb_to_layer[key][0] @@ -1801,11 +2159,16 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): return absorb_scales_info, weight_scales_info def _adjust_parameters(self, absorb_to_layer, input_maxes, alpha=0.5): - """Adjust the weights and biases - :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer - :param input_maxes: The channel-wise input max info for layers - :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float of a dict - :return:""" + """Adjust the weights and biases. + + Args: + absorb_to_layer (dict): A dictionary mapping absorb layers to smooth quantized layers. + input_maxes (dict): The channel-wise input max information for layers. + alpha (float or dict): Alpha value to balance the quantization difficulty of activation and weight. + + Returns: + None + """ absorb_scales_info, weight_scales_info = self._cal_scales(absorb_to_layer, input_maxes, alpha) if not absorb_scales_info or not weight_scales_info: return weight_scales_info, absorb_scales_info @@ -1823,14 +2186,17 @@ def _adjust_parameters(self, absorb_to_layer, input_maxes, alpha=0.5): return weight_scales_info, absorb_scales_info def _check_need_calibration(self, alpha, percentile, op_types, scales_per_op, calib_iter): - """ - check need calibration or not - :param alpha: current alpha - :param percentile: current percentile - :param op_types: current op_types - :param scales_per_op: current scales_per_op - :param calib_iter:: current scales_per_op - :return: + """Check if calibration is needed. + + Args: + alpha (float or dict): Current alpha values. + percentile (float): Current percentile. + op_types (list): Current operation types. + scales_per_op (dict): Current scales per operation. + calib_iter (int): Current calibration iterations. + + Returns: + bool: True if calibration is needed, False otherwise. """ need_calib = True from peft import PeftModel # pylint: disable=E0401 @@ -1860,6 +2226,17 @@ def _check_need_calibration(self, alpha, percentile, op_types, scales_per_op, ca @torch.no_grad() def _parse_absorb_to_layers(self, op_types, folding): + """Parse and map layers in the model for smooth quantization based on specified operation types. + + Args: + op_types (list): List of operation types (e.g., ["Linear"]) to consider for quantization. + folding (bool): Flag indicating whether to insert multiplication operations (False) or + just handle foldable layers (True) for quantization. 
+
+        Returns:
+            dict or None: Dictionary mapping absorb layer names to lists of layers that can be quantized.
+                If tracing fails or no layers can be quantized, returns None.
+        """
         str_op_types = [i.__name__ for i in op_types]
         self_absorb_layers = {}
         if self.insert_mul:
@@ -1930,24 +2307,26 @@ def transform(
             "n_samples": 32,  ##512 for cuda, 128 for cpu?
         },
     ):
-        """The main entry of smooth quant
-        :param alpha: Alpha value to balance the quantization difficulty of activation and weight, please refer
-        to the paper for more details
-        :param folding: whether insert mul(False) or just allow foldable layers(True) for SmoothQuant
-        :param percentile: Not supported now
-        :param op_types: The op typed to be smooth quantized
-        :param scales_per_op: Not supported now
-        :param calib_iter: Data size for calibration
-        :param weight_clip: Whether to clip weight_max when calculating scales.
-
-        :param auto_alpha_args: Hyperparameters used to set the alpha search space in SQ auto-tuning.
-        By default, the search space is 0.0-1.0 with step_size 0.1.
-        do_blockwise: Whether to do blockwise auto-tuning.
-        :param init_alpha: A hyperparameter that is used in SQ auto-tuning; by default it is 0.5.
-        :return: A FP32 model with the same architecture as the orig model but with different weight which will be
-        benefit to quantization.
+        """The main entry of SmoothQuant.
+
+        Args:
+            alpha (float or dict): Alpha value to balance the quantization difficulty of activation and weight.
+                Please refer to the paper for more details.
+            folding (bool): Whether to insert multiplication (False) or just allow foldable layers (True) for SmoothQuant.
+            percentile (float): Not supported currently.
+            op_types (list): The operation types to be smooth quantized.
+            scales_per_op (dict): Not supported currently.
+            calib_iter (int): Data size for calibration.
+            weight_clip (bool): Whether to clip weight_max when calculating scales.
+            auto_alpha_args (dict): Hyperparameters used to set the alpha search space in SQ auto-tuning.
+                By default, the search space is 0.0-1.0 with step_size 0.1; the ``do_blockwise`` key
+                controls whether to perform blockwise auto-tuning.
+            init_alpha (float): A hyperparameter used in SQ auto-tuning; by default, it is 0.5.
+
+        Returns:
+            torch.nn.Module: An FP32 model with the same architecture as the original model
+                but with modified weights, which will benefit quantization.
         """
-
         if not isinstance(self.model, torch.nn.Module):
             logger.warning("smoothquant is ignored since the model is not a torch module")
             return self.model
@@ -2054,6 +2433,23 @@ def transform(
         return self.model

     def output_is_equal(self, out1, out2, atol=1e-04):
+        """Compare two outputs to determine if they are approximately equal within a specified tolerance.
+
+        Args:
+            out1 (Union[tuple, dict, torch.Tensor]): The first output to compare.
+            out2 (Union[tuple, dict, torch.Tensor]): The second output to compare.
+            atol (float, optional): The absolute tolerance for the comparison. Default is 1e-04.
+
+        Returns:
+            bool: True if the outputs are approximately equal within the tolerance, False otherwise.
+
+        Note:
+            If an unexpected error occurs during comparison, a warning is logged
+            and True is returned to indicate that automatic checking failed.
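+
+        Example (illustrative): ``self.output_is_equal(torch.ones(2), torch.ones(2) + 1e-6)``
+            returns True, since the difference is within ``atol=1e-04``.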
+        """
         try:
             if isinstance(out1, tuple):
                 return all(torch.all(torch.isclose(out1[i], out2[i], atol=atol)) for i in range(len(out1)))
@@ -2071,8 +2464,11 @@ def output_is_equal(self, out1, out2, atol=1e-04):

     @torch.no_grad()
     def revert(self):
-        """Revert the model weights
-        :return:"""
+        """Revert the model weights to their original state.
+
+        Returns:
+            None
+        """
         for key in self.weight_scale_info:
             self._scale_layer_weight(key, 1.0 / self.weight_scale_info[key])
         for key in self.absorb_scales_info:
@@ -2081,11 +2477,14 @@ def revert(self):
         self.absorb_scales_info = {}

     def _get_all_layer_names(self, op_types=[torch.nn.Linear]):
-        """Try the model to find the layers which can be smooth quantized.
-
-        :param op_types: The op types to be smooth quantized
-        :return:
-        self_absorb_layer: A dict, absorb layer name (itself): layers to be smooth quantized
+        """Identify the layers which can be smooth quantized.
+
+        Args:
+            op_types (list): The operation types to be smooth quantized.
+
+        Returns:
+            dict: A dictionary where the keys are absorb layer names (themselves)
+                and the values are lists of layers to be smooth quantized.
         """
         self_absorb_layer = {}
         op_types = [torch.nn.Linear]  # TODO: only support SQLinearWrapper
@@ -2095,6 +2494,14 @@ def _get_all_layer_names(self, op_types=[torch.nn.Linear]):
         return self_absorb_layer

     def _get_example_input(self):
+        """Retrieve an example input from the dataloader or return the pre-stored example inputs.
+
+        Returns:
+            Union[torch.Tensor, None]: The example input if available, otherwise None.
+
+        Raises:
+            RuntimeError: If an error occurs while fetching inputs from the dataloader.
+        """
         if self.dataloader is None and self.example_inputs is None:
             return None
         if self.example_inputs is None:
@@ -2110,14 +2517,16 @@ def _get_example_input(self):
         return self.example_inputs

     def _trace(self, op_types, skip_unsupported_layers=True):
-        """Try the model to find the layers which can be smooth quantized.
-
-        :param op_types: The op types to be smooth quantized
-        :return:
-        absorb_to_layer: A dict, absorb layer name:layers to be smooth quantized
-        no_absorb_layers: A list saving the layers which could not find the absorb layer
-        """
+        """Identify the layers which can be smooth quantized.
+
+        Args:
+            op_types (list): The operation types to be smooth quantized.
+
+        Returns:
+            dict: A dictionary where keys are absorb layer names and values are lists of
+                layers to be smooth quantized.
+            list: A list of layers for which no absorb layer was found.
+        """
         tg = GraphTrace()
         self._get_example_input()
         absorb_to_layer, no_absorb_layers = tg.get_absorb_to_layer(
@@ -2149,7 +2558,18 @@ def _trace(self, op_types, skip_unsupported_layers=True):


 class SQLinearWrapper(torch.nn.Module):  # pragma: no cover
+    """SQLinearWrapper Class."""
+
     def __init__(self, module, input_scale, input_minmax, alpha=0.5, dtype=torch.quint8):
+        """Initialize the SQLinearWrapper.
+
+        Args:
+            module (torch.nn.Module): The module to be wrapped.
+            input_scale (Tensor): The scale for input features.
+            input_minmax (Tuple[Tensor, Tensor]): The min and max values for input features.
+            alpha (float, optional): A parameter for scaling. Defaults to 0.5.
+            dtype (torch.dtype, optional): The data type for quantization. Defaults to torch.quint8.
+        """
         super().__init__()
         self.register_buffer("input_scale", input_scale)
         self.alpha = alpha
@@ -2162,9 +2582,24 @@ def __init__(self, module, input_scale, input_minmax, alpha=0.5, dtype=torch.qui

     @property
     def weight(self):
+        """Get the weight of the sq_linear module.
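+        For example, ``wrapper.weight.shape`` equals the wrapped Linear's ``(out_features,
+        in_features)``, where ``wrapper`` is an illustrative SQLinearWrapper instance.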
+
+        Returns:
+            Tensor: The weight of the sq_linear module.
+        """
         return self.sq_linear.weight

     def forward(self, X):
+        """Forward pass of the module.
+
+        Args:
+            X (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The output tensor after applying the sq_linear module.
+        """
         if self.ipex:
             X = self.sq_linear(X)
         else:
@@ -2173,7 +2606,16 @@ def forward(self, X):
         return X

     def _calculate_qparams(self, input_scale, input_minmax, dtype=torch.quint8):
-        # calculate scale and zero_point
+        """Calculate scale and zero-point for quantization.
+
+        Args:
+            input_scale (Tensor): The scale for input features.
+            input_minmax (Tuple[Tensor, Tensor]): The min and max values for input features.
+            dtype (torch.dtype, optional): The data type for quantization. Defaults to torch.quint8.
+
+        Returns:
+            Tuple[Tensor, Tensor]: The calculated scale and zero-point.
+        """
         if dtype == torch.quint8:
             quant_min, quant_max = 0, 255
             min_val = torch.min(input_minmax[0] * input_scale)
@@ -2188,7 +2630,11 @@ def _calculate_qparams(self, input_scale, input_minmax, dtype=torch.quint8):
         return scale, zero_point

     def _get_weight_scale(self):
-        # get weight scale and zero_point
+        """Get the weight scale.
+
+        Returns:
+            Tensor: The scale of the weight.
+        """
         from torch.ao.quantization.observer import default_per_channel_weight_observer

         obs = default_per_channel_weight_observer()
@@ -2197,20 +2643,36 @@ def _get_weight_scale(self):
         return scale

     def _update_sq_linear(self):
-        # remove mul and reset sq_linear for ipex inference
+        """Update the sq_linear module by removing the multiplication of scale.
+
+        This method adjusts the weight of sq_linear for ipex inference.
+        """
         scale = self.input_scale.view(1, self.input_scale.shape[0])
         with torch.no_grad():
             self.sq_linear.weight /= scale

     def _recover_sq_linear(self):
-        # remove mul and reset sq_linear for ipex inference
+        """Recover the original sq_linear module by restoring the multiplication of scale.
+
+        This method adjusts the weight of sq_linear for ipex inference.
+        """
         scale = self.input_scale.view(1, self.input_scale.shape[0])
         with torch.no_grad():
             self.sq_linear.weight *= scale


 class WrapperLayer(torch.nn.Module):  # pragma: no cover
+    """WrapperLayer Class."""
+
     def __init__(self, layer, input_min, input_max, save_q_input=False):
+        """Initialize the WrapperLayer.
+
+        Args:
+            layer (torch.nn.Module): The original layer to be wrapped.
+            input_min (Tensor): Minimum value of the input.
+            input_max (Tensor): Maximum value of the input.
+            save_q_input (bool, optional): Whether to save the quantized input. Defaults to False.
+        """
         super(WrapperLayer, self).__init__()
         self.add_module("orig_layer", layer)  # set orig_layer in get/set_module
         self.quant = False
@@ -2224,17 +2686,36 @@ def __init__(self, layer, input_min, input_max, save_q_input=False):
         self.do_blockwise = False

     def enable_quant(self):
+        """Enable quantization for the layer."""
         self.quant = True

     def disable_quant(self):
+        """Disable quantization for the layer."""
         self.quant = False

     def update_scale(self, input_scale, weight_scale):
+        """Update the input and weight scales.
+
+        Args:
+            input_scale (Tensor): The scale for the input.
+            weight_scale (Tensor): The scale for the weight.
+        """
         self.input_scale = input_scale
         self.weight_scale = weight_scale

-    ##TODO better tradeoff performance and memory, currently it's too slow
     def q_dq_forward(self, x, input_scale, weight_scale):
+        """Perform quantization and dequantization forward pass.
+
+        Args:
+            x (Tensor): The input tensor.
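+                For example, a batch of activations of shape ``(batch_size, in_features)``
+                for the wrapped layer (shape illustrative).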
+ input_scale (Tensor): The scale for the input. + weight_scale (Tensor): The scale for the weight. + + Returns: + Tensor: The output tensor after quantization and dequantization. + """ layer_copy = copy.deepcopy(self.orig_layer) if weight_scale is not None: layer_copy.weight *= weight_scale @@ -2249,6 +2728,15 @@ def q_dq_forward(self, x, input_scale, weight_scale): return output def q_dq_forward_blockwise(self, x, input_scale): + """Perform blockwise quantization and dequantization forward pass. + + Args: + x (Tensor): The input tensor. + input_scale (Tensor): The scale for the input. + + Returns: + Tensor: The output tensor after blockwise quantization and dequantization. + """ layer_copy = copy.deepcopy(self.orig_layer) if input_scale is None: x = quant_dequant_x_v1(x, self.input_min, self.input_max) @@ -2259,6 +2747,14 @@ def q_dq_forward_blockwise(self, x, input_scale): return output def forward(self, x): + """Perform the forward pass of the module. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ if self.quant: # self.q_input = x * scale ##save the q_input if self.save_q_input: @@ -2267,7 +2763,6 @@ def forward(self, x): output = self.q_dq_forward(x, self.input_scale, self.weight_scale) else: output = self.q_dq_forward_blockwise(x, self.input_scale) - else: output = self.orig_layer(x) self.output = output From f7f1fea369a3174c57a2b5efb9c20a924fa85be4 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:03:12 +0800 Subject: [PATCH 4/4] Update utility.py --- neural_compressor/torch/algorithms/smooth_quant/utility.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index a5f6aadec65..9783e190001 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -1922,8 +1922,8 @@ def __init__( Args: model (torch.nn.Module): Torch model. dataloader (DataLoader): Calibration dataloader. - traced_model (Optional[torch.jit.ScriptModule]): A specific model that shares the same architecture as the model and - could be traced by torch.jit. If not supplied, the model will be used instead. + traced_model (Optional[torch.jit.ScriptModule]): A specific model that shares the same architecture + as the model and could be traced by torch.jit. If not supplied, the model will be used instead. Returns: None @@ -2312,7 +2312,8 @@ def transform( Args: alpha (float or dict): Alpha value to balance the quantization difficulty of activation and weight. Please refer to the paper for more details. - folding (bool): Whether to insert multiplication (False) or just allow foldable layers (True) for SmoothQuant. + folding (bool): Whether to insert multiplication (False) or just allow foldable layers (True) + for SmoothQuant. percentile (float): Not supported currently. op_types (list): The operation types to be smooth quantized. scales_per_op (dict): Not supported currently.
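
Illustrative usage of the API documented in this series (a minimal sketch, not part of the patch; the toy model, calibration data, and variable names are assumptions):

    import torch

    from neural_compressor.torch.algorithms.smooth_quant.utility import TorchSmoothQuant

    # Toy fp32 model: a LayerNorm feeding a Linear gives SmoothQuant a layer
    # that can absorb the scale when folding=True.
    model = torch.nn.Sequential(torch.nn.LayerNorm(8), torch.nn.Linear(8, 4))
    # Minimal calibration dataloader yielding (input, label) pairs.
    dataloader = [(torch.randn(2, 8), None) for _ in range(4)]

    sq = TorchSmoothQuant(model, dataloader)
    smoothed = sq.transform(alpha=0.5, folding=True, calib_iter=4)  # fp32 model with rescaled weights
    sq.revert()  # undo the rescaling if the original weights are needed again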