Add docstring for WOQ&LayerWise (#1938)
Signed-off-by: Kaihui-intel <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: xinhe <[email protected]>
3 people authored Jul 23, 2024
1 parent 08914d6 commit 0c52e12
Showing 14 changed files with 599 additions and 70 deletions.
3 changes: 2 additions & 1 deletion .azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt
@@ -20,4 +20,5 @@
/neural_compressor/torch/algorithms/pt2e_quant
/neural_compressor/torch/export
/neural_compressor/common
/neural_compressor/torch/algorithms/weight_only/hqq
/neural_compressor/torch/algorithms/weight_only
/neural_compressor/torch/algorithms/layer_wise
5 changes: 3 additions & 2 deletions neural_compressor/torch/algorithms/layer_wise/load.py
@@ -152,8 +152,7 @@ def load(
# The first line of this docstring overrides the one Sphinx generates for the
# documentation. We need it so that Sphinx doesn't leak `pickle`'s path from
# the build environment (e.g. `<module 'pickle' from '/leaked/path'>`).

"""Load(f, map_location=None, pickle_module=pickle, *, weights_only=False, **pickle_load_args)
"""Load(f, map_location=None, pickle_module=pickle, *, weights_only=False, **pickle_load_args).
Loads an object saved with :func:`torch.save` from a file.
@@ -198,6 +197,8 @@ def load(
pickle_load_args: (Python 3 only) optional keyword arguments passed over to
:func:`pickle_module.load` and :func:`pickle_module.Unpickler`, e.g.,
:attr:`errors=...`.
prefix (str): the module prefix name.
tensor_name (str): the tensor name.
.. warning::
:func:`torch.load()` unless `weights_only` parameter is set to `True`,
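For illustration, a hedged usage sketch of the patched layer-wise `load` above. The checkpoint path and tensor name are placeholders, and passing `tensor_name` as a keyword is an assumption based on the `prefix`/`tensor_name` entries the commit adds to the docstring:

```python
from neural_compressor.torch.algorithms.layer_wise.load import load

# Load a single tensor from a checkpoint instead of materializing the whole
# state_dict (placeholder path and tensor name; keyword usage is assumed).
weight = load(
    "pytorch_model.bin",
    map_location="cpu",
    tensor_name="model.layers.0.self_attn.q_proj.weight",
)
```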
@@ -94,15 +94,13 @@ class PickleError(Exception):


class PicklingError(PickleError):
"""This exception is raised when an unpicklable object is passed to the
dump() method."""
"""This exception is raised when an unpicklable object is passed to the dump() method."""

pass


class UnpicklingError(PickleError):
"""This exception is raised when there is a problem unpickling an object,
such as a security violation.
"""This exception is raised when there is a problem unpickling an object, such as a security violation.
Note that other exceptions may also be raised during unpickling, including
(but not necessarily limited to) AttributeError, EOFError, ImportError,
@@ -367,6 +365,7 @@ def whichmodule(obj, name): # pragma: no cover

def encode_long(x): # pragma: no cover
r"""Encode a long to a two's complement little-endian binary string.
Note that 0 is a special case, returning an empty string, to save a
byte in the LONG1 pickling context.
@@ -503,7 +502,6 @@ def dump(self, obj):

def memoize(self, obj):
"""Store an object in the memo."""

# The Pickler memo is a dictionary mapping object ids to 2-tuples
# that contain the Unpickler memo key and the object being memoized.
# The memo key is written to the pickle and will become
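The `encode_long` helper in this vendored pickle module follows CPython's `pickle.py`. A standalone sketch of the same two's complement little-endian encoding, mirroring the upstream implementation rather than importing the vendored module:

```python
def encode_long(x: int) -> bytes:
    """Two's complement little-endian encoding, as in CPython's pickle."""
    if x == 0:
        return b""  # special case: an empty string saves a byte in LONG1
    nbytes = (x.bit_length() >> 3) + 1
    result = x.to_bytes(nbytes, byteorder="little", signed=True)
    # Trim a redundant sign byte when the remaining bytes already carry it.
    if x < 0 and nbytes > 1 and result[-1] == 0xFF and result[-2] & 0x80:
        result = result[:-1]
    return result

assert encode_long(0) == b""
assert encode_long(255) == b"\xff\x00"   # extra zero byte keeps the value positive
assert encode_long(-256) == b"\x00\xff"
```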
35 changes: 35 additions & 0 deletions neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -35,14 +35,18 @@


class QDQLayer(torch.nn.Module):
"""Quantized and Dequantized Layer."""

def __init__(self, module, input_scale=None) -> None:
"""Init the QDQLayer object."""
super().__init__()
self.quant = torch.ao.quantization.QuantStub()
self.module = module
self.dequant = torch.ao.quantization.DeQuantStub()
self.input_scale = input_scale

def forward(self, X):
"""Forward function."""
if self.input_scale is not None:
X = torch.mul(X, self.input_scale)
X = self.quant(X)
@@ -220,6 +224,16 @@ def _get_path(pretrained_model_name_or_path):


def load_value(model, param_name, path):
"""Load the module value.
Args:
model (torch.nn.module): torch model.
param_name (str): module name.
path (str): path to load state_dict per layer.
Returns:
tensor: the module value.
"""
if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True):
input_embeddings = model.get_input_embeddings()
modules = get_named_children(model)
@@ -235,6 +249,14 @@ def load_value(model, param_name, path):


def load_module(model, module_name, path, device="cpu"):
"""Load all named parameters of module.
Args:
model (torch.nn.module): torch model.
module_name (str): module name.
path (str): path to load state_dict per layer.
device (str, optional): module device. Defaults to "cpu".
"""
module = get_module(model, module_name)
for n, p in module.named_parameters():
param_name = module_name + "." + n
@@ -243,6 +265,18 @@ def load_module(model, module_name, path, device="cpu"):


def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None):
"""Register weight hooks for model.
Args:
model (torch.nn.module): torch model.
path (str): path to load state_dict per layer.
device (str, optional): module device. Defaults to "cpu".
clean_weight (bool, optional): to clean model weight. Defaults to True.
saved_path (str, optional): path to save module weight. Defaults to None.
Returns:
list: handlers.
"""
if saved_path:
os.makedirs(saved_path, exist_ok=True)

@@ -280,6 +314,7 @@ def hook(module, input, output):


def clean_module_weight(module):
"""Clean module weight."""
if isinstance(module, QDQLayer):
submodule = module.module
else:
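Taken together, a hedged sketch of how these layer-wise hooks are meant to be used. The model and shard path are placeholders, and the flow (load weights on forward-pre, clean them after forward) is inferred from the docstrings above:

```python
import torch

# `path` must point at per-layer state_dict files produced by the layer-wise
# save step; "layer_shards" and the model are placeholders for this sketch.
model = ...  # a torch.nn.Module whose weights live on disk, layer by layer
handles = register_weight_hooks(model, path="layer_shards", device="cpu", clean_weight=True)

with torch.no_grad():
    model(torch.randn(1, 8))  # each layer's weights load just-in-time, then are cleaned

for handle in handles:
    handle.remove()  # detach the hooks when done
```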
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Weight-Only algorithms."""

from .save_load import save, load
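The re-export above makes `save` and `load` the package-level entry points for weight-only quantized models. A hedged sketch of the round trip; the argument order and directory name here are assumptions, not verified signatures:

```python
from neural_compressor.torch.algorithms.weight_only import save, load

save(quantized_model, "saved_results")  # persist the WOQ model (assumed args)
model = load("saved_results")           # restore it in a later session (assumed args)
```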
21 changes: 16 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AutoRound quantization."""
import copy
import json
import time
@@ -28,6 +28,8 @@


class AutoRoundQuantizer(Quantizer):
"""AutoRound Quantizer."""

def __init__(
self,
quant_config: dict = {},
@@ -94,11 +96,11 @@ def __init__(
lr_scheduler: The learning rate scheduler to be used.
dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
enable_quanted_input (bool): Whether to use the output of the previous quantized block as
the input for the current block (default is True).
the input for the current block (default is True).
enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
lr (float): The learning rate (default is None, will be set to 1.0/iters).
minmax_lr (float): The learning rate for min-max tuning
(default is None, it will be set to lr automatically).
(default is None, it will be set to lr automatically).
low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
iters (int): Number of iterations (default is 200).
seqlen (int): Data length of the sequence for tuning (default is 2048).
Expand All @@ -111,7 +113,7 @@ def __init__(
dynamic_max_gap (int): The dynamic maximum gap (default is -1).
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
have different choices.
have different choices.
multimodal (bool): Enable multimodal model quantization (default is False).
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
@@ -153,6 +155,7 @@ def __init__(

def prepare(self, model: torch.nn.Module, *args, **kwargs):
"""Prepares a given model for quantization.
Args:
model (torch.nn.Module): The model to be prepared.
@@ -163,6 +166,14 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
return prepare_model

def convert(self, model: torch.nn.Module, *args, **kwargs):
"""Convert the prepared model to a quantized model.
Args:
model (torch.nn.Module): the prepared model.
Returns:
The quantized model.
"""
dataloader = CapturedDataloader(model.args_list, model.kwargs_list)
model = model.orig_model
rounder = AutoRound(
@@ -216,7 +227,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
split (str, optional): The data split to use. Defaults to None.
seed (int, optional): The random seed for reproducibility. Defaults to 42.
bs (int, optional): The batch size. Defaults to 4.
n_samples (int, optional): The total number of samples to include. Defaults to 512.
nsamples (int, optional): The total number of samples to include. Defaults to 128.
Returns:
DataLoader: The DataLoader for the calibrated dataset.
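A hedged sketch of the two-step prepare/convert flow these docstrings describe. The quant config contents, the undefined `model`/`tokenizer`, and the calibration loop are illustrative placeholders, not the library's exact API:

```python
from neural_compressor.torch.algorithms.weight_only.autoround import (
    AutoRoundQuantizer,
    get_dataloader,
)

quantizer = AutoRoundQuantizer(quant_config={})      # per-op quant config omitted here
model = quantizer.prepare(model)                     # wrap model to capture calibration inputs
dataloader = get_dataloader(tokenizer, seqlen=2048)  # NeelNanda/pile-10k by default
for batch in dataloader:
    model(batch)                                     # calibration forward passes
model = quantizer.convert(model)                     # run AutoRound, return the quantized model
```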
28 changes: 23 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/awq.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AWQ quantization."""
# Copied from neural_compressor/adaptor/torch_utils/awq.py

import copy
@@ -40,11 +40,16 @@ def _get_absorb_per_block(model, example_inputs, folding=False, weight_config={}
"""Get absorbed layer per block.
Args:
model (torch.nn.Module): input model
example_inputs: example_inputs
model (torch.nn.Module): input model.
example_inputs (tensor/tuple/dict, optional): used to trace torch model.
folding (bool, optional): whether only allow update scale when it can be fold
to upper layer. Defaults to False.
weight_config (dict, optional): the quantization configuration. Defaults to {}.
Returns:
block_absorb_dict: dict of absorbed layer per block. eg. {0, [[absorbed_1, xx], [xx]], ...}
block_absorb_dict: The dict of absorbed layers per block, e.g. {0: [[absorbed_1, xx], [xx]], ...}
absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of
block_absorb_dict for all blocks.
"""
block_absorb_dict = {} # record absorbed layer per block
absorb_layer_dict = {} # record absorb layers for absorbed layers
@@ -94,10 +99,12 @@ def _get_absorb_dict(model, absorb_layer_dict):
Args:
model (torch.nn.Module): input model
absorb_layer_dict (dict): The layer dict that scale can be absorbed, default is {}.
absorb_layer_dict (dict): The layer type dict that scale can be absorbed, default is {}.
Returns:
block_absorb_dict: dict of absorbed layers per block, e.g. {0: [[absorbed_1, xx], [xx]], ...}
new_absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of
block_absorb_dict for all blocks.
"""
block_absorb_dict = {}
block_prefix, block_num = get_block_prefix(model)
@@ -121,6 +128,15 @@ def _get_absorb_dict(model, absorb_layer_dict):

@torch.no_grad()
def _get_weight_scale(weight, q_group_size=-1):
"""Get scale for weight.
Args:
weight (Tensor): input weight.
q_group_size (int, optional): how many elements share one scale/zp. Defaults to -1.
Returns:
scale: the scale of input weight.
"""
org_shape = weight.shape
if q_group_size > 0:
weight = weight.view(-1, q_group_size)
@@ -526,6 +542,8 @@ def module_inference(self, model, inputs):


class AWQQuantizer(Quantizer):
"""AWQ Quantizer."""

def __init__(self, quant_config: OrderedDict = {}, absorb_layer_dict: dict = {}):
"""Init an AWQQuantizer object.
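To make the group-wise scale documented in `_get_weight_scale` concrete, a standalone sketch following the usual AWQ recipe (normalize each group by its abs-max, then average over output channels); details may differ from the library's exact implementation:

```python
import torch

def weight_scale(weight: torch.Tensor, q_group_size: int = -1) -> torch.Tensor:
    org_shape = weight.shape
    if q_group_size > 0:
        weight = weight.view(-1, q_group_size)    # each row shares one scale/zp
    scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True)
    return scale.view(org_shape).mean(0)          # one value per input channel

w = torch.randn(16, 64)                           # [out_features, in_features]
print(weight_scale(w, q_group_size=32).shape)     # torch.Size([64])
```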
(Remaining file diffs not loaded on this page.)
