Add docstring for WOQ&LayerWise (#1938)
Signed-off-by: Kaihui-intel <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: xinhe <[email protected]>
3 people authored Jul 23, 2024
1 parent 08914d6 commit 0c52e12
Showing 14 changed files with 599 additions and 70 deletions.
3 changes: 2 additions & 1 deletion .azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt
@@ -20,4 +20,5 @@
/neural_compressor/torch/algorithms/pt2e_quant
/neural_compressor/torch/export
/neural_compressor/common
/neural_compressor/torch/algorithms/weight_only/hqq
/neural_compressor/torch/algorithms/weight_only
/neural_compressor/torch/algorithms/layer_wise
5 changes: 3 additions & 2 deletions neural_compressor/torch/algorithms/layer_wise/load.py
@@ -152,8 +152,7 @@ def load(
# The first line of this docstring overrides the one Sphinx generates for the
# documentation. We need it so that Sphinx doesn't leak `pickle`'s path from
# the build environment (e.g. `<module 'pickle' from '/leaked/path'>`).

"""Load(f, map_location=None, pickle_module=pickle, *, weights_only=False, **pickle_load_args)
"""Load(f, map_location=None, pickle_module=pickle, *, weights_only=False, **pickle_load_args).
Loads an object saved with :func:`torch.save` from a file.
@@ -198,6 +197,8 @@ def load(
pickle_load_args: (Python 3 only) optional keyword arguments passed over to
:func:`pickle_module.load` and :func:`pickle_module.Unpickler`, e.g.,
:attr:`errors=...`.
prefix (str): the module prefix name.
tensor_name (str): the tensor name.
.. warning::
:func:`torch.load()` unless `weights_only` parameter is set to `True`,
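For illustration, a hedged usage sketch of the patched layer-wise `load` above. The checkpoint path and tensor name are placeholders, and passing `tensor_name` as a keyword is an assumption based on the `prefix`/`tensor_name` entries the commit adds to the docstring:

```python
from neural_compressor.torch.algorithms.layer_wise.load import load

# Load a single tensor from a checkpoint instead of materializing the whole
# state_dict (placeholder path and tensor name; keyword usage is assumed).
weight = load(
    "pytorch_model.bin",
    map_location="cpu",
    tensor_name="model.layers.0.self_attn.q_proj.weight",
)
```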
@@ -94,15 +94,13 @@ class PickleError(Exception):


class PicklingError(PickleError):
"""This exception is raised when an unpicklable object is passed to the
dump() method."""
"""This exception is raised when an unpicklable object is passed to the dump() method."""

pass


class UnpicklingError(PickleError):
"""This exception is raised when there is a problem unpickling an object,
such as a security violation.
"""This exception is raised when there is a problem unpickling an object, such as a security violation.
Note that other exceptions may also be raised during unpickling, including
(but not necessarily limited to) AttributeError, EOFError, ImportError,
@@ -367,6 +365,7 @@ def whichmodule(obj, name): # pragma: no cover

def encode_long(x): # pragma: no cover
r"""Encode a long to a two's complement little-endian binary string.
Note that 0 is a special case, returning an empty string, to save a
byte in the LONG1 pickling context.
@@ -503,7 +502,6 @@ def dump(self, obj):

def memoize(self, obj):
"""Store an object in the memo."""

# The Pickler memo is a dictionary mapping object ids to 2-tuples
# that contain the Unpickler memo key and the object being memoized.
# The memo key is written to the pickle and will become
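The `encode_long` helper in this vendored pickle module follows CPython's `pickle.py`. A standalone sketch of the same two's complement little-endian encoding, mirroring the upstream implementation rather than importing the vendored module:

```python
def encode_long(x: int) -> bytes:
    """Two's complement little-endian encoding, as in CPython's pickle."""
    if x == 0:
        return b""  # special case: an empty string saves a byte in LONG1
    nbytes = (x.bit_length() >> 3) + 1
    result = x.to_bytes(nbytes, byteorder="little", signed=True)
    # Trim a redundant sign byte when the remaining bytes already carry it.
    if x < 0 and nbytes > 1 and result[-1] == 0xFF and result[-2] & 0x80:
        result = result[:-1]
    return result

assert encode_long(0) == b""
assert encode_long(255) == b"\xff\x00"   # extra zero byte keeps the value positive
assert encode_long(-256) == b"\x00\xff"
```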
35 changes: 35 additions & 0 deletions neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -35,14 +35,18 @@


class QDQLayer(torch.nn.Module):
"""Quantized and Dequantized Layer."""

def __init__(self, module, input_scale=None) -> None:
"""Init the QDQLayer object."""
super().__init__()
self.quant = torch.ao.quantization.QuantStub()
self.module = module
self.dequant = torch.ao.quantization.DeQuantStub()
self.input_scale = input_scale

def forward(self, X):
"""Forward function."""
if self.input_scale is not None:
X = torch.mul(X, self.input_scale)
X = self.quant(X)
@@ -220,6 +224,16 @@ def _get_path(pretrained_model_name_or_path):


def load_value(model, param_name, path):
"""Load the module value.
Args:
model (torch.nn.module): torch model.
param_name (str): module name.
path (str): path to load state_dict per layer.
Returns:
tensor: the module value.
"""
if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True):
input_embeddings = model.get_input_embeddings()
modules = get_named_children(model)
@@ -235,6 +249,14 @@ def load_value(model, param_name, path):


def load_module(model, module_name, path, device="cpu"):
"""Load all named parameters of module.
Args:
model (torch.nn.module): torch model.
module_name (str): module name.
path (str): path to load state_dict per layer.
device (str, optional): module device. Defaults to "cpu".
"""
module = get_module(model, module_name)
for n, p in module.named_parameters():
param_name = module_name + "." + n
@@ -243,6 +265,18 @@ def load_module(model, module_name, path, device="cpu"):


def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None):
"""Register weight hooks for model.
Args:
model (torch.nn.module): torch model.
path (str): path to load state_dict per layer.
device (str, optional): module device. Defaults to "cpu".
clean_weight (bool, optional): to clean model weight. Defaults to True.
saved_path (str, optional): path to save module weight. Defaults to None.
Returns:
list: handlers.
"""
if saved_path:
os.makedirs(saved_path, exist_ok=True)

@@ -280,6 +314,7 @@ def hook(module, input, output):


def clean_module_weight(module):
"""Clean module weight."""
if isinstance(module, QDQLayer):
submodule = module.module
else:
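Taken together, a hedged sketch of how these layer-wise hooks are meant to be used. The model and shard path are placeholders, and the flow (load weights on forward-pre, clean them after forward) is inferred from the docstrings above:

```python
import torch

# `path` must point at per-layer state_dict files produced by the layer-wise
# save step; "layer_shards" and the model are placeholders for this sketch.
model = ...  # a torch.nn.Module whose weights live on disk, layer by layer
handles = register_weight_hooks(model, path="layer_shards", device="cpu", clean_weight=True)

with torch.no_grad():
    model(torch.randn(1, 8))  # each layer's weights load just-in-time, then are cleaned

for handle in handles:
    handle.remove()  # detach the hooks when done
```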
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Weight-Only algorithms."""

from .save_load import save, load
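The re-export above makes `save` and `load` the package-level entry points for weight-only quantized models. A hedged sketch of the round trip; the argument order and directory name here are assumptions, not verified signatures:

```python
from neural_compressor.torch.algorithms.weight_only import save, load

save(quantized_model, "saved_results")  # persist the WOQ model (assumed args)
model = load("saved_results")           # restore it in a later session (assumed args)
```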
21 changes: 16 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AutoRound quantization."""
import copy
import json
import time
@@ -28,6 +28,8 @@


class AutoRoundQuantizer(Quantizer):
"""AutoRound Quantizer."""

def __init__(
self,
quant_config: dict = {},
@@ -94,11 +96,11 @@ def __init__(
lr_scheduler: The learning rate scheduler to be used.
dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
enable_quanted_input (bool): Whether to use the output of the previous quantized block as
the input for the current block (default is True).
the input for the current block (default is True).
enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
lr (float): The learning rate (default is None, will be set to 1.0/iters).
minmax_lr (float): The learning rate for min-max tuning
(default is None, it will be set to lr automatically).
(default is None, it will be set to lr automatically).
low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
iters (int): Number of iterations (default is 200).
seqlen (int): Data length of the sequence for tuning (default is 2048).
Expand All @@ -111,7 +113,7 @@ def __init__(
dynamic_max_gap (int): The dynamic maximum gap (default is -1).
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels
have different choices.
have different choices.
multimodal (bool): Enable multimodal model quantization (default is False).
act_bits (int): Number of bits for activation quantization. Default is 32.
act_group_size (int): Group size for activation quantization. Default is None.
@@ -153,6 +155,7 @@ def __init__(

def prepare(self, model: torch.nn.Module, *args, **kwargs):
"""Prepares a given model for quantization.
Args:
model (torch.nn.Module): The model to be prepared.
@@ -163,6 +166,14 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
return prepare_model

def convert(self, model: torch.nn.Module, *args, **kwargs):
"""Convert the prepared model to a quantized model.
Args:
model (torch.nn.Module): the prepared model.
Returns:
The quantized model.
"""
dataloader = CapturedDataloader(model.args_list, model.kwargs_list)
model = model.orig_model
rounder = AutoRound(
@@ -216,7 +227,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
split (str, optional): The data split to use. Defaults to None.
seed (int, optional): The random seed for reproducibility. Defaults to 42.
bs (int, optional): The batch size. Defaults to 4.
n_samples (int, optional): The total number of samples to include. Defaults to 512.
nsamples (int, optional): The total number of samples to include. Defaults to 128.
Returns:
DataLoader: The DataLoader for the calibrated dataset.
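A hedged sketch of the two-step prepare/convert flow these docstrings describe. The quant config contents, the undefined `model`/`tokenizer`, and the calibration loop are illustrative placeholders, not the library's exact API:

```python
from neural_compressor.torch.algorithms.weight_only.autoround import (
    AutoRoundQuantizer,
    get_dataloader,
)

quantizer = AutoRoundQuantizer(quant_config={})      # per-op quant config omitted here
model = quantizer.prepare(model)                     # wrap model to capture calibration inputs
dataloader = get_dataloader(tokenizer, seqlen=2048)  # NeelNanda/pile-10k by default
for batch in dataloader:
    model(batch)                                     # calibration forward passes
model = quantizer.convert(model)                     # run AutoRound, return the quantized model
```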
28 changes: 23 additions & 5 deletions neural_compressor/torch/algorithms/weight_only/awq.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AWQ quantization."""
# Copied from neural_compressor/adaptor/torch_utils/awq.py

import copy
@@ -40,11 +40,16 @@ def _get_absorb_per_block(model, example_inputs, folding=False, weight_config={}
"""Get absorbed layer per block.
Args:
model (torch.nn.Module): input model
example_inputs: example_inputs
model (torch.nn.Module): input model.
example_inputs (tensor/tuple/dict, optional): used to trace torch model.
folding (bool, optional): whether only allow update scale when it can be fold
to upper layer. Defaults to False.
weight_config (dict, optional): the quantization configuration. Defaults to {}.
Returns:
block_absorb_dict: dict of absorbed layer per block. eg. {0, [[absorbed_1, xx], [xx]], ...}
block_absorb_dict: The dict of absorbed layers per block, e.g. {0: [[absorbed_1, xx], [xx]], ...}
absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of
block_absorb_dict for all blocks.
"""
block_absorb_dict = {} # record absorbed layer per block
absorb_layer_dict = {} # record absorb layers for absorbed layers
@@ -94,10 +99,12 @@ def _get_absorb_dict(model, absorb_layer_dict):
Args:
model (torch.nn.Module): input model
absorb_layer_dict (dict): The layer dict that scale can be absorbed, default is {}.
absorb_layer_dict (dict): The layer type dict that scale can be absorbed, default is {}.
Returns:
block_absorb_dict: dict of absorbed layers per block, e.g. {0: [[absorbed_1, xx], [xx]], ...}
new_absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of
block_absorb_dict for all blocks.
"""
block_absorb_dict = {}
block_prefix, block_num = get_block_prefix(model)
@@ -121,6 +128,15 @@ def _get_absorb_dict(model, absorb_layer_dict):

@torch.no_grad()
def _get_weight_scale(weight, q_group_size=-1):
"""Get scale for weight.
Args:
weight (Tensor): input weight.
q_group_size (int, optional): how many elements share one scale/zp. Defaults to -1.
Returns:
scale: the scale of input weight.
"""
org_shape = weight.shape
if q_group_size > 0:
weight = weight.view(-1, q_group_size)
@@ -526,6 +542,8 @@ def module_inference(self, model, inputs):


class AWQQuantizer(Quantizer):
"""AWQ Quantizer."""

def __init__(self, quant_config: OrderedDict = {}, absorb_layer_dict: dict = {}):
"""Init an AWQQuantizer object.
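To make the group-wise scale documented in `_get_weight_scale` concrete, a standalone sketch following the usual AWQ recipe (normalize each group by its abs-max, then average over output channels); details may differ from the library's exact implementation:

```python
import torch

def weight_scale(weight: torch.Tensor, q_group_size: int = -1) -> torch.Tensor:
    org_shape = weight.shape
    if q_group_size > 0:
        weight = weight.view(-1, q_group_size)    # each row shares one scale/zp
    scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True)
    return scale.view(org_shape).mean(0)          # one value per input channel

w = torch.randn(16, 64)                           # [out_features, in_features]
print(weight_scale(w, q_group_size=32).shape)     # torch.Size([64])
```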
(Remaining file diffs not loaded on this page.)
