From 0c52e1243b78734e95fc348834303bc3c3cfe369 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 23 Jul 2024 09:59:17 +0800 Subject: [PATCH] Add docstring for WOQ&LayerWise (#1938) Signed-off-by: Kaihui-intel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: xinhe --- .../scripts/codeScan/pydocstyle/scan_path.txt | 3 +- .../torch/algorithms/layer_wise/load.py | 5 +- .../algorithms/layer_wise/modified_pickle.py | 8 +- .../torch/algorithms/layer_wise/utils.py | 35 ++++ .../torch/algorithms/weight_only/__init__.py | 2 +- .../torch/algorithms/weight_only/autoround.py | 21 ++- .../torch/algorithms/weight_only/awq.py | 28 ++- .../torch/algorithms/weight_only/gptq.py | 101 ++++++++++- .../torch/algorithms/weight_only/modules.py | 102 ++++++++++- .../torch/algorithms/weight_only/rtn.py | 4 +- .../torch/algorithms/weight_only/save_load.py | 34 +++- .../torch/algorithms/weight_only/teq.py | 70 +++++--- .../torch/algorithms/weight_only/utility.py | 92 ++++++++-- .../torch/quantization/config.py | 164 +++++++++++++++++- 14 files changed, 599 insertions(+), 70 deletions(-) diff --git a/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt b/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt index b5a69eaa938..1acfa95c75b 100644 --- a/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt +++ b/.azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt @@ -20,4 +20,5 @@ /neural_compressor/torch/algorithms/pt2e_quant /neural_compressor/torch/export /neural_compressor/common -/neural_compressor/torch/algorithms/weight_only/hqq +/neural_compressor/torch/algorithms/weight_only +/neural_compressor/torch/algorithms/layer_wise \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/layer_wise/load.py b/neural_compressor/torch/algorithms/layer_wise/load.py index a883bfe3848..a5176104b76 100644 --- a/neural_compressor/torch/algorithms/layer_wise/load.py +++ b/neural_compressor/torch/algorithms/layer_wise/load.py @@ -152,8 +152,7 @@ def load( # The first line of this docstring overrides the one Sphinx generates for the # documentation. We need it so that Sphinx doesn't leak `pickle`s path from # the build environment (e.g. ` None: + """Init the QDQLayer object.""" super().__init__() self.quant = torch.ao.quantization.QuantStub() self.module = module @@ -43,6 +46,7 @@ def __init__(self, module, input_scale=None) -> None: self.input_scale = input_scale def forward(self, X): + """Forward function.""" if self.input_scale is not None: X = torch.mul(X, self.input_scale) X = self.quant(X) @@ -220,6 +224,16 @@ def _get_path(pretrained_model_name_or_path): def load_value(model, param_name, path): + """Load the module value. + + Args: + model (torch.nn.module): torch model. + param_name (str): module name. + path (str): path to load state_dict per layer. + + Returns: + tensor: the module value. + """ if "lm_head" in param_name and getattr(model.config, "tie_word_embeddings", True): input_embeddings = model.get_input_embeddings() modules = get_named_children(model) @@ -235,6 +249,14 @@ def load_value(model, param_name, path): def load_module(model, module_name, path, device="cpu"): + """Load all named parameters of module. + + Args: + model (torch.nn.module): torch model. + module_name (str): module name. + path (str): path to load state_dict per layer. + device (str, optional): module device. Defaults to "cpu". + """ module = get_module(model, module_name) for n, p in module.named_parameters(): param_name = module_name + "." 
+ n @@ -243,6 +265,18 @@ def load_module(model, module_name, path, device="cpu"): def register_weight_hooks(model, path, device="cpu", clean_weight=True, saved_path=None): + """Register weight hooks for model. + + Args: + model (torch.nn.module): torch model. + path (str): path to load state_dict per layer. + device (str, optional): module device. Defaults to "cpu". + clean_weight (bool, optional): to clean model weight. Defaults to True. + saved_path (str, optional): path to save module weight. Defaults to None. + + Returns: + list: handlers. + """ if saved_path: os.makedirs(saved_path, exist_ok=True) @@ -280,6 +314,7 @@ def hook(module, input, output): def clean_module_weight(module): + """Clean module weight.""" if isinstance(module, QDQLayer): submodule = module.module else: diff --git a/neural_compressor/torch/algorithms/weight_only/__init__.py b/neural_compressor/torch/algorithms/weight_only/__init__.py index fc9ef0a5b3b..3ff6ec8b145 100644 --- a/neural_compressor/torch/algorithms/weight_only/__init__.py +++ b/neural_compressor/torch/algorithms/weight_only/__init__.py @@ -11,6 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""Weight-Only algorithms.""" from .save_load import save, load diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 6f5a022cfee..9ff488573c0 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""AutoRound quantization.""" import copy import json import time @@ -28,6 +28,8 @@ class AutoRoundQuantizer(Quantizer): + """AutoRound Quantizer.""" + def __init__( self, quant_config: dict = {}, @@ -94,11 +96,11 @@ def __init__( lr_scheduler: The learning rate scheduler to be used. dataset (str): The default dataset name (default is "NeelNanda/pile-10k"). enable_quanted_input (bool): Whether to use the output of the previous quantized block as - the input for the current block (default is True). + the input for the current block (default is True). enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). minmax_lr (float): The learning rate for min-max tuning - (default is None, it will be set to lr automatically). + (default is None, it will be set to lr automatically). low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). @@ -111,7 +113,7 @@ def __init__( dynamic_max_gap (int): The dynamic maximum gap (default is -1). data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. + have different choices. multimodal(bool): Enable multimodal model quantization, (default is "False"). act_bits (int): Number of bits for activation quantization. Default is 32. act_group_size (int): Group size for activation quantization. Default is None. 
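
[Reviewer note] A minimal sketch of how the AutoRoundQuantizer documented above is driven end to end: prepare wraps the model to capture calibration inputs, batches from get_dataloader are pushed through the wrapper, and convert runs AutoRound. The example model name, the per-layer quant_config keys ("data_type", "bits", "group_size", "sym"), and the batch layout are illustrative assumptions, not part of this patch.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from neural_compressor.torch.algorithms.weight_only.autoround import AutoRoundQuantizer, get_dataloader

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")      # assumed example model
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    # Assumed per-layer config layout; keys mirror the weight-only options described above.
    weight_config = {
        "model.decoder.layers.0.self_attn.q_proj": {"data_type": "int", "bits": 4, "group_size": 128, "sym": False},
    }
    quantizer = AutoRoundQuantizer(quant_config=weight_config, iters=200, seqlen=2048)

    prepared = quantizer.prepare(model)                  # wraps the model to record calibration args/kwargs
    dataloader = get_dataloader(tokenizer, seqlen=2048)  # NeelNanda/pile-10k calibration set by default
    for batch in dataloader:
        prepared(batch["input_ids"])                     # batch layout is an assumption about the dataloader output
    q_model = quantizer.convert(prepared)                # runs AutoRound and returns the quantized model
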
@@ -153,6 +155,7 @@ def __init__( def prepare(self, model: torch.nn.Module, *args, **kwargs): """Prepares a given model for quantization. + Args: model (torch.nn.Module): The model to be prepared. @@ -163,6 +166,14 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs): return prepare_model def convert(self, model: torch.nn.Module, *args, **kwargs): + """Convert the prepared model to a quantized model. + + Args: + model (torch.nn.Module): the prepared model + + Returns: + The quantized model. + """ dataloader = CapturedDataloader(model.args_list, model.kwargs_list) model = model.orig_model rounder = AutoRound( @@ -216,7 +227,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 split (str, optional): The data split to use. Defaults to None. seed (int, optional): The random seed for reproducibility. Defaults to 42. bs (int, optional): The batch size. Defaults to 4. - n_samples (int, optional): The total number of samples to include. Defaults to 512. + nsamples (int, optional): The total number of samples to include. Defaults to 128. Returns: DataLoader: The DataLoader for the calibrated dataset. diff --git a/neural_compressor/torch/algorithms/weight_only/awq.py b/neural_compressor/torch/algorithms/weight_only/awq.py index b8c4329de3b..63ae6b08564 100644 --- a/neural_compressor/torch/algorithms/weight_only/awq.py +++ b/neural_compressor/torch/algorithms/weight_only/awq.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""AWQ quantization.""" # Copied from neural_compressor/adaptor/torch_utils/awq.py import copy @@ -40,11 +40,16 @@ def _get_absorb_per_block(model, example_inputs, folding=False, weight_config={} """Get absorbed layer per block. Args: - model (torch.nn.Module): input model - example_inputs: example_inputs + model (torch.nn.Module): input model. + example_inputs (tensor/tuple/dict, optional): used to trace torch model. + folding (bool, optional): whether only allow update scale when it can be fold + to upper layer. Defaults to False. + weight_config (dict, optional): the quantization configuration. Defaults to {}. Returns: - block_absorb_dict: dict of absorbed layer per block. eg. {0, [[absorbed_1, xx], [xx]], ...} + block_absorb_dict: The dict of absorbed layer per block. eg. {0, [[absorbed_1, xx], [xx]], ...} + absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of + block_absorb_dict for all blocks. """ block_absorb_dict = {} # record absorbed layer per block absorb_layer_dict = {} # record absorb layers for absorbed layers @@ -94,10 +99,12 @@ def _get_absorb_dict(model, absorb_layer_dict): Args: model (torch.nn.Module): input model - absorb_layer_dict (dict): The layer dict that scale can be absorbed, default is {}. + absorb_layer_dict (dict): The layer type dict that scale can be absorbed, default is {}. Returns: block_absorb_dict: dict of absorbed layer per block. eg. {0, [[absorbed_1, xx], [xx]], ...} + new_absorb_layer_dict: The layer dict that scale can be absorbed. The dict is the inverse of + block_absorb_dict for all blocks. """ block_absorb_dict = {} block_prefix, block_num = get_block_prefix(model) @@ -121,6 +128,15 @@ def _get_absorb_dict(model, absorb_layer_dict): @torch.no_grad() def _get_weight_scale(weight, q_group_size=-1): + """Get scale for weight. 
+ + Args: + weight (tensor): input weight + q_group_size (int, optional): how many elements share one scale/zp. Defaults to -1. + + Returns: + scale: the scale of input weight. + """ org_shape = weight.shape if q_group_size > 0: weight = weight.view(-1, q_group_size) @@ -526,6 +542,8 @@ def module_inference(self, model, inputs): class AWQQuantizer(Quantizer): + """AWQ Quantizer.""" + def __init__(self, quant_config: OrderedDict = {}, absorb_layer_dict: dict = {}): """Init an AWQQuantizer object. diff --git a/neural_compressor/torch/algorithms/weight_only/gptq.py b/neural_compressor/torch/algorithms/weight_only/gptq.py index eae9f7c3a84..43bf5061bfa 100644 --- a/neural_compressor/torch/algorithms/weight_only/gptq.py +++ b/neural_compressor/torch/algorithms/weight_only/gptq.py @@ -14,6 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""GPTQ quantization.""" import gc import math @@ -181,7 +182,8 @@ def __init__( *args, **kwargs, ): - """ + """Init RAWGPTQuantizer. + Args: model: the fp32 model to quantize weight_config (dict, optional): contains all info required by GPTQ. Defaults to {}. For example, @@ -196,10 +198,13 @@ def __init__( } ... } + nsamples (int): the number of calibration data samples. + use_max_length (bool): set all sequence length to be same length. + max_seq_length (int): the same length of all sequence length. dataloader: an iterable containing calibration datasets, contains (inputs, targets) use_layer_wise (bool): Enables quantize model per layer. Defaults to False. model_path (str): Model path that is used to load state_dict per layer. - device: cpu or cuda + device (str): cpu or cuda. """ # model self.model = model @@ -246,6 +251,11 @@ def __init__( self.nsamples = nsamples def prepare_layer_wise(self, model_path): + """Prepare for layer-wise quantization, including registering hooks and setting up the model path. + + Args: + model_path (str): Model path that is used to load state_dict per layer. + """ import os from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks @@ -260,12 +270,20 @@ def prepare_layer_wise(self, model_path): ) def get_full_layer_name(self, sub_layer_name, block_idx): + """Get full layer name. + + Args: + sub_layer_name (str): sub layer name + block_idx (int): index of block + + Returns: + str: The full name of layer. + """ transformer_name = self.gptq_related_blocks["transformers_name"] return ".".join([transformer_name, str(block_idx), sub_layer_name]) def check_layer_config(self): """Copy arguments from weight_config to built-in attributes.""" - for layer_name, config in self.weight_config.items(): self.weight_config[layer_name]["dtype"] = config.get("dtype", self.dtype_default) self.weight_config[layer_name]["bits"] = config.get("bits", self.bits_default) @@ -314,6 +332,14 @@ def get_layer_config(self, layer_name): return config def track_hidden_states(self, data): + """Track hidden states. + + Args: + data (tensor/tuple/list): input data. + + Returns: + tensor. 
+ """ if isinstance(data, torch.Tensor): return data elif isinstance(data, tuple) or isinstance(data, list): @@ -382,6 +408,7 @@ def model_forward(model, *args, **kwargs): @torch.no_grad() def remove_prepare_for_calibration(self): + """Prepare for GPTQ quantization.""" # output inp data shape logger.info("All calibration data's shape =>") # check all hidden_states shape @@ -404,6 +431,15 @@ def remove_prepare_for_calibration(self): logger.info("GPTQ quantization prepared.") def gather_single_batch_from_dict(self, data_dict, idx): + """Gather single batch from a dict. + + Args: + data_dict (dict): data dict. + idx (int): index + + Returns: + dict: single batch. + """ # obtain a set of keyword input from cache single_batch = {} for k, v in data_dict.items(): @@ -411,6 +447,15 @@ def gather_single_batch_from_dict(self, data_dict, idx): return single_batch def gather_single_batch_from_list(self, data_list, idx): + """Gather single batch from a list. + + Args: + data_dict (dict): data list. + idx (int): index + + Returns: + list: single batch. + """ # obtain a set of keyword input from cache single_batch = [] for data_item in data_list: @@ -418,6 +463,11 @@ def gather_single_batch_from_list(self, data_list, idx): return single_batch def update_blockwise_hidden_states(self, outs): + """Update the blockwise hidden states. + + Args: + outs: the output of block. + """ if "hidden_states" in self.cache_key_arguments: self.cache_key_arguments["hidden_states"] = outs[:] else: @@ -645,12 +695,13 @@ def tmp(_, inp, out): class GPTQ: - """ - Please refer to: + """Please refer to the following. + GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers (https://arxiv.org/abs/2210.17323) """ def __init__(self, layer, W, device="cpu"): + """Init GPTQ.""" self.layer = layer self.device = device # W = layer.weight.data.clone() @@ -666,6 +717,7 @@ def __init__(self, layer, W, device="cpu"): self.perm = None # act_order choice def add_batch(self, inp, out): + """Add inputs and outputs to gptq object.""" # if DEBUG: # self.inp1 = inp # self.out1 = out @@ -697,6 +749,22 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) # H = X*X, which should be a sym matrix def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, static_groups=False): + """Run quantization. + + Args: + W (tensor): weight tensor. + block_size (int): Execute quantization per block, block shape = [C_out, block_size]. Default to 128. + percdamp (float): percdamp (float): Percentage of Hessian's diagonal values' average, which will be added + to Hessian's diagonal to increase numerical stability. Defaults to 0.01. + groupsize (int): Size of weight groups. Defaults to -1. + act_order (bool): Whether to sort Hessian's diagonal values to rearrange channel-wise quantization order. + Defaults to False. + static_groups (bool): Whether to calculate group wise quantization parameters in advance. This option + mitigate actorder's extra computational requirements. Default to False. 
+ + Returns: + scale, zero, Q + """ # W = self.layer.weight.data.clone() weight_shape, weight_dtype = W.shape, W.data.dtype if isinstance(self.layer, nn.Conv2d): @@ -824,6 +892,7 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F return scale, zero, Q def free(self): + """Free memory.""" if DEBUG: self.inp1 = None self.out1 = None @@ -834,13 +903,17 @@ def free(self): class Quantizer(nn.Module): + """Quantizer.""" + def __init__(self, shape=1): + """Init Quantizer.""" super(Quantizer, self).__init__() self.maxq = 0 self.register_buffer("scale", torch.zeros(shape)) self.register_buffer("zero", torch.zeros(shape)) def configure(self, weight_config_this_layer, norm=2.4, grid=100, maxshrink=0.8, trits=False): + """Configure the quantizer.""" for k, v in weight_config_this_layer.items(): setattr(self, k, v) # self.maxq = torch.tensor(2**self.bits - 1) @@ -854,6 +927,7 @@ def configure(self, weight_config_this_layer, norm=2.4, grid=100, maxshrink=0.8, self.maxq = -1 def find_params(self, x, weight=False): + """Find scale and zero for weight.""" dev = x.device # NF4 FP4 if self.dtype != "int": @@ -991,6 +1065,11 @@ def quantize(self, x, scale, zero, maxq): return scale * (q - zero) def ready(self): + """Quantizer is ready. + + Returns: + bool: True or False. + """ return torch.all(self.scale != 0) @@ -998,8 +1077,10 @@ def ready(self): class GPTQuantizer(INCQuantizer): + """GPTQ Quantizer.""" + def __init__(self, quant_config={}): - """Init a RTNQuantizer object. + """Init a GPTQQuantizer object. Args: quant_config (OrderedDict, optional): quantization config for ops. Defaults to {}. @@ -1041,6 +1122,14 @@ def prepare( @torch.no_grad() def convert(self, model, *args, **kwargs): + """Convert the prepared model to a quantized model. + + Args: + model (torch.nn.Module): the prepared model + + Returns: + The quantized model. + """ self.gptq_quantizer.model = model self.gptq_quantizer.remove_prepare_for_calibration() diff --git a/neural_compressor/torch/algorithms/weight_only/modules.py b/neural_compressor/torch/algorithms/weight_only/modules.py index 503a469b0c7..969cf455559 100644 --- a/neural_compressor/torch/algorithms/weight_only/modules.py +++ b/neural_compressor/torch/algorithms/weight_only/modules.py @@ -31,7 +31,10 @@ class QDQLayer(torch.nn.Module): + """Quantized and dequantized layer.""" + def __init__(self, module, input_scale=None) -> None: + """Init the QDQLayer object.""" super().__init__() self.quant = torch.ao.quantization.QuantStub() self.module = module @@ -39,6 +42,7 @@ def __init__(self, module, input_scale=None) -> None: self.input_scale = input_scale def forward(self, X): + """Forward function.""" if self.input_scale is not None: X = torch.mul(X, self.input_scale) X = self.quant(X) @@ -48,6 +52,8 @@ def forward(self, X): class WeightOnlyLinear(torch.nn.Module): + """Weight Only Linear.""" + def __init__( self, in_features, @@ -64,6 +70,31 @@ def __init__( device="cpu", use_optimum_format=True, ): + """Init the WeightOnlyLinear object. + + Args: + in_features (int): input features. + out_features (int): out features. + dtype (str, optional): the data type of the quantized model. Defaults to "int". + bits (int, optional): number of bits for quantization. Defaults to 4. + group_size (int, optional): size of the quantization group. Defaults to 32. + zp (bool, optional): zero point. Defaults to False. + bias (bool, optional): module bias. Defaults to False. + scale_dtype (torch.Tensor, optional): the data type of quantization scale to be used. 
+ Defaults to torch.float32. + compression_dtype (torch.Tensor, optional): the target dtype after comoression. + Defaults to torch.int32. + compression_dim (int, optional): select from [0, 1], 0 is output channel, 1 is input channel. + Defaults to 1. + g_idx (bool, optional): for recording the channel order. + device (str, optional): choose device for compression. Defaults to cpu. + use_optimum_format (bool, optional): use the popular huggingface compression format. + 1: compression_dim: weight = 1, zeros = 0 and both are transposed. + 2: zeros -= 1 before compression. + 3: g_idx: use same number for one group instead of recording the channel order. + 4. parameter name changed, such as 'packed_weight' -> 'qweight'. + 5. zeros is always needed even for sym. + """ super().__init__() self.use_optimum_format = use_optimum_format self.dtype = dtype @@ -172,6 +203,7 @@ def __init__( self.g_idx = None def pack(self, int_weight, scale, zp, bias, g_idx=None): + """Pack int weight.""" if self.use_optimum_format: self.scales = self.scales.T.contiguous() self.qweight = self.qweight.T.contiguous() @@ -225,6 +257,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None): self.qzeros = self.qzeros.T.contiguous() def recover(self): + """Recover fp32 weight from packed weight.""" logger.debug(f"Recovering {self} weight") scales = self.scales.T.contiguous() if self.use_optimum_format else self.scales qweight = self.qweight.T.contiguous() if self.use_optimum_format else self.qweight @@ -271,6 +304,14 @@ def recover(self): return fp32_weight def pack_tensor_with_torch(self, raw_tensor): + """Pack the tensor with torch. + + Args: + raw_tensor (tensor): raw tensor. + + Returns: + tensor: packed tensor. + """ target_len = math.ceil(raw_tensor.shape[1] / self.n_pack) packed_tensor = torch.zeros(raw_tensor.shape[0], target_len, dtype=self.compression_dtype).to(raw_tensor.device) mask = torch.tensor(2**self.bits - 1, dtype=self.compression_dtype).to(raw_tensor.device) @@ -286,6 +327,14 @@ def pack_tensor_with_torch(self, raw_tensor): return packed_tensor def unpack_tensor_with_torch(self, packed_tensor): + """Unpack the tensor with torch. + + Args: + packed_tensor (tensor): packed tensor. + + Returns: + tensor: unpacked tensor. 
+ """ target_dtype = torch.int8 if not hasattr(self, "qzeros") or "int" not in self.dtype else torch.uint8 target_len = packed_tensor.shape[1] * self.n_pack unpacked_tensor = torch.zeros(packed_tensor.shape[0], target_len, dtype=target_dtype).to(packed_tensor.device) @@ -307,6 +356,7 @@ def unpack_tensor_with_torch(self, packed_tensor): def pack_array_with_numba_b4_c32( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=4 and compress_bits=32.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 7] & 0b1111) << 28) @@ -325,6 +375,7 @@ def pack_array_with_numba_b4_c32( def pack_array_with_numba_b4_c16( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=4 and compress_bits=16.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 3] & 0b1111) << 12) @@ -339,6 +390,7 @@ def pack_array_with_numba_b4_c16( def pack_array_with_numba_b4_c8( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=4 and compress_bits=8.""" for i in range(new_in_features): packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111) return packed_array @@ -348,6 +400,7 @@ def pack_array_with_numba_b4_c8( def pack_array_with_numba_b4_c64( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=4 and compress_bits=64.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 15] & 0b1111) << 60) @@ -374,6 +427,7 @@ def pack_array_with_numba_b4_c64( def pack_array_with_numba_b8_c32( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=8 and compress_bits=32.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) @@ -388,6 +442,7 @@ def pack_array_with_numba_b8_c32( def pack_array_with_numba_b8_c16( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=8 and compress_bits=16.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 3] & 0b11111111) << 24) @@ -402,6 +457,7 @@ def pack_array_with_numba_b8_c16( def pack_array_with_numba_b8_c8( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=8 and compress_bits=8.""" for i in range(new_in_features): packed_array[:, i] = raw_array[:, i * n_pack] & 0b11111111 return packed_array @@ -411,6 +467,7 @@ def pack_array_with_numba_b8_c8( def pack_array_with_numba_b8_c64( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=8 and compress_bits=64.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 7] & 0b11111111) << 56) @@ -429,6 +486,7 @@ def pack_array_with_numba_b8_c64( def pack_array_with_numba_b2_c32( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=2 and compress_bits=32.""" for i in range(new_in_features): packed_array[:, i] = ( 
((raw_array[:, i * n_pack + 15] & 0b11) << 30) @@ -455,6 +513,7 @@ def pack_array_with_numba_b2_c32( def pack_array_with_numba_b2_c16( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=2 and compress_bits=16.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 7] & 0b11) << 14) @@ -473,6 +532,7 @@ def pack_array_with_numba_b2_c16( def pack_array_with_numba_b2_c8( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=2 and compress_bits=8.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 3] & 0b11) << 6) @@ -487,6 +547,7 @@ def pack_array_with_numba_b2_c8( def pack_array_with_numba_b2_c64( raw_array: np.ndarray, packed_array: np.ndarray, n_pack: int, new_in_features: int ) -> np.ndarray: + """Pack the array with numba when bits=2 and compress_bits=64.""" for i in range(new_in_features): packed_array[:, i] = ( ((raw_array[:, i * n_pack + 31] & 0b11) << 62) @@ -549,6 +610,7 @@ def pack_array_with_numba( return pack_method(raw_array, packed_array, n_pack, new_in_features) def pack_tensor_with_numpy_impl(self, raw_tensor): + """The implement of packing tensor with numpy.""" raw_array = raw_tensor.cpu().numpy() target_len = np.ceil(raw_array.shape[1] / self.n_pack).astype(int) target_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype @@ -567,6 +629,7 @@ def pack_tensor_with_numpy_impl(self, raw_tensor): return packed_tensor def pack_tensor_with_numpy(self, raw_tensor): + """Pack the tensor with numpy.""" if self.bits not in [2, 4, 8]: return self.pack_tensor_with_numpy_impl(raw_tensor) compression_dtype = torch.tensor(0, dtype=self.compression_dtype).numpy().dtype @@ -576,6 +639,7 @@ def pack_tensor_with_numpy(self, raw_tensor): return torch.from_numpy(packed_array).to(device=raw_tensor.device) def unpack_tensor_with_numpy(self, packed_tensor): + """Unpack the packed tensor with numpy.""" packed_array = packed_tensor.cpu().numpy() target_dtype = np.int8 if not hasattr(self, "qzeros") or "int" not in self.dtype else np.uint8 target_len = packed_array.shape[1] * self.n_pack @@ -595,18 +659,21 @@ def unpack_tensor_with_numpy(self, packed_tensor): return unpacked_tensor def pack_tensor(self, raw_tensor): + """Pack tensor.""" if "cuda" in raw_tensor.device.type: return self.pack_tensor_with_torch(raw_tensor) else: return self.pack_tensor_with_numpy(raw_tensor) def unpack_tensor(self, packed_tensor): + """Unpack tensor.""" if "cuda" in packed_tensor.device.type: return self.unpack_tensor_with_torch(packed_tensor) else: return self.unpack_tensor_with_numpy(packed_tensor) def forward(self, input): + """Forward function.""" if not hasattr(self, "weight"): weight = self.recover() device = self.scales.device @@ -624,6 +691,11 @@ def forward(self, input): return F.linear(input, weight, self.bias) def extra_repr(self) -> str: + """Extract the configuration string. + + Returns: + str: the configuration string. + """ tmp_str = "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( self.in_features, self.out_features, @@ -657,7 +729,8 @@ def forward(ctx, inputs, num_bits=4, group_size=1024, scheme="asym"): @staticmethod def backward(ctx, grad_outputs): - """ + """Backward function. + Args: ctx: Pytorch convention. 
grad_output: A tensor of gradient of outputs @@ -672,11 +745,15 @@ class TEQLinearFakeQuant(torch.nn.Module): """Wrapper quantization linear.""" def __init__(self, orig_layer, alpha=None, num_bits=4, group_size=-1, scheme="asym"): - """A forward hook to linear module - :param orig_layer: the original module - :param alpha: trainable alpha/scale - :param num_bits: quantization level - :param group_size: for fine-grained quantization.""" + """A forward hook to linear module. + + Args: + orig_layer: the original module + alpha: trainable alpha/scale + num_bits: quantization level + group_size: for fine-grained quantization. + scheme: symmetric quantization or asymmetric quantization. + """ super(TEQLinearFakeQuant, self).__init__() self.orig_layer = orig_layer self.alpha = alpha @@ -686,6 +763,7 @@ def __init__(self, orig_layer, alpha=None, num_bits=4, group_size=-1, scheme="as self.scheme = scheme def forward(self, x): + """Forward function.""" alpha = torch.clip(self.alpha, 1e-5) shape_len = len(x.shape) - 1 shape = (1,) * shape_len + (-1,) @@ -700,9 +778,12 @@ class MulLinear(torch.nn.Module): """Linear wrapper to apply scale to input.""" def __init__(self, module, input_scale=None): - """A forward hook to save input max of a module - :param module: the linear module - :param input_scale: scale for input.""" + """A forward hook to save input max of a module. + + Args: + module: the linear module. + input_scale: scale for input. + """ super().__init__() if input_scale is None: input_scale = torch.empty(module.in_features) @@ -711,13 +792,16 @@ def __init__(self, module, input_scale=None): @property def weight(self): + """Property weight.""" return self.linear.weight @weight.setter def weight(self, weight): + """Property weight setter.""" self.linear.weight = weight def forward(self, X): + """Forward function.""" X = torch.mul(X, self.input_scale) X = self.linear(X) return X diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index c04327a62f4..509674d01c6 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""RTN quantization.""" import copy from collections import OrderedDict @@ -42,6 +42,8 @@ class RTNQuantizer(Quantizer): + """RTN Quantizer.""" + def __init__(self, quant_config: OrderedDict = {}): """Init a RTNQuantizer object. diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py index 4a6e6a0d488..b3e2d95523b 100644 --- a/neural_compressor/torch/algorithms/weight_only/save_load.py +++ b/neural_compressor/torch/algorithms/weight_only/save_load.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""WOQ save and load.""" # pylint:disable=import-error import copy @@ -26,6 +26,12 @@ def save(model, output_dir="./saved_results"): + """Save the quantized model and config to the output path. + + Args: + model (torch.nn.module): raw fp32 model or prepared model. + output_dir (str, optional): output path to save. 
+ """ os.makedirs(output_dir, exist_ok=True) qmodel_weight_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME) @@ -65,6 +71,7 @@ def load(model_name_or_path, original_model=None, format=LoadFormat.DEFAULT, dev kwargs (remaining dictionary of keyword arguments, optional): remaining dictionary of keyword arguments for loading huggingface models. will be passed to the huggingface model's `__init__` method, such as 'trust_remote_code', 'revision'. + Returns: torch.nn.Module: quantized model """ @@ -74,7 +81,10 @@ def load(model_name_or_path, original_model=None, format=LoadFormat.DEFAULT, dev class WOQModelLoader: + """WOQ Model Loader.""" + def __init__(self, model_name_or_path, original_model=None, format=LoadFormat.DEFAULT, device="cpu", **kwargs): + """Init the WOQModelLoader object.""" # TODO: When loading WOQ model, use different WeightOnlyLinear module according to device. self.model_name_or_path = model_name_or_path self.original_model = original_model @@ -85,6 +95,14 @@ def __init__(self, model_name_or_path, original_model=None, format=LoadFormat.DE self.loaded_state_dict_keys = {} def load_woq_model(self): + """Load quantized weight-only quantization model. + + Raises: + ValueError: `format` in load function can only be 'huggingface' or 'default'. + + Returns: + torch.nn.Module: quantized model + """ if self.format == LoadFormat.HUGGINGFACE: model = self.load_hf_format_woq_model() logger.info("Loading HuggingFace weight-only quantization model successfully.") @@ -119,6 +137,15 @@ def load_woq_model(self): return model def load_inc_format_woq_model(self, qmodel_weight_file_path, qconfig_file_path): + """Load INC weight-only quantized model in local. + + Args: + qmodel_weight_file_path (str): path to the quantized model. + qconfig_file_path (str): path to the quant config. + + Returns: + torch.nn.Module: quantized model + """ qweights = torch.load(qmodel_weight_file_path) self.loaded_state_dict_keys = qweights.keys() @@ -130,6 +157,11 @@ def load_inc_format_woq_model(self, qmodel_weight_file_path, qconfig_file_path): return model def load_hf_format_woq_model(self): + """Load HuggingFace weight-only quantized model. + + Returns: + torch.nn.Module: quantized model + """ # check required package from neural_compressor.torch.utils import is_package_available diff --git a/neural_compressor/torch/algorithms/weight_only/teq.py b/neural_compressor/torch/algorithms/weight_only/teq.py index 595a2e8479f..f97efcf4e99 100644 --- a/neural_compressor/torch/algorithms/weight_only/teq.py +++ b/neural_compressor/torch/algorithms/weight_only/teq.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +"""TEQ quantization.""" from typing import Any, List @@ -39,10 +39,15 @@ class TrainableEquivalentTransformation: _PREPARE_ATTRS_PREFIX = "_prepare_" def __init__(self, model, weight_config={}, absorb_to_layer=None, folding=True, example_inputs=None): - """ - :param model: the model for quantization - :param weight_config (dict, optional): contains all info required by RTN. Defaults to {}. - :param example_inputs: inputs for trace + """Init the TrainableEquivalentTransformation object. + + Args: + model (torch.nn.module): the model for quantization + weight_config (dict, optional): contains all info required by RTN. Defaults to {}. 
+ absorb_to_layer (dict): The layer dict that scale can be absorbed. Default to None. + folding(bool): Allow insert mul before linear when the scale cannot be absorbed by last layer. + Default to True. + example_inputs: inputs for trace. Default to None. """ self.model = model self.weight_config = weight_config @@ -78,8 +83,11 @@ def _post_init(self): self._post_initialized = True def _get_device(self): - """Get the model device - :return:Model device.""" + """Get the model device. + + Returns: + str: Model device. + """ device = get_accelerator().current_device_name() return device @@ -88,10 +96,11 @@ def _get_dtype(self): return p.data.dtype def add_tuning_scale(self, sqrt_w_init=False): - """The main entry of smooth quant - to the paper for more details - :param sqrt_w_init: use sqrt weight to init.""" + """Add tuning scales. + Args: + sqrt_w_init: use sqrt weight to init. + """ if not self.absorb_to_layer: self.absorb_to_layer = self._detect_absorb_to_layer(self.model, self.folding, self.example_inputs) if not self._post_initialized: @@ -157,10 +166,13 @@ def add_tuning_scale(self, sqrt_w_init=False): @torch.no_grad() def _absorb_scales(self, layer, scale, layer_name=""): - """Absorb the scale to the layer at output channel - :param layer: The module - :param scale: The scale to be absorbed - :param layer_name: The layer name.""" + """Absorb the scale to the layer at output channel. + + Args: + layer: the module. + scale: the scale to be absorbed. + layer_name: the layer name. + """ # for insert mul if not self.folding: # pragma: no cover if isinstance(layer, MulLinear): @@ -226,10 +238,12 @@ def _absorb_scales(self, layer, scale, layer_name=""): @torch.no_grad() def _scale_layer_weight(self, layer, scale): ##input channel - """Scale the layer weights at input channel, depthwise conv output channel - :param layer_name: The layer name - :param scale: The scale to be multiplied - :return:""" + """Scale the layer weights at input channel, depthwise conv output channel. + + Args: + layer: the layer. + scale: the scale to be multiplied. + """ if layer.__class__.__name__ == "MulLinear": layer = layer.linear @@ -331,10 +345,11 @@ def quantize(self, **kwargs): self.model = model def save(self, save_scale_file="", save_state_dict_file=""): - """ - save alpha/scale or model weight - :param save_scale_file: save alpha/scale with torch.save - :param save_state_dict_file: save model state_dict + """Save alpha/scale or model weight. + + Args: + save_scale_file: path to save alpha/scale with torch.save. + save_state_dict_file: path to save model state_dict. """ if save_scale_file: # pragma: no cover torch.save(self.trained_alphas, save_scale_file) @@ -344,8 +359,10 @@ def save(self, save_scale_file="", save_state_dict_file=""): class TEQuantizer(Quantizer): + """TEQ Quantizer.""" def __init__(self, quant_config, folding, example_inputs, absorb_to_layer=None): + """Init the TEQuantizer object.""" super().__init__(quant_config=quant_config) self.folding = folding self.absorb_to_layer = absorb_to_layer @@ -363,6 +380,7 @@ def prepare(self, model, *args, **kwargs): Args: model: A float model to be quantized. + Returns: A prepared model. """ @@ -376,6 +394,14 @@ def prepare(self, model, *args, **kwargs): return float_model def convert(self, model, *args: Any, **kwargs: Any): + """Convert the prepared model to a quantized model. + + Args: + model (torch.nn.Module): the prepared model + + Returns: + The quantized model. 
+ """ for attr in self._quantizer._PREPARE_ATTRS: setattr(self._quantizer, attr, getattr(model, self._quantizer._PREPARE_ATTRS_PREFIX + attr, None)) self._quantizer.model = model diff --git a/neural_compressor/torch/algorithms/weight_only/utility.py b/neural_compressor/torch/algorithms/weight_only/utility.py index 8f46b778ec5..255c2d6db2a 100644 --- a/neural_compressor/torch/algorithms/weight_only/utility.py +++ b/neural_compressor/torch/algorithms/weight_only/utility.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +"""Weight-Only utility.""" import torch from neural_compressor.torch.utils import accelerator, device_synchronize, logger @@ -527,6 +527,7 @@ def quant_weight_w_scale(weight, scale, zp=None, group_size=-1, dtype="int"): # AWQ Required, copy from neural_compressor/adaptor/torch_utils/smooth_quant.py def model_forward(model, dataloader, iters, device): + """The model forward function.""" try: cnt = 0 for idx, (input, label) in enumerate(dataloader): @@ -546,6 +547,7 @@ def model_forward(model, dataloader, iters, device): # copy from neural_compressor/adaptor/torch_utils/smooth_quant.py # TODO: potential bug, data type def forward_wrapper(model, input, device=torch.device("cpu")): + """The forward wrapper.""" try: model = model.to(device) input = move_input_to_device(input, device) @@ -566,6 +568,7 @@ def forward_wrapper(model, input, device=torch.device("cpu")): # copy from neural_compressor/adaptor/torch_utils/smooth_quant.py def move_input_to_device(input, device=torch.device("cpu")): + """Move input to the spevific device.""" if isinstance(input, dict) or isinstance(input, UserDict): tmp_input = {} for k, inp in input.items(): @@ -669,6 +672,7 @@ def get_absorb_layers(model, example_inputs, supported_layers=["Linear"], foldin # copy from neural_compressor/adaptor/torch_utils/smooth_quant.py def get_parent(node, all_parents=False): + """Get parent of node.""" if node.inputs() is None: return None elif len(list(node.inputs())) == 0: @@ -705,9 +709,10 @@ def get_module(model, key): # copy from neural_compressor/adaptor/torch_utils/smooth_quant.py class GraphTrace: - """""" + """GraphTrace.""" def __init__(self): + """Init the GraphTrace object.""" self.supported_torch_module_to_aten = { "Linear": "aten::linear", "Conv2d": "aten::_convolution", @@ -736,6 +741,15 @@ def __init__(self): ] ##TODO,support more norm def trace(self, model, dummy_input): + """Trace a torch model. + + Args: + model (torch.nn.module): model to be trace. + dummy_input : dummy input. + + Returns: + traced model. + """ traced_model = None optimize_numerics = False orig_device = str(next(model.parameters()).device) @@ -775,6 +789,15 @@ def trace(self, model, dummy_input): return traced_model def get_nodes(self, traced_model, op_types=["Linear"]): + """Get nodes from traced model. + + Args: + traced_model: traced model. + op_types (list, optional): . Defaults to ["Linear"]. + + Returns: + list: nodes. + """ if isinstance(op_types, str): op_types = [op_types] nodes = [] @@ -787,6 +810,14 @@ def get_nodes(self, traced_model, op_types=["Linear"]): return nodes def get_prev_absorb_layer(self, nodes): + """Get previous absorb layers. + + Args: + nodes (list): target nodes. 
+ + Returns: + list: previous absorb layer + """ prev_absorb_layer = [] for node in nodes: parent = get_parent(node) @@ -815,6 +846,14 @@ def get_prev_absorb_layer(self, nodes): return prev_absorb_layer def skip_op_absorb_helper(self, parent_node): + """Skip op absorption. + + Args: + parent_node : parent node. + + Returns: + bool: True or False. + """ for val_user in list(parent_node.outputs())[0].uses(): next_node = val_user.user if next_node.kind() == "aten::size": @@ -830,6 +869,14 @@ def skip_op_absorb_helper(self, parent_node): return True def mapping_torch_module_to_aten(self, op_types): + """Mapping torch module to aten. + + Args: + op_types : op types. + + Returns: + list: the mapping results. + """ res = [] for op in op_types: if op not in self.supported_torch_module_to_aten.keys(): @@ -840,11 +887,7 @@ def mapping_torch_module_to_aten(self, op_types): return res def _check_valid_conv(self, module): - """Remove group conv except depthwise conv - :param module: - - :return: - """ + """Remove group conv except depthwise conv.""" if not isinstance(module, torch.nn.Conv2d): return True if module.groups > 1: @@ -855,6 +898,17 @@ def _check_valid_conv(self, module): return True def get_absorb_to_layer(self, model, example_input, op_types, skip_unsupported_layers=True): + """Get absorbed layers of a model. + + Args: + model: torch model + example_input: used to trace torch model. + op_types: op types. + skip_unsupported_layers (bool, optional): unsupported layers to skip. Defaults to True. + + Returns: + absorb to layer, no absorb layers + """ traced_model = self.trace(model, example_input) if traced_model is None: return None, None @@ -883,6 +937,16 @@ def get_absorb_to_layer(self, model, example_input, op_types, skip_unsupported_l return absorb_to_layer, no_absorb_layers def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers): + """Remove unsupported layers from layers to be absorb. + + Args: + model : torch model. + absorb_to_layer (dict): layers to be absorb. + no_absorb_layers (dict): unsupported layers. + + Returns: + dict: the new layers to be absorb. + """ res = {} for key in absorb_to_layer.keys(): absorb_layer = get_module(model, key) @@ -931,6 +995,7 @@ def get_example_input(dataloader, i=1): Args: dataloader (object): calibration dataset. + Returns: example_inp (object). """ @@ -1044,10 +1109,15 @@ def get_module_input_output( total_values = defaultdict(defaultdict) def _save_input_output_hook(name, record_input=False, record_output=False): - """ - A forward hook to save input and output values of a module - param name: the module name - return: A hook function + """A forward hook to save input and output values of a module. + + Args: + name: the module name. + record_input (bool): to record input. + record_ouput (bool): to record output. + + Returns: + A hook function """ def _hook(module, inputs, outputs): diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 75e6460a53e..66f01a50c75 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -162,6 +162,8 @@ def __init__( double_quant_use_sym (bool): Indicates whether double_quant scale are symmetric. Default is True. double_quant_group_size (int): Size of double_quant groups. Default is 32. quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformers。 Default is False. 
+ white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. + Default is DEFAULT_WHITE_LIST. """ super().__init__(white_list=white_list) self.dtype = dtype @@ -184,6 +186,11 @@ def __init__( @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: + """Register supported configurations for RTN. + + Returns: + List[OperatorConfig]: List of supported operator configurations. + """ supported_configs = [] linear_rtn_config = RTNConfig( dtype=[ @@ -220,6 +227,16 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: + """Convert the configuration to a mapping. + + Args: + config_list (List[BaseConfig]): List of base configurations. Default is None. + model_info (List[Tuple[str, str]]): List of tuples containing the name and type of each module in the model. + Default is None. + + Returns: + OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: The configuration mapping. + """ if not self.quant_lm_head: self.set_local( LM_HEAD_NAMES, RTNConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) @@ -229,6 +246,14 @@ def to_config_mapping( @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: + """Get information about the model. + + Args: + model (torch.nn.Module): The model. + + Returns: + List[Tuple[str, Callable]]: List of tuples containing the name and type of each module in the model. + """ filter_result = [] for op_name, module in model.named_modules(): if isinstance(module, WOQ_WHITE_LIST): @@ -239,12 +264,22 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: @classmethod def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: + """Get the configuration set for tuning. + + Returns: + Union[None, "RTNConfig", List["RTNConfig"]]: The configuration set for tuning. + """ return RTNConfig( dtype=["int4", "nf4"], use_sym=[True, False], group_size=[32, 128], use_mse_search=[False, True] ) @classmethod def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "RTNConfig"]: + """Get the predefined configuration set. + + Returns: + Dict[torch_utils.ProcessorType, "RTNConfig"]: The configuration of RTN. + """ pre_defined_configs: Dict[torch_utils.ProcessorType, RTNConfig] = {} pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) pre_defined_configs[torch_utils.ProcessorType.Server] = cls() @@ -252,11 +287,28 @@ def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "RTNConfig"]: def get_default_rtn_config(processor_type: Optional[Union[str, torch_utils.ProcessorType]] = None) -> RTNConfig: + """Get the default configuration of RTN. + + Args: + processor_type (Optional[Union[str, torch_utils.ProcessorType]], optional): The user-specified processor type. + Defaults to None. + + Returns: + RTNConfig: _description_ + """ process_type = torch_utils.get_processor_type_from_user_config(processor_type) return RTNConfig.get_predefined_configs()[process_type] def get_default_double_quant_config(type="BNB_NF4"): + """Get the default configuration of double quant. + + Args: + type (str, optional): double quant type. Defaults to "BNB_NF4". + + Returns: + dict: double quant config. 
+ """ from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS assert type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format(list(DOUBLE_QUANT_CONFIGS.keys())) @@ -348,6 +400,8 @@ def __init__( static_groups (bool): Whether to calculate group wise quantization parameters in advance. This option mitigate actorder's extra computational requirements. Default is False. + white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. + Default is DEFAULT_WHITE_LIST. """ assert not quant_lm_head, "GPTQ doesn't support lm_head quantization currently, it's coming soon!" super().__init__(white_list=white_list) @@ -375,6 +429,11 @@ def __init__( @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: + """Register supported configurations for GPTQ. + + Returns: + List[OperatorConfig]: List of supported operator configurations. + """ supported_configs = [] # TODO(Yi) linear_gptq_config = GPTQConfig() @@ -385,6 +444,16 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: + """Convert the configuration to a mapping. + + Args: + config_list (List[BaseConfig]): List of base configurations. Default is None. + model_info (List[Tuple[str, str]]): List of tuples containing the name and type of each module in the model. + Default is None. + + Returns: + OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: The configuration mapping. + """ if not self.quant_lm_head: self.set_local( LM_HEAD_NAMES, GPTQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) @@ -394,6 +463,14 @@ def to_config_mapping( @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: + """Get information about the model. + + Args: + model (torch.nn.Module): The model. + + Returns: + List[Tuple[str, Callable]]: List of tuples containing the name and type of each module in the model. + """ filter_result = [] for op_name, module in model.named_modules(): if isinstance(module, WOQ_WHITE_LIST): @@ -404,6 +481,11 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: @classmethod def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: + """Get the configuration set for tuning. + + Returns: + Union[None, "GPTQConfig", List["GPTQConfig"]]: The configuration set for tuning. + """ # TODO fwk owner needs to update it. return GPTQConfig(act_order=[True, False], use_sym=[False, True]) @@ -505,6 +587,8 @@ def __init__( folding(bool): Allow insert mul before linear when the scale cannot be absorbed by last layer, default is False. absorb_layer_dict (dict): The layer dict that scale can be absorbed, default is {}. + white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. + Default is DEFAULT_WHITE_LIST. """ super().__init__(white_list=white_list) self.dtype = dtype @@ -531,6 +615,11 @@ def __init__( @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: + """Register supported configurations for AWQ. + + Returns: + List[OperatorConfig]: List of supported operator configurations. 
+ """ supported_configs = [] # TODO(Yi) linear_awq_config = AWQConfig() @@ -541,6 +630,16 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: + """Convert the configuration to a mapping. + + Args: + config_list (List[BaseConfig]): List of base configurations. Default is None. + model_info (List[Tuple[str, str]]): List of tuples containing the name and type of each module in the model. + Default is None. + + Returns: + OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: The configuration mapping. + """ if not self.quant_lm_head: self.set_local( LM_HEAD_NAMES, AWQConfig(dtype="fp32", use_layer_wise=self.use_layer_wise, model_path=self.model_path) @@ -550,6 +649,14 @@ def to_config_mapping( @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: + """Get information about the model. + + Args: + model (torch.nn.Module): The model. + + Returns: + List[Tuple[str, Callable]]: List of tuples containing the name and type of each module in the model. + """ filter_result = [] for op_name, module in model.named_modules(): if isinstance(module, WOQ_WHITE_LIST): @@ -560,6 +667,11 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: @classmethod def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: + """Get the configuration set for tuning. + + Returns: + Union[None, "AWQConfig", List["AWQConfig"]]: The configuration set for tuning. + """ # TODO fwk owner needs to update it. return AWQConfig(bits=[4, 6]) @@ -648,6 +760,8 @@ def __init__( absorb_to_layer (dict): The layer dict that scale can be absorbed, default is {}. folding(bool): Allow insert mul before linear when the scale cannot be absorbed by last layer, default is False. + white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. + Default is DEFAULT_WHITE_LIST. """ super().__init__(white_list=white_list) self.dtype = dtype @@ -671,6 +785,11 @@ def __init__( @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: + """Register supported configurations for TEQ. + + Returns: + List[OperatorConfig]: List of supported operator configurations. + """ supported_configs = [] # TODO(Yi) linear_teq_config = TEQConfig() @@ -681,6 +800,16 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def to_config_mapping( self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None ) -> OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: + """Convert the configuration to a mapping. + + Args: + config_list (List[BaseConfig]): List of base configurations. Default is None. + model_info (List[Tuple[str, str]]): List of tuples containing the name and type of each module in the model. + Default is None. + + Returns: + OrderedDictType[Union[str, str], OrderedDictType[str, BaseConfig]]: The configuration mapping. + """ if not self.quant_lm_head: self.set_local(LM_HEAD_NAMES, TEQConfig(dtype="fp32")) config_mapping = super().to_config_mapping(config_list, model_info) @@ -688,6 +817,14 @@ def to_config_mapping( @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: + """Get information about the model. + + Args: + model (torch.nn.Module): The model. 
+ + Returns: + List[Tuple[str, Callable]]: List of tuples containing the name and type of each module in the model. + """ filter_result = [] for op_name, module in model.named_modules(): if isinstance(module, WOQ_WHITE_LIST): @@ -698,6 +835,11 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: @classmethod def get_config_set_for_tuning(cls) -> Union[None, "TEQConfig", List["TEQConfig"]]: + """Get the configuration set for tuning. + + Returns: + Union[None, "TEQConfig", List["TEQConfig"]]: The configuration set for tuning. + """ # TODO fwk owner needs to update it. return TEQConfig(bits=[4, 6]) @@ -805,9 +947,11 @@ def __init__( not_use_best_mse (bool): Whether to use mean squared error (default is False). dynamic_max_gap (int): The dynamic maximum gap (default is -1). scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. + have different choices. use_layer_wise (bool): Enables quantize model per layer. Defaults to False. multimodal(bool): Enable multimodal model quantization, (default is "False"). + white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types. + Default is DEFAULT_WHITE_LIST. """ super().__init__(white_list=white_list) self.dtype = dtype @@ -842,6 +986,11 @@ def __init__( @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: + """Register supported configurations for AutoRound. + + Returns: + List[OperatorConfig]: List of supported operator configurations. + """ supported_configs = [] # TODO(Yi) linear_AUTOROUND_config = AutoRoundConfig() @@ -851,6 +1000,14 @@ def register_supported_configs(cls) -> List[OperatorConfig]: @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: + """Get information about the model. + + Args: + model (torch.nn.Module): The model. + + Returns: + List[Tuple[str, Callable]]: List of tuples containing the name and type of each module in the model. + """ filter_result = [] for op_name, module in model.named_modules(): if isinstance(module, WOQ_WHITE_LIST): @@ -861,6 +1018,11 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: @classmethod def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoRoundConfig"]]: + """Get the configuration set for tuning. + + Returns: + Union[None, "AutoRoundConfig", List["AutoRoundConfig"]]: The configuration set for tuning. + """ # TODO fwk owner needs to update it. return AutoRoundConfig(bits=[4, 6])
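
[Reviewer note] As a closing illustration of the APIs this patch documents, here is a minimal, hedged sketch of the typical weight-only flow: build an RTNConfig (using the options shown in RTNConfig.get_config_set_for_tuning), quantize, then persist and reload with the save/load pair from weight_only/save_load.py. The prepare/convert import path and the example model name are assumptions based on the package layout rather than something introduced by this patch.

    import torch
    from transformers import AutoModelForCausalLM
    from neural_compressor.torch.quantization import RTNConfig, prepare, convert   # assumed entry points
    from neural_compressor.torch.algorithms.weight_only.save_load import save, load

    fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")          # assumed example model

    quant_config = RTNConfig(dtype="int4", use_sym=True, group_size=32)

    model = prepare(fp32_model, quant_config)   # RTN needs no calibration data
    model = convert(model)                      # supported Linear layers become WeightOnlyLinear modules (see modules.py)

    save(model, output_dir="./saved_results")   # writes the quantized weights and quant config under output_dir
    loaded = load("./saved_results", original_model=AutoModelForCausalLM.from_pretrained("facebook/opt-125m"))
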