From 1c426a0738c68403a2ef3aaef9b19ecff8f2721a Mon Sep 17 00:00:00 2001
From: Yiyang Cai <49231152+YIYANGCAI@users.noreply.github.com>
Date: Tue, 23 Jan 2024 20:45:27 +0800
Subject: [PATCH] Support static_groups options in GPTQ API (#1478)

Signed-off-by: YIYANGCAI
---
 docs/source/quantization_weight_only.md        |  1 +
 .../quantization/llm/run_clm_no_trainer.py     |  6 +++-
 neural_compressor/adaptor/pytorch.py           |  1 +
 neural_compressor/adaptor/torch_utils/gptq.py  | 35 ++++++++++++++++---
 .../test_weight_only_quantization.py           |  1 +
 5 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index 25ddd78400b..50f5913893e 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -86,6 +86,7 @@ Notes:
 | pad_max_length | 2048 | Whether to align calibration data to a fixed length. This value should not exceed model's acceptable sequence length. Please refer to model's config json to find out this value.|
 | use_max_length | False | Whether to align all calibration data to fixed length, which equals to pad_max_length. |
 | block_size | 128 | Execute GPTQ quantization per block, block shape = [$C_{out}$, block_size] |
+| static_groups | False | Whether to calculate group-wise quantization parameters in advance. This option mitigates the extra computational cost introduced by act_order. |
 
 **Note:** Neural compressor provides `Unsigned integer for asymmetric quantization` and `Signed integer for symmetric quantization`. Please follow the below section to compress the low bit data type for saving.
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 5aae4b28d51..fc3799f5fd9 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -78,6 +78,7 @@
                     this should align with your model config, \
                     and your dataset builder args: args.pad_max_length')
 parser.add_argument('--gptq_debug', action='store_true', help='Whether to use debug model ')
+parser.add_argument('--gptq_static_groups', action='store_true', help='Use pre-determined (static) groups to do quantization')
 # ==============code generation args===========
 parser.add_argument("--code_generation", action="store_true")
 parser.add_argument("--n_samples", default=200, type=int)
@@ -277,7 +278,8 @@ def calib_func(prepared_model):
             'block_size': args.gptq_block_size,
             'nsamples': args.gptq_nsamples,
             'use_max_length': args.gptq_use_max_length,
-            'pad_max_length': args.gptq_pad_max_length
+            'pad_max_length': args.gptq_pad_max_length,
+            'static_groups': args.gptq_static_groups,
         }
         # GPTQ: use assistive functions to modify calib_dataloader and calib_func
         # TEQ: set calib_func=None, use default training func as calib_func
@@ -293,6 +295,7 @@ def calib_func(prepared_model):
 
     # for test on various models, keep the code of directly call gptq_quantize
     if args.gptq_debug:
+
         from neural_compressor.adaptor.torch_utils.weight_only import gptq_quantize
 
         gptq_conf = {
@@ -301,6 +304,7 @@ def calib_func(prepared_model):
                 'group_size': args.woq_group_size, # -1 (per-channel)
                 'sym': (args.woq_scheme == "sym"),
                 'act_order': args.gptq_actorder,
+                'static_groups': args.gptq_static_groups,
             }
         }
         q_model_gptq_debug, gptq_config = gptq_quantize(
diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
index eb62db6b40f..fa9a19a976c 100644
--- a/neural_compressor/adaptor/pytorch.py
+++ b/neural_compressor/adaptor/pytorch.py
@@ -4709,6 +4709,7 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
             "percdamp": self.recipes["gptq_args"].get("percdamp", 0.01),
             "act_order": self.recipes["gptq_args"].get("act_order", False),
             "block_size": self.recipes["gptq_args"].get("block_size", True),
+            "static_groups": self.recipes["gptq_args"].get("static_groups", False),
         }
         nsamples = self.recipes["gptq_args"].get("nsamples", 128)
         use_max_length = self.recipes["gptq_args"].get("use_max_length", False)
diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index 1a33addb364..371150b779c 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -232,6 +232,7 @@ def __init__(
         self.percdamp_default = 0.01
         self.sym_default = False
         self.act_order_default = False
+        self.static_groups_default = False
         self.perchannel_default = True
         self.mse_default = False
         self.check_layer_config()
@@ -406,6 +407,9 @@ def check_layer_config(self):
                 tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default)
                 tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default)
                 tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default)
+                tmp_weight_config[name]["static_groups"] = self.weight_config.get(
+                    "static_groups", self.static_groups_default
+                )
                 tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default)
                 tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default)
             self.weight_config = tmp_weight_config
@@ -417,6 +421,9 @@ def check_layer_config(self):
                 self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default)
                 self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default)
                 self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default)
+                self.weight_config[layer_name]["static_groups"] = config.get(
+                    "static_groups", self.static_groups_default
+                )
                 self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default)
                 self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default)
 
@@ -631,6 +638,7 @@ def tmp(_, inp, out):
                     percdamp=weight_config_this_layer["percdamp"],
                     groupsize=weight_config_this_layer["group_size"],
                     act_order=weight_config_this_layer["act_order"],
+                    static_groups=weight_config_this_layer["static_groups"],
                 )
                 if self.layer_wise:
                     from ..torch_utils.layer_wise_quant.utils import (
@@ -745,7 +753,7 @@ def add_batch(self, inp, out):
         # self.H += 2 / self.nsamples * inp.matmul(inp.t())
         self.H += inp.matmul(inp.t())  # H = X*X, which should be a sysm matrix
 
-    def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False):
+    def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, static_groups=False):
         # W = self.layer.weight.data.clone()
         weight_shape, weight_dtype = W.shape, W.data.dtype
         if isinstance(self.layer, nn.Conv2d):
@@ -765,6 +773,17 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
         H[dead, dead] = 1
         W[:, dead] = 0  # such channel makes no contribution to quantization computation
+        # enable static_groups
+        # calculate the quantization parameters for original group in advance.
+        if static_groups:
+            import copy
+
+            groups = []
+            for i in range(0, self.columns, groupsize):
+                quantizer = copy.deepcopy(self.quantizer)
+                quantizer.find_params(W[:, i : (i + groupsize)], weight=True)
+                groups.append(quantizer)
+
         # rearrange considering the diag's value
         if act_order:
             perm = torch.argsort(torch.diag(H), descending=True)
@@ -801,10 +820,16 @@
                 d = Hinv1[i, i]
 
                 if groupsize != -1:
-                    if (i1 + i) % groupsize == 0:
-                        self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + groupsize)], weight=True)
-                        scale.append(self.quantizer.scale)
-                        zero.append(self.quantizer.zero)
+                    if not static_groups:
+                        if (i1 + i) % groupsize == 0:
+                            self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + groupsize)], weight=True)
+                            scale.append(self.quantizer.scale)
+                            zero.append(self.quantizer.zero)
+                    else:
+                        idx = i1 + i
+                        if act_order:
+                            idx = perm[idx]
+                        self.quantizer = groups[idx // groupsize]
 
                 q = quantize(w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq).flatten()
                 Q1[:, i] = q
diff --git a/test/quantization/test_weight_only_quantization.py b/test/quantization/test_weight_only_quantization.py
index 087b985b15b..5c991b74c5a 100644
--- a/test/quantization/test_weight_only_quantization.py
+++ b/test/quantization/test_weight_only_quantization.py
@@ -147,6 +147,7 @@ def __iter__(self):
                 "sym": False,
                 "percdamp": 0.01,
                 "act_order": True,
+                "static_groups": True,
             },
             "transformer.h.2.attn.k_proj": {
                 "wbits": 3,
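
Usage sketch for reviewers: the new `static_groups` knob is consumed from the `gptq_args` recipe that `pytorch.py` reads above. The snippet below is an illustrative sketch, not part of the patch, assuming the INC 2.x `PostTrainingQuantConfig` / `quantization.fit` entry points documented in `quantization_weight_only.md`; `calib_dataloader` is a placeholder for the user's calibration dataloader, and the weight settings simply mirror the existing GPTQ example.

```python
# Illustrative sketch: enable the new static_groups option through the
# gptq_args recipe (the key read by the PyTorch adaptor in this patch).
# Assumptions: INC 2.x API; `calib_dataloader` is a placeholder for a
# dataloader yielding tokenized input_ids batches for calibration.
from transformers import AutoModelForCausalLM

from neural_compressor import PostTrainingQuantConfig, quantization

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # apply to every supported module
            "weight": {
                "bits": 4,
                "group_size": 128,
                "scheme": "asym",
                "algorithm": "GPTQ",
            },
        },
    },
    recipes={
        "gptq_args": {
            "percdamp": 0.01,
            "act_order": True,
            # precompute every group's scale/zero once, so the permuted
            # column order used by act_order does not re-derive them
            "static_groups": True,
            "use_max_length": True,
            "pad_max_length": 2048,
        },
    },
)
q_model = quantization.fit(model, conf, calib_dataloader=calib_dataloader)
```

The motivation is visible in the `fasterquant` change above: with `act_order`, columns (and therefore groups) are visited in permuted order, so precomputing each group's quantizer avoids repeated `find_params` calls inside the per-column error-compensation loop.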