Support true sequential options in gptq (#1656)
Signed-off-by: YIYANGCAI <[email protected]>
YIYANGCAI authored May 23, 2024
1 parent d640297 commit 92c9423
Showing 6 changed files with 260 additions and 102 deletions.
2 changes: 2 additions & 0 deletions docs/source/quantization_weight_only.md
@@ -87,6 +87,8 @@ Notes:
 | use_max_length | False | Whether to align all calibration data to a fixed length, which equals pad_max_length. |
 | block_size | 128 | Execute GPTQ quantization per block, block shape = [$C_{out}$, block_size] |
 | static_groups | False | Whether to calculate group-wise quantization parameters in advance. This option mitigates act_order's extra computational requirements. |
+| true_sequential | False | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but a slower overall quantization process. |
+| lm_head | False | Whether to quantize the lm_head (the linear output layer at the end of the language model). |
 
 **Note:** Neural Compressor provides `Unsigned integer for asymmetric quantization` and `Signed integer for symmetric quantization`. Please follow the section below to compress the low-bit data types for saving.
 
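The two new rows above are consumed through the `gptq_args` recipes shown in the adaptor change further down. As orientation only, here is a minimal sketch of enabling them through the 2.x `PostTrainingQuantConfig` weight-only API; the bit width, group size, and scheme values are illustrative, `model` and `dataloader` are assumed to exist, and this snippet is not part of the commit:

```python
from neural_compressor import PostTrainingQuantConfig, quantization

# Weight-only GPTQ configuration; true_sequential and lm_head are the
# options introduced by this commit, the rest are illustrative values.
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # match all supported op types
            "weight": {
                "bits": 4,
                "group_size": 128,
                "scheme": "sym",
                "algorithm": "GPTQ",
            },
        },
    },
    recipes={
        "gptq_args": {
            "static_groups": False,
            "true_sequential": True,  # quantize sublayers in execution order
            "lm_head": False,         # optionally quantize the output head too
            "use_max_length": False,
            "pad_max_length": 2048,
        },
    },
)

# `model` and `dataloader` are assumed to be a loaded FP32 model and a
# calibration dataloader prepared elsewhere.
q_model = quantization.fit(model, conf, calib_dataloader=dataloader)
```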
@@ -77,6 +77,8 @@
                         this should align with your model config, \
                         and your dataset builder args: args.pad_max_length')
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
+parser.add_argument('--gptq_true_sequential', action='store_true', help="Whether to run in true_sequential mode.")
+parser.add_argument('--gptq_lm_head', action='store_true', help="Whether to use GPTQ to quantize the output layer of the LLMs.")
 # ==============code generation args===========
 parser.add_argument("--code_generation", action="store_true")
 parser.add_argument("--n_samples", default=200, type=int)
@@ -278,7 +280,8 @@ def calib_func(prepared_model):
     'use_max_length': args.gptq_use_max_length,
     'pad_max_length': args.gptq_pad_max_length,
     'static_groups': args.gptq_static_groups,
-    "enable_mse_search": args.woq_enable_mse_search,
+    "true_sequential": args.gptq_true_sequential,
+    "lm_head": args.gptq_lm_head,
 }
 # GPTQ: use assistive functions to modify calib_dataloader and calib_func
 # TEQ: set calib_func=None, use default training func as calib_func
2 changes: 2 additions & 0 deletions neural_compressor/adaptor/pytorch.py
@@ -4722,6 +4722,8 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
"act_order": self.recipes["gptq_args"].get("act_order", False),
"block_size": self.recipes["gptq_args"].get("block_size", True),
"static_groups": self.recipes["gptq_args"].get("static_groups", False),
"true_sequential": self.recipes["gptq_args"].get("true_sequential", False),
"lm_head": self.recipes["gptq_args"].get("lm_head", False),
}
nsamples = self.recipes["gptq_args"].get("nsamples", 128)
use_max_length = self.recipes["gptq_args"].get("use_max_length", False)
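
For context on what the new flag toggles inside GPTQ: with `true_sequential`, the sublayers of each transformer block are quantized group by group in their execution order, refreshing the block's activations in between so that later groups calibrate against already-quantized predecessors. A conceptual sketch only, with illustrative names, not the library's implementation:

```python
from typing import Callable, List

def quantize_block_sketch(
    sublayer_groups: List[List[str]],         # e.g. [["q_proj", "k_proj", "v_proj"], ["o_proj"], ...]
    quantize_layer: Callable[[str], None],    # runs GPTQ on one named sublayer
    refresh_activations: Callable[[], None],  # re-runs the block's forward pass
    true_sequential: bool,
) -> None:
    """Illustrates the ordering difference only; not neural-compressor's code."""
    if true_sequential:
        for group in sublayer_groups:
            # Each group is calibrated against activations produced by the
            # already-quantized earlier groups of the same block.
            for name in group:
                quantize_layer(name)
            refresh_activations()
    else:
        # All sublayers of the block are quantized against the block's
        # original, unquantized activations in a single pass.
        for group in sublayer_groups:
            for name in group:
                quantize_layer(name)
        refresh_activations()
```

The extra activation refreshes per group are what the docs table above means by a slower but potentially more accurate quantization process.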
(Diffs for the remaining three changed files are not shown.)
