Enhance INC WOQ model loading & support Huggingface WOQ model loading (#1826)

Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored Jun 13, 2024
1 parent 6733dab commit 0eced14
Showing 15 changed files with 762 additions and 52 deletions.
16 changes: 8 additions & 8 deletions docs/3x/PT_WeightOnlyQuant.md
@@ -31,13 +31,13 @@ Theoretically, round-to-nearest (RTN) is the most straightforward way to quantiz

## Supported Matrix

| Algorithms/Backend | PyTorch eager mode |
|--------------|----------|
| RTN | &#10004; |
| GPTQ | &#10004; |
| AutoRound| &#10004; |
| AWQ | &#10004; |
| TEQ | &#10004; |
| HQQ | &#10004; |
> **RTN:** The most intuitive quantization method. It requires no additional dataset and is very fast. Generally speaking, RTN converts the weight into a uniformly distributed integer data type, although some algorithms, such as Qlora, propose a non-uniform NF4 data type and prove its theoretical optimality.
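
To make the idea concrete, here is a minimal, self-contained sketch of group-wise round-to-nearest quantization. It is illustration only, not the INC kernel; the function name and defaults are invented for this example.

```python
# Minimal group-wise round-to-nearest (RTN) sketch -- illustration only,
# not the INC kernel. Symmetric signed-int quantization with a per-group scale
# along the input-channel dimension.
import torch

def rtn_quantize(weight: torch.Tensor, bits: int = 4, group_size: int = 32):
    qmax = 2 ** (bits - 1) - 1                                 # e.g. 7 for 4 bits
    out_features, in_features = weight.shape
    grouped = weight.reshape(out_features, in_features // group_size, group_size)
    scale = grouped.abs().amax(dim=-1, keepdim=True) / qmax    # per-group scale
    q = torch.clamp(torch.round(grouped / scale), -qmax - 1, qmax)  # round to nearest
    dequant = (q * scale).reshape(out_features, in_features)
    return q.to(torch.int8), scale, dequant

weight = torch.randn(64, 128)
q, scale, w_approx = rtn_quantize(weight)
print((weight - w_approx).abs().mean())  # reconstruction error of the 4-bit weights
```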
@@ -64,8 +64,8 @@ WeightOnlyQuant quantization for PyTorch is using prepare and convert [APIs](./P
| bits (int)| [1, ..., 8] |
| group_size (int)| [-1, 1, ..., $C_{in}$] |
| use_sym (bool)| [True, False] |
| use_double_quant (bool) | [True, False] |
| double_quant_dtype (str) | ['int'] |
| double_quant_bits (int) | [1, ..., bits] |
| double_quant_use_sym (bool) | [True, False] |
| double_quant_group_size (int) | [-1, 1, ..., $C_{in}$] |
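
As a sketch of how these arguments feed the prepare/convert flow, the snippet below builds an RTN weight-only config with double quantization enabled. The keyword names follow the table above, but the exact `RTNConfig` signature should be verified against the installed INC 3.x release.

```python
# Sketch: wiring the table's arguments into an RTN weight-only config.
# Keyword names mirror the table above; verify them against your INC version.
import torch
from neural_compressor.torch.quantization import RTNConfig, prepare, convert

float_model = torch.nn.Sequential(torch.nn.Linear(128, 128))  # any float torch model

quant_config = RTNConfig(
    bits=4,                      # weight bit-width
    group_size=32,               # -1 would mean per-channel
    use_sym=True,                # symmetric weight quantization
    use_double_quant=True,       # also quantize the per-group scales
    double_quant_dtype="int",
    double_quant_bits=8,
    double_quant_use_sym=False,
    double_quant_group_size=-1,  # -1: a single group across all scales
)

model = prepare(float_model, quant_config)  # RTN needs no calibration run
model = convert(model)
```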
@@ -98,7 +98,7 @@ model = convert(model)
#### GPTQ
| gptq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| use_mse_search (bool) | Enables mean squared error (MSE) search | False |
| use_layer_wise (bool) | Enables quantizing the model per layer | False |
| model_path (str) | Model path that is used to load state_dict per layer | |
| use_double_quant (bool) | Enables double quantization | False |
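
Unlike RTN, GPTQ (and, generally, the other calibration-based algorithms such as AWQ, TEQ, and AutoRound) needs a short calibration run between `prepare` and `convert`. A hedged sketch of that flow, with argument names taken from the table above and a small placeholder model:

```python
# Sketch of the GPTQ flow: prepare -> calibration run -> convert.
# GPTQConfig keyword names are taken from the table above and should be
# double-checked against the installed INC release.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import GPTQConfig, prepare, convert

model_name = "facebook/opt-125m"  # small LLM used only for illustration
float_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

quant_config = GPTQConfig(bits=4, group_size=32, use_mse_search=False)

def run_fn(model):
    # A tiny calibration loop; real runs would iterate over a calibration dataset.
    batch = tokenizer("Intel Neural Compressor weight-only quantization", return_tensors="pt")
    for _ in range(8):
        model(**batch)

model = prepare(float_model, quant_config)
run_fn(model)
model = convert(model)
```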
@@ -120,7 +120,7 @@ model = convert(model)
#### AutoRound
| autoround_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| enable_full_range (bool) | Whether to enable full range quantization | False |
| batch_size (int) | Batch size for training | 8 |
| lr_scheduler | The learning rate scheduler to be used | None |
| enable_quanted_input (bool) | Whether to use quantized input data | True |
@@ -251,8 +251,8 @@ from neural_compressor.torch.quantization import load

orig_model = YOURMODEL()
loaded_model = load(
"saved_results", model=orig_model
) # Please note that the model parameter passes the original model.
"saved_results", original_model=orig_model
) # Please note that the original_model parameter passes the original model.
```
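
This commit also advertises Huggingface WOQ model loading. A hypothetical sketch of that path is below; the `format` keyword, its value, and the model id are assumptions inferred from the commit title rather than a confirmed API, so check the INC 3.x reference before relying on it.

```python
# Hypothetical sketch of loading a WOQ model published on the Huggingface Hub.
# The format="huggingface" keyword and the model id are assumptions inferred
# from this commit's title; confirm the exact signature in the INC 3.x docs.
from neural_compressor.torch.quantization import load

# Placeholder model id; any WOQ checkpoint hosted on the Hub would go here.
hf_woq_model = load("some-org/some-woq-model", format="huggingface")
```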


@@ -63,7 +63,7 @@
parser.add_argument("--calib_iters", default=100, type=int,
help="calibration iters.")
parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \
type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa",
"rte", "openbookqa", "lambada_standard", "wikitext"],
help="tasks list for accuracy validation")
parser.add_argument("--limit", default=None, type=int,
@@ -117,10 +117,10 @@
for examples in calib_dataset:
calib_data.append(
tokenizer(
examples["text"],
return_tensors="pt",
max_length=64,
padding="max_length",
examples["text"],
return_tensors="pt",
max_length=64,
padding="max_length",
truncation=True
)
)
@@ -154,7 +154,7 @@ def calib_func(model):



# If torch.matmul and torch.bmm are not replaced by INC module,
# the code below can make torch.matmul and torch.bmm run on fp8 by injection.
if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']:
def replace_torch_mm_bmm():
@@ -367,7 +367,7 @@ def run_fn(model):
user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)
user_model = convert(user_model)

user_model.save(args.output_dir)


@@ -377,9 +377,10 @@ def run_fn(model):
print("load int8 model")

from neural_compressor.torch.quantization import load
user_model, _ = get_user_model()
tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model)
-user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model)
setattr(user_model, "config", config)
else:
user_model, tokenizer = get_user_model()
3 changes: 3 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .save_load import save, load