Commit 4b24be1

enable weight only quantization for language modeling (#1053)

Signed-off-by: Cheng, Zixuan <[email protected]>
violetch24 authored Jul 4, 2023
1 parent 6c30464 commit 4b24be1
Showing 8 changed files with 977 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/source/quantization_weight_only.md
@@ -46,7 +46,7 @@ conf = PostTrainingQuantConfig(
     op_type_dict={
         '.*':{ # re.match
             "weight": {
-                'bit': 8, # 1-8 bit
+                'bits': 8, # 1-8 bit
                 'group_size': -1, # -1 (per-channel)
                 'scheme': 'sym',
                 'algorithm': 'RTN',
7 changes: 7 additions & 0 deletions examples/.config/model_params_pytorch.json
@@ -454,6 +454,13 @@
       "main_script": "run_clm.py",
       "batch_size": 8
     },
+    "gpt_j_wikitext_weight_only":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/ptq_weight_only",
+      "dataset_location": "",
+      "input_model": "/tf_dataset2/models/pytorch/gpt-j-6B",
+      "main_script": "run_clm.py",
+      "batch_size": 8
+    },
     "gpt_neox":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/ptq_static/fx",
       "dataset_location": "/tf_dataset/pytorch/glue_data_new/oscar",
6 changes: 6 additions & 0 deletions examples/README.md
@@ -635,6 +635,12 @@ Intel® Neural Compressor validated examples with multiple compression technique
         <td>Post-Training Static Quantization</td>
         <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/fx">fx</a> / <a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_static/ipex/smooth_quant">smooth quant</a></td>
       </tr>
+      <tr>
+        <td>EleutherAI/gpt-j-6B</td>
+        <td>Natural Language Processing</td>
+        <td>Post-Training Weight Only Quantization</td>
+        <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only">weight_only</a></td>
+      </tr>
       <tr>
         <td>abeja/gpt-neox-japanese-2.7b</td>
         <td>Natural Language Processing</td>
105 changes: 105 additions & 0 deletions examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/README.md
@@ -0,0 +1,105 @@
Step-by-Step
============

This document lists the steps to reproduce the weight-only quantization and benchmarking results.

# Prerequisite
## 1. Environment
Python 3.6 or higher is recommended.
The dependent packages are listed in requirements.txt; install them as follows.
```shell
cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only
pip install -r requirements.txt
```
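Note that `neural-compressor` itself is not listed in requirements.txt, since this example lives inside the Neural Compressor repository; if you run it standalone, install the package first. An optional sanity check (not part of the original example):
```shell
pip install neural-compressor
python -c "import neural_compressor, torch, transformers; print(neural_compressor.__version__)"
```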

# Run
## 1. Quantization
```shell
python run_clm.py \
    --model_name_or_path EleutherAI/gpt-j-6B \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --do_train \
    --do_eval \
    --weight_only_bits 8 \
    --weight_only_group -1 \
    --weight_only_scheme sym \
    --weight_only_algorithm RTN \
    --tune \
    --output_dir saved_results
```
> NOTE
>
> `saved_results` is the path to the fine-tuned output_dir.

Or run the provided tuning script:
```bash
sh run_tuning.sh --topology=topology_name --input_model=model_name_or_path --weight_only_bits=8 --weight_only_group=-1 --weight_only_scheme=sym --weight_only_algorithm=RTN
```
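For example, for the validated GPT-J topology listed below (assuming the tuning script registers the same `gpt_j_wikitext_weight_only` topology name as run_benchmark.sh):
```bash
sh run_tuning.sh --topology=gpt_j_wikitext_weight_only --input_model=EleutherAI/gpt-j-6B --weight_only_bits=8 --weight_only_group=-1 --weight_only_scheme=sym --weight_only_algorithm=RTN
```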

> NOTE
>
> `weight_only_bits`, `weight_only_group`, `weight_only_scheme`, and `weight_only_algorithm` can be modified by the user. For details, please refer to the [weight-only quantization documentation](../../../../../../../docs/source/quantization_weight_only.md).
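For intuition, the `RTN` algorithm is plain round-to-nearest: each weight is scaled and rounded to its closest integer level. A minimal PyTorch sketch of symmetric per-channel (`group_size = -1`) 8-bit fake quantization, for illustration only (not the Neural Compressor implementation):
```python
import torch

def rtn_fake_quant_sym(weight: torch.Tensor, bits: int = 8) -> torch.Tensor:
    """Round-to-nearest, symmetric, per-output-channel weight quantization."""
    qmax = 2 ** (bits - 1) - 1                                     # 127 for 8-bit
    scale = weight.abs().amax(dim=1, keepdim=True).clamp(min=1e-9) / qmax
    q = torch.clamp(torch.round(weight / scale), -qmax - 1, qmax)  # integer levels
    return q * scale                                               # dequantized weights

w = torch.randn(16, 64)
print((w - rtn_fake_quant_sym(w)).abs().max().item())  # error is at most scale / 2 per channel
```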
## 2. Benchmark
```bash
# int8
sh run_benchmark.sh --topology=topology_name --mode=performance --int8=true --input_model=model_name_or_path --config=saved_results
# fp32
sh run_benchmark.sh --topology=topology_name --mode=performance --input_model=model_name_or_path
```
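For example, for the validated GPT-J topology (model and topology names taken from the table below; paths are illustrative):
```bash
# int8 (quantized) model, reading the tuned configuration from saved_results
sh run_benchmark.sh --topology=gpt_j_wikitext_weight_only --mode=performance --int8=true --input_model=EleutherAI/gpt-j-6B --config=saved_results
# the script also accepts --mode=accuracy for an accuracy evaluation
sh run_benchmark.sh --topology=gpt_j_wikitext_weight_only --mode=accuracy --int8=true --input_model=EleutherAI/gpt-j-6B --config=saved_results
```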
## 3. Validated Model List
<table>
<thead>
<tr>
<th>Topology Name</th>
<th>Model Name</th>
<th>Dataset/Task Name</th>
</tr>
</thead>
<tbody align="center">
<tr>
<td>gpt_j_wikitext_weight_only</td>
<td><a href="https://huggingface.co/EleutherAI/gpt-j-6B">EleutherAI/gpt-j-6B</a></td>
<td><a href="https://huggingface.co/datasets/wikitext">wikitext</a></td>
</tr>
</tbody>
</table>

## 4. Saving and Loading Model
### Saving model:
```python
from neural_compressor import quantization
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig

# `model`, `dataloader`, and `eval_func` are user-provided: the fp32 model,
# a calibration dataloader factory, and an evaluation function.
op_type_dict = {
    '.*': {  # re.match: apply to all op types
        "weight": {
            'bits': 8,          # 1-8 bit
            'group_size': 32,   # -1 for per-channel
            'scheme': 'sym',
            'algorithm': 'RTN',
        },
    },
}
accuracy_criterion = AccuracyCriterion(higher_is_better=False, tolerable_loss=0.01)
conf = PostTrainingQuantConfig(accuracy_criterion=accuracy_criterion,
                               approach='weight_only',
                               op_type_dict=op_type_dict)
q_model = quantization.fit(model,
                           conf,
                           calib_dataloader=dataloader(),
                           eval_func=eval_func)
q_model.save("output_dir")
```
Here, `q_model` is a Neural Compressor model object, so it provides a `save` API:

```python
q_model.save("Path_to_save_quantized_model")
```
### Loading model:
```python
from neural_compressor.utils.pytorch import load

# tuned_checkpoint: directory passed to q_model.save(); model: the original fp32 model
quantized_model = load(tuned_checkpoint, model)
```
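A minimal end-to-end sketch, assuming the fp32 model is the Hugging Face GPT-J checkpoint and the quantized model was saved to `saved_results`:
```python
from transformers import AutoModelForCausalLM
from neural_compressor.utils.pytorch import load

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")  # original fp32 model
quantized_model = load("saved_results", model)                       # dir used with q_model.save()
```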
--------
For more details, please refer to the [sample code](./run_clm.py).
8 changes: 8 additions & 0 deletions examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/requirements.txt
@@ -0,0 +1,8 @@
sentencepiece!=0.1.92
protobuf
evaluate
datasets
transformers>=4.22.0
accelerate
torch>=1.9.0
pytest # adapt transformers 4.30.x
89 changes: 89 additions & 0 deletions examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_benchmark.sh
@@ -0,0 +1,89 @@
#!/bin/bash
set -x

function main {

    init_params "$@"
    run_benchmark

}

# init params
function init_params {
    iters=100
    batch_size=16
    tuned_checkpoint=saved_results
    echo ${max_eval_samples}
    for var in "$@"
    do
        case $var in
            --topology=*)
                topology=$(echo $var |cut -f2 -d=)
                ;;
            --dataset_location=*)
                dataset_location=$(echo $var |cut -f2 -d=)
                ;;
            --input_model=*)
                input_model=$(echo $var |cut -f2 -d=)
                ;;
            --mode=*)
                mode=$(echo $var |cut -f2 -d=)
                ;;
            --batch_size=*)
                batch_size=$(echo $var |cut -f2 -d=)
                ;;
            --iters=*)
                iters=$(echo ${var} |cut -f2 -d=)
                ;;
            --int8=*)
                int8=$(echo ${var} |cut -f2 -d=)
                ;;
            --config=*)
                tuned_checkpoint=$(echo $var |cut -f2 -d=)
                ;;
            *)
                echo "Error: No such parameter: ${var}"
                exit 1
                ;;
        esac
    done
}


# run_benchmark
function run_benchmark {
    extra_cmd=''

    if [[ ${mode} == "accuracy" ]]; then
        mode_cmd=" --accuracy "
    elif [[ ${mode} == "performance" ]]; then
        mode_cmd=" --performance --iters "${iters}
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi

    if [ "${topology}" = "gpt_j_wikitext_weight_only" ]; then
        TASK_NAME='wikitext'
        model_name_or_path=${input_model}
        extra_cmd='--dataset_config_name=wikitext-2-raw-v1'
    fi

    if [[ ${int8} == "true" ]]; then
        extra_cmd=$extra_cmd" --int8"
    fi
    echo $extra_cmd

    python -u run_clm.py \
        --model_name_or_path ${input_model} \
        --dataset_name ${TASK_NAME} \
        --do_eval \
        --per_device_eval_batch_size ${batch_size} \
        --output_dir ${tuned_checkpoint} \
        ${mode_cmd} \
        ${extra_cmd}
}

main "$@"