Add woq examples (#1982)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: Sun, Xuehao <[email protected]>
Co-authored-by: Sun, Xuehao <[email protected]>
Kaihui-intel and XuehaoSun authored Oct 10, 2024
1 parent 586eb88 commit 2bb257e
Showing 5 changed files with 309 additions and 45 deletions.
28 changes: 28 additions & 0 deletions examples/.config/model_params_pytorch_3x.json
@@ -84,6 +84,34 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autoround_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autotune_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
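As a quick sanity check (not part of this commit), the new topology entries can be verified to parse and to be present; the config path is taken from the file header above.

```bash
# Illustrative check: the four new topology names should appear in the CI config
# and the file should still be valid JSON. Path taken from the changed file above.
CONFIG=examples/.config/model_params_pytorch_3x.json
python -m json.tool "$CONFIG" > /dev/null && echo "JSON parses"
for t in gpt_j_woq_awq_int4 opt_125m_woq_awq_int4 \
         opt_125m_woq_autoround_int4 opt_125m_woq_autotune_int4; do
    grep -q "\"$t\"" "$CONFIG" && echo "found: $t"
done
```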
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs, including GPT-J, OPT, BLOOM, LLaMA, Falcon, MPT, ChatGLM, etc. Simply replace the "--model" argument with another model name to quantize a different CLM with GPTQ.
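As the note above suggests, the same GPTQ recipe can target a different CLM by swapping the "--model" argument; below is a minimal sketch with an illustrative model choice (bigscience/bloom-560m), which is not part of this commit.

```bash
# Illustrative only: same GPTQ flags as above, pointed at a different CLM.
python run_clm_no_trainer.py \
--model bigscience/bloom-560m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--output_dir saved_results
```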

@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```

### LLAMA2-7b/13b/70b
>Note: LLAMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
@@ -107,8 +160,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -120,8 +171,6 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
```
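The commit adds separate "--accuracy" evaluation blocks for GPT-J and OPT above; a sketch of the analogous evaluation run for LLAMA2, assuming the same flags carry over, could look like:

```bash
# Sketch only (not part of this commit): accuracy evaluation for Llama-2,
# mirroring the "--accuracy" blocks shown for GPT-J and OPT above.
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--output_dir saved_results
```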

@@ -70,58 +70,59 @@ function run_benchmark {
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
model_name_or_path="facebook/opt-125m"
elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo AutoRound"
elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
model_name_or_path="facebook/opt-125m"
fi

if [[ ${mode} == "accuracy" ]]; then
python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
--model ${model_name_or_path} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
exit 1
fi

}

main "$@"
