Add woq examples (#1982)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: Sun, Xuehao <[email protected]>
Co-authored-by: Sun, Xuehao <[email protected]>
Kaihui-intel and XuehaoSun authored Oct 10, 2024
1 parent 586eb88 commit 2bb257e
Showing 5 changed files with 309 additions and 45 deletions.
28 changes: 28 additions & 0 deletions examples/.config/model_params_pytorch_3x.json
@@ -84,6 +84,34 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autoround_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autotune_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
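As a quick sanity check (not part of this commit), the new topology entries can be verified to parse and to be present; the config path is taken from the file header above.

```bash
# Illustrative check: the four new topology names should appear in the CI config
# and the file should still be valid JSON. Path taken from the changed file above.
CONFIG=examples/.config/model_params_pytorch_3x.json
python -m json.tool "$CONFIG" > /dev/null && echo "JSON parses"
for t in gpt_j_woq_awq_int4 opt_125m_woq_awq_int4 \
         opt_125m_woq_autoround_int4 opt_125m_woq_autotune_int4; do
    grep -q "\"$t\"" "$CONFIG" && echo "found: $t"
done
```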
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs, including GPT-J, OPT, BLOOM, LLaMA, Falcon, MPT, ChatGLM, etc. Simply replace the "--model" argument with another model name to quantize a different CLM with GPTQ.
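As the note above suggests, the same GPTQ recipe can target a different CLM by swapping the "--model" argument; below is a minimal sketch with an illustrative model choice (bigscience/bloom-560m), which is not part of this commit.

```bash
# Illustrative only: same GPTQ flags as above, pointed at a different CLM.
python run_clm_no_trainer.py \
--model bigscience/bloom-560m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--output_dir saved_results
```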

@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```

### LLAMA2-7b/13b/70b
>Note: LLAMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
@@ -107,8 +160,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -120,8 +171,6 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
```
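The commit adds separate "--accuracy" evaluation blocks for GPT-J and OPT above; a sketch of the analogous evaluation run for LLAMA2, assuming the same flags carry over, could look like:

```bash
# Sketch only (not part of this commit): accuracy evaluation for Llama-2,
# mirroring the "--accuracy" blocks shown for GPT-J and OPT above.
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--output_dir saved_results
```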

@@ -70,58 +70,59 @@ function run_benchmark {
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
model_name_or_path="facebook/opt-125m"
elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo AutoRound"
elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
model_name_or_path="facebook/opt-125m"
fi

if [[ ${mode} == "accuracy" ]]; then
python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
--model ${model_name_or_path} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
exit 1
fi

}

main "$@"
