enable weight only quantization for language modeling (#1053)
Signed-off-by: Cheng, Zixuan <[email protected]>
1 parent 6c30464 · commit 4b24be1
Showing 8 changed files with 977 additions and 1 deletion.
105 changes: 105 additions & 0 deletions
...nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/README.md
Step-by-Step
============

This document lists the steps to reproduce the weight-only quantization and benchmarking results.

# Prerequisite
## 1. Environment
Python 3.6 or a higher version is recommended.
All required packages are listed in `requirements.txt`; install them as follows.
```shell
cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only
pip install -r requirements.txt
```

# Run
## 1. Quantization
```shell
python run_clm.py \
    --model_name_or_path EleutherAI/gpt-j-6B \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --do_train \
    --do_eval \
    --weight_only_bits 8 \
    --weight_only_group -1 \
    --weight_only_scheme sym \
    --weight_only_algorithm RTN \
    --tune \
    --output_dir saved_results
```
> NOTE
>
> `saved_results` is the path to the fine-tuned model's `output_dir`.

or
```bash
sh run_tuning.sh --topology=topology_name --input_model=model_name_or_path --weight_only_bits=8 --weight_only_group=-1 --weight_only_scheme=sym --weight_only_algorithm=RTN
```

> NOTE
>
> `weight_only_bits`, `weight_only_group`, `weight_only_scheme`, and `weight_only_algorithm` can be modified by the user. For details, please refer to [README](../../../../../../../docs/source/quantization_weight_only.md).
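
For intuition, RTN (round-to-nearest) quantization simply rescales each group of weights onto an integer grid and rounds, with no optimization loop. Below is a minimal NumPy sketch of symmetric RTN fake-quantization, for illustration only: it is not Neural Compressor's implementation, the helper name `rtn_quantize` is hypothetical, and `group_size=-1` is assumed to mean one group spanning the whole row per output channel, mirroring `--weight_only_group -1`.
```python
# A minimal sketch of symmetric round-to-nearest (RTN) weight
# fake-quantization -- for intuition only, not Neural Compressor's code.
import numpy as np

def rtn_quantize(weight: np.ndarray, bits: int = 8, group_size: int = -1) -> np.ndarray:
    """Fake-quantize a 2-D weight [out_features, in_features] symmetrically."""
    out_f, in_f = weight.shape
    gs = in_f if group_size == -1 else group_size    # -1: one group per output row
    w = weight.reshape(out_f, in_f // gs, gs)        # split each row into groups
    max_q = 2 ** (bits - 1) - 1                      # e.g. 127 for 8 bits
    scale = np.abs(w).max(axis=-1, keepdims=True) / max_q
    scale = np.where(scale == 0, 1.0, scale)         # guard against all-zero groups
    q = np.clip(np.round(w / scale), -max_q, max_q)  # round-to-nearest on the int grid
    return (q * scale).reshape(out_f, in_f)          # dequantize back to float

w = np.random.randn(4, 8).astype(np.float32)
print(np.abs(w - rtn_quantize(w, bits=8, group_size=-1)).max())  # small rounding error
```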
## 2. Benchmark
```bash
# int8
sh run_benchmark.sh --topology=topology_name --mode=performance --int8=true --input_model=model_name_or_path --config=saved_results
# fp32
sh run_benchmark.sh --topology=topology_name --mode=performance --input_model=model_name_or_path
```
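
Conceptually, a `--mode=performance` run measures average per-batch latency over `--iters` iterations. The sketch below only illustrates that idea; it is not the actual logic inside `run_clm.py`, and `measure_latency` is a hypothetical helper.
```python
# Illustrative latency measurement loop (assumed behavior of a
# performance run, not code from run_clm.py).
import time
import torch

def measure_latency(model, dataloader, iters=100, warmup=10):
    model.eval()
    latencies = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if i >= warmup + iters:
                break
            start = time.time()
            model(**batch)                  # batch assumed to be a dict of tensors
            if i >= warmup:                 # discard warm-up iterations
                latencies.append(time.time() - start)
    avg = sum(latencies) / len(latencies)
    print(f"avg latency: {avg * 1000:.2f} ms | throughput: {1 / avg:.2f} batches/s")
```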
## 3. Validated Model List
<table>
<thead>
  <tr>
    <th>Topology Name</th>
    <th>Model Name</th>
    <th>Dataset/Task Name</th>
  </tr>
</thead>
<tbody align="center">
  <tr>
    <td>gpt_j_wikitext</td>
    <td><a href="https://huggingface.co/EleutherAI/gpt-j-6B">EleutherAI/gpt-j-6B</a></td>
    <td><a href="https://huggingface.co/datasets/wikitext">wikitext</a></td>
  </tr>
</tbody>
</table>

## 4. Saving and Loading Model
### Saving model:
```python
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig
from neural_compressor import quantization

op_type_dict = {
    '.*': {  # the regex '.*' applies this config to all op types
        "weight": {
            'bits': 8,
            'group_size': 32,
            'scheme': 'sym',
            'algorithm': 'RTN',
        },
    },
}
accuracy_criterion = AccuracyCriterion(higher_is_better=False, tolerable_loss=0.01)
conf = PostTrainingQuantConfig(accuracy_criterion=accuracy_criterion,
                               approach='weight_only',
                               op_type_dict=op_type_dict)
q_model = quantization.fit(model,
                           conf,
                           calib_dataloader=dataloader(),
                           eval_func=eval_func)
q_model.save("output_dir")
```
Here, `q_model` is a Neural Compressor model object, so it provides a `save` API:

```python
q_model.save("Path_to_save_quantized_model")
```
### Loading model:
```python
from neural_compressor.utils.pytorch import load
quantized_model = load(tuned_checkpoint, model)
```
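
Putting both steps together, a minimal end-to-end sketch of restoring and evaluating the quantized model could look as follows, assuming `tuned_checkpoint` is the tuning `--output_dir` (e.g. `saved_results`) and `eval_func` is the same evaluation callable passed to `quantization.fit` above.
```python
from transformers import AutoModelForCausalLM
from neural_compressor.utils.pytorch import load

# Rebuild the original FP32 model, then restore the quantized weights on top of it.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
quantized_model = load("saved_results", model)

accuracy = eval_func(quantized_model)  # `eval_func` assumed defined as in the saving example
print(f"accuracy: {accuracy}")
```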
--------
For more details, please refer to the [sample code](./run_clm.py).
8 changes: 8 additions & 0 deletions
...ch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/requirements.txt
sentencepiece!=0.1.92 | ||
protobuf | ||
evaluate | ||
datasets | ||
transformers>=4.22.0 | ||
accelerate | ||
torch>=1.9.0 | ||
pytest # adapt transformers 4.30.x
89 changes: 89 additions & 0 deletions
...ch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_benchmark.sh
#!/bin/bash
set -x

function main {

  init_params "$@"
  run_benchmark

}

# init params
function init_params {
  iters=100
  batch_size=16
  tuned_checkpoint=saved_results
  for var in "$@"
  do
    case $var in
      --topology=*)
          topology=$(echo $var |cut -f2 -d=)
      ;;
      --dataset_location=*)
          dataset_location=$(echo $var |cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo $var |cut -f2 -d=)
      ;;
      --mode=*)
          mode=$(echo $var |cut -f2 -d=)
      ;;
      --batch_size=*)
          batch_size=$(echo $var |cut -f2 -d=)
      ;;
      --iters=*)
          iters=$(echo ${var} |cut -f2 -d=)
      ;;
      --int8=*)
          int8=$(echo ${var} |cut -f2 -d=)
      ;;
      --config=*)
          tuned_checkpoint=$(echo $var |cut -f2 -d=)
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done

}


# run_benchmark
function run_benchmark {
  extra_cmd=''

  if [[ ${mode} == "accuracy" ]]; then
      mode_cmd=" --accuracy "
  elif [[ ${mode} == "performance" ]]; then
      mode_cmd=" --performance --iters ${iters}"
  else
      echo "Error: No such mode: ${mode}"
      exit 1
  fi

  if [ "${topology}" = "gpt_j_wikitext_weight_only" ]; then
      TASK_NAME='wikitext'
      model_name_or_path=${input_model}
      extra_cmd='--dataset_config_name=wikitext-2-raw-v1'
  fi

  if [[ ${int8} == "true" ]]; then
      extra_cmd=$extra_cmd" --int8"
  fi
  echo $extra_cmd

  python -u run_clm.py \
      --model_name_or_path ${input_model} \
      --dataset_name ${TASK_NAME} \
      --do_eval \
      --per_device_eval_batch_size ${batch_size} \
      --output_dir ${tuned_checkpoint} \
      ${mode_cmd} \
      ${extra_cmd}

}

main "$@"