PT2ONNX dynamic quantization export (#988)
Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored Jun 30, 2023
1 parent d869227 commit 1655326
Showing 11 changed files with 333 additions and 110 deletions.
12 changes: 4 additions & 8 deletions docs/source/export.md
@@ -37,15 +37,15 @@ Here is the workflow of our export API for PyTorch/Tensorflow FP32/INT8 model.
</tr>
<tr>
<td>Post-Training Static Quantized INT8</td>
-<td>QLinear/QDQ INT8</td>
+<td>QOperator/QDQ INT8</td>
</tr>
<tr>
<td>Post-Training Dynamic Quantized INT8</td>
-<td>/</td>
+<td>QOperator INT8</td>
</tr>
<tr>
<td>Quantization-aware Training INT8</td>
-<td>QLinear/QDQ INT8</td>
+<td>QOperator/QDQ INT8</td>
</tr>
<tr>
<td rowspan="3">TensorFlow</td>
@@ -63,10 +63,6 @@ Here is the workflow of our export API for PyTorch/Tensorflow FP32/INT8 model.
</tbody>
</table>
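
Per the updated row above, a post-training dynamic quantized PyTorch model now exports directly to QOperator INT8 ONNX (previously unsupported, "/"). A minimal sketch of that path, assuming an installed `neural_compressor` with the APIs shown elsewhere on this page; the model choice, input shape, and file name are illustrative, not from this commit:

```python
import torch
import torchvision.models as models
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.config import Torch2ONNXConfig

# Illustrative model; any FX-traceable torchvision model should work the same way.
model = models.resnet18(pretrained=True)

# Dynamic PTQ quantizes weights ahead of time and activations at runtime,
# so no calibration dataloader is needed.
conf = PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit(model, conf)

# Per the table above, the dynamic path supports only the QOperator format.
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format="QOperator",
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
q_model.export("resnet18-int8-dynamic.onnx", int8_onnx_config)
```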

-> **Note**: Follow these steps to export a post-training dynamic quantized ONNX model from a PyTorch model: \
-1. Export the FP32 PyTorch model to an FP32 ONNX model. \
-2. Use the FP32 ONNX model as the input model for post-training dynamic quantization.
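
The note being removed above described the pre-existing two-step route, which still works. Roughly, as a sketch: step 1 uses the FP32 export config, step 2 runs dynamic PTQ on the resulting ONNX file. The `Model` wrapper import, `quantization.fit` accepting an ONNX path, and the `save` call are assumptions based on the library's documented usage:

```python
import torch
import torchvision.models as models
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.config import Torch2ONNXConfig
from neural_compressor.model import Model  # assumed FP32-export entry point

# Step 1: export the FP32 PyTorch model to an FP32 ONNX model.
fp32_onnx_config = Torch2ONNXConfig(
    dtype="fp32",
    opset_version=14,
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
inc_model = Model(models.resnet18(pretrained=True))
inc_model.export("fp32-model.onnx", fp32_onnx_config)

# Step 2: dynamic post-training quantization on the FP32 ONNX model.
conf = PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit("fp32-model.onnx", conf)
q_model.save("int8-model.onnx")  # saving API for ONNX models; an assumption
```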

## Examples

### PyTorch Model
@@ -96,7 +92,7 @@ from neural_compressor.config import Torch2ONNXConfig
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
-    quant_format="QLinear", # or QDQ
+    quant_format="QOperator", # or QDQ
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=['input'],
    output_names=['output'],
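    # -- The page truncates this snippet here; the remainder below is
    # -- reconstructed from the surrounding docs and is an assumption,
    # -- not the literal file content.
    dynamic_axes={"input": {0: "batch_size"},
                  "output": {0: "batch_size"}},
)
q_model.export('int8-model.onnx', int8_onnx_config)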
32 changes: 32 additions & 0 deletions examples/.config/model_params_pt2onnx.json
@@ -8,6 +8,14 @@
"main_script": "main.py",
"batch_size": 100
},
"resnet18_dynamic": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
"target_model_dataset": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000",
"input_model": "resnet18",
"main_script": "main.py",
"batch_size": 100
},
"resnet50": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
@@ -16,6 +24,14 @@
"main_script": "main.py",
"batch_size": 100
},
"resnet50_dynamic": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
"target_model_dataset": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000",
"input_model": "resnet50",
"main_script": "main.py",
"batch_size": 100
},
"bert_base_MRPC": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
@@ -24,13 +40,29 @@
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_base_MRPC_dynamic": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/base_weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_large_MRPC": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_large_MRPC_dynamic": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
}
}
}
@@ -34,7 +34,7 @@ Run run_export.sh to get ONNX model from PyTorch model.
# export fp32 model
bash run_export.sh --input_model=resnet50 --dtype=fp32 --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-fp32.onnx
# export int8 model
-bash run_export.sh --input_model=resnet50 --dtype=int8 --quant_format=[QDQ|QLinear] --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-int8.onnx
+bash run_export.sh --input_model=resnet50 --dtype=int8 --quant_format=[QDQ|QOperator] --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-int8.onnx --approach=[static|dynamic]
```

### 2. To get the benchmark results of the exported and tuned models, including Batch_size and Throughput:
@@ -90,8 +90,10 @@
parser.add_argument('--export', dest='export', action='store_true', help='run export')
parser.add_argument('--export_dtype', default='fp32', choices=['fp32', 'int8'],
                    help='choose the data type [fp32/int8] of PyTorch model to be exported.')
-parser.add_argument('--quant_format', default='QDQ', choices=['QDQ', 'QLinear'],
-                    help='choose the format [QDQ/QLinear] of int8 ONNX model exported.')
+parser.add_argument('--quant_format', default='QDQ', choices=['QDQ', 'QOperator'],
+                    help='choose the format [QDQ/QOperator] of int8 ONNX model exported.')
+parser.add_argument('--approach', default='static', choices=['static', 'dynamic'],
+                    help='Post-Training Quantization method.')

best_acc1 = 0

@@ -190,7 +192,7 @@ def eval_func(model):
if args.export and args.export_dtype == 'int8':
    from neural_compressor import PostTrainingQuantConfig
    from neural_compressor import quantization
-    conf = PostTrainingQuantConfig()
+    conf = PostTrainingQuantConfig(approach=args.approach)
    q_model = quantization.fit(model,
                               conf,
                               calib_dataloader=val_loader,
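
The truncated hunk presumably finishes the `fit` call and then hands `q_model` to the export API, as in the docs above; a hedged sketch of that continuation (`args.output_model` is a hypothetical name, and the dummy input mirrors the docs example rather than this file):

```python
# Sketch of the presumed continuation of main.py; not the literal file content.
from neural_compressor.config import Torch2ONNXConfig

int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format=args.quant_format,  # 'QDQ' or 'QOperator'; dynamic pairs with QOperator
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
q_model.export(args.output_model, int8_onnx_config)
```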
@@ -11,7 +11,7 @@ function main {
# init params
function init_params {
  dtype='fp32'
-  quant_format='QDQ' # or QLinear
+  quant_format='QDQ' # or QOperator
  tuned_checkpoint=saved_results
  for var in "$@"
  do
@@ -31,6 +31,9 @@ function init_params {
      --quant_format=*)
          quant_format=$(echo $var |cut -f2 -d=)
      ;;
+      --approach=*)
+          approach=$(echo $var |cut -f2 -d=)
+      ;;
    esac
  done

@@ -48,6 +51,7 @@ function run_tuning {
        --export \
        --export_dtype ${dtype} \
        --quant_format ${quant_format} \
+        --approach ${approach} \
        ${dataset_location}

}
@@ -45,7 +45,7 @@ Please pass in the name of dataset, supported datasets are 'mrpc', 'qqp', 'qnli'
# export fp32 model
bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=fp32 --output_model=bert-fp32.onnx
# export int8 model
-bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=int8 --quant_format=[QDQ/QLinear] --output_model=bert-int8.onnx
+bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=int8 --quant_format=[QDQ/QOperator] --output_model=bert-int8.onnx --approach=[static|dynamic]
```

### 2. Get the benchmark results of exported and tuned models, including Batch_size and Throughput:
@@ -11,7 +11,7 @@ function main {
# init params
function init_params {
  dtype='fp32'
-  quant_format='QDQ' # or QLinear
+  quant_format='QDQ' # or QOperator
  for var in "$@"
  do
    case $var in
@@ -30,6 +30,9 @@ function init_params {
      --quant_format=*)
          quant_format=$(echo $var |cut -f2 -d=)
      ;;
+      --approach=*)
+          approach=$(echo $var |cut -f2 -d=)
+      ;;
    esac
  done

@@ -60,6 +63,7 @@ function run_tuning {
        --quant_format ${quant_format} \
        --output_dir ${tuned_checkpoint} \
        --overwrite_output_dir \
+        --approach ${approach} \
        ${extra_cmd}
}

@@ -190,7 +190,7 @@ class ModelArguments:
default="fp32", metadata={"help": "choose the data type [fp32/int8] of PyTorch model to be exported."}
)
quant_format: str = field(
default="QDQ", metadata={"help": "choose the format [QDQ/QLinear] of int8 ONNX model exported."}
default="QDQ", metadata={"help": "choose the format [QDQ/QOperator] of int8 ONNX model exported."}
)
output_model: str = field(
default="model.onnx", metadata={"help": "the name of exported model."}
@@ -210,6 +210,12 @@ class ModelArguments:
"help": "The inference iterations to run for benchmark."
},
)
approach: str = field(
default='static',
metadata={
"help": "Post-Training Quantization method."
},
)


def main():
@@ -541,13 +547,18 @@ def eval_func(model):
            strategy_kwargs={"confidence_batches": 1},
            max_trials=600,
        )
-        conf = PostTrainingQuantConfig(
-            approach="static",
-            quant_level=1,
-            tuning_criterion=tuning_criterion,
-            op_type_dict={"Embedding":FP32},
-            calibration_sampling_size=[300],
-        )
+        if model_args.approach == "static":
+            conf = PostTrainingQuantConfig(
+                approach=model_args.approach,
+                quant_level=1,
+                tuning_criterion=tuning_criterion,
+                op_type_dict={"Embedding":FP32},
+                calibration_sampling_size=[300],
+            )
+        elif model_args.approach == "dynamic":
+            conf = PostTrainingQuantConfig(
+                approach=model_args.approach,
+            )
        q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
        from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
        save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
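
Downstream of this hunk (not shown on this page), the quantized model would be exported through the same `Torch2ONNXConfig` flow; a sketch for the BERT case, where the dummy encodings, sequence length, and axis names are assumptions (`model_args.output_model` and `model_args.quant_format` do appear in the field list above):

```python
import torch
from neural_compressor.config import Torch2ONNXConfig

# Hedged sketch: the dummy inputs and dynamic axes below are illustrative only.
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format=model_args.quant_format,  # dynamic PTQ pairs with 'QOperator'
    example_inputs=(
        torch.ones(1, 128, dtype=torch.long),  # input_ids
        torch.ones(1, 128, dtype=torch.long),  # attention_mask
    ),
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
    },
)
q_model.export(model_args.output_model, int8_onnx_config)
```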