PT2ONNX dynamic quantization export (#988)
Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored Jun 30, 2023
1 parent d869227 commit 1655326
Showing 11 changed files with 333 additions and 110 deletions.
12 changes: 4 additions & 8 deletions docs/source/export.md
@@ -37,15 +37,15 @@ Here is the workflow of our export API for PyTorch/Tensorflow FP32/INT8 model.
</tr>
<tr>
<td>Post-Training Static Quantized INT8</td>
-<td>QLinear/QDQ INT8</td>
+<td>QOperator/QDQ INT8</td>
</tr>
<tr>
<td>Post-Training Dynamic Quantized INT8</td>
-<td>/</td>
+<td>QOperator INT8</td>
</tr>
<tr>
<td>Quantization-aware Training INT8</td>
-<td>QLinear/QDQ INT8</td>
+<td>QOperator/QDQ INT8</td>
</tr>
<tr>
<td rowspan="3">TensorFlow</td>
@@ -63,10 +63,6 @@ Here is the workflow of our export API for PyTorch/Tensorflow FP32/INT8 model.
</tbody>
</table>
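
Per the updated row above, a post-training dynamic quantized PyTorch model now exports directly to QOperator INT8 ONNX (previously unsupported, "/"). A minimal sketch of that path, assuming an installed `neural_compressor` with the APIs shown elsewhere on this page; the model choice, input shape, and file name are illustrative, not from this commit:

```python
import torch
import torchvision.models as models
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.config import Torch2ONNXConfig

# Illustrative model; any FX-traceable torchvision model should work the same way.
model = models.resnet18(pretrained=True)

# Dynamic PTQ quantizes weights ahead of time and activations at runtime,
# so no calibration dataloader is needed.
conf = PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit(model, conf)

# Per the table above, the dynamic path supports only the QOperator format.
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format="QOperator",
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
q_model.export("resnet18-int8-dynamic.onnx", int8_onnx_config)
```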

-> **Note**: Follow these steps to export a post-training dynamic quantized ONNX model from a PyTorch model: \
-1. Export the FP32 PyTorch model to an FP32 ONNX model. \
-2. Use the FP32 ONNX model as the input model for post-training dynamic quantization.
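
The note being removed above described the pre-existing two-step route, which still works. Roughly, as a sketch: step 1 uses the FP32 export config, step 2 runs dynamic PTQ on the resulting ONNX file. The `Model` wrapper import, `quantization.fit` accepting an ONNX path, and the `save` call are assumptions based on the library's documented usage:

```python
import torch
import torchvision.models as models
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.config import Torch2ONNXConfig
from neural_compressor.model import Model  # assumed FP32-export entry point

# Step 1: export the FP32 PyTorch model to an FP32 ONNX model.
fp32_onnx_config = Torch2ONNXConfig(
    dtype="fp32",
    opset_version=14,
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
inc_model = Model(models.resnet18(pretrained=True))
inc_model.export("fp32-model.onnx", fp32_onnx_config)

# Step 2: dynamic post-training quantization on the FP32 ONNX model.
conf = PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit("fp32-model.onnx", conf)
q_model.save("int8-model.onnx")  # saving API for ONNX models; an assumption
```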

## Examples

### PyTorch Model
@@ -96,7 +92,7 @@ from neural_compressor.config import Torch2ONNXConfig
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
-    quant_format="QLinear", # or QDQ
+    quant_format="QOperator", # or QDQ
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=['input'],
    output_names=['output'],
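    # -- The page truncates this snippet here; the remainder below is
    # -- reconstructed from the surrounding docs and is an assumption,
    # -- not the literal file content.
    dynamic_axes={"input": {0: "batch_size"},
                  "output": {0: "batch_size"}},
)
q_model.export('int8-model.onnx', int8_onnx_config)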
32 changes: 32 additions & 0 deletions examples/.config/model_params_pt2onnx.json
@@ -8,6 +8,14 @@
"main_script": "main.py",
"batch_size": 100
},
"resnet18_dynamic": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
"target_model_dataset": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000",
"input_model": "resnet18",
"main_script": "main.py",
"batch_size": 100
},
"resnet50": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
@@ -16,6 +24,14 @@
"main_script": "main.py",
"batch_size": 100
},
"resnet50_dynamic": {
"model_src_dir": "image_recognition/torchvision_models/export/fx",
"source_model_dataset": "/tf_dataset/pytorch/ImageNet/raw",
"target_model_dataset": "/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000",
"input_model": "resnet50",
"main_script": "main.py",
"batch_size": 100
},
"bert_base_MRPC": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
@@ -24,13 +40,29 @@
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_base_MRPC_dynamic": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/base_weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_large_MRPC": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
},
"bert_large_MRPC_dynamic": {
"model_src_dir": "nlp/huggingface_models/text-classification/export/fx",
"source_model_dataset": "mrpc",
"target_model_dataset": "mrpc",
"input_model": "/tf_dataset/pytorch/glue_data/weights/bert_MRPC_output",
"main_script": "run_glue.py",
"batch_size": 64
}
}
}
@@ -34,7 +34,7 @@ Run run_export.sh to get ONNX model from PyTorch model.
# export fp32 model
bash run_export.sh --input_model=resnet50 --dtype=fp32 --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-fp32.onnx
# export int8 model
-bash run_export.sh --input_model=resnet50 --dtype=int8 --quant_format=[QDQ|QLinear] --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-int8.onnx
+bash run_export.sh --input_model=resnet50 --dtype=int8 --quant_format=[QDQ|QOperator] --dataset_location=/path/to/pytorch-imagenet --output_model=resnet50-int8.onnx --approach=[static|dynamic]
```

### 2. To get the benchmark results of the exported and tuned models, including Batch_size and Throughput:
@@ -90,8 +90,10 @@
parser.add_argument('--export', dest='export', action='store_true', help='run export')
parser.add_argument('--export_dtype', default='fp32', choices=['fp32', 'int8'],
                    help='choose the data type [fp32/int8] of PyTorch model to be exported.')
-parser.add_argument('--quant_format', default='QDQ', choices=['QDQ', 'QLinear'],
-                    help='choose the format [QDQ/QLinear] of int8 ONNX model exported.')
+parser.add_argument('--quant_format', default='QDQ', choices=['QDQ', 'QOperator'],
+                    help='choose the format [QDQ/QOperator] of int8 ONNX model exported.')
+parser.add_argument('--approach', default='static', choices=['static', 'dynamic'],
+                    help='Post-Training Quantization method.')

best_acc1 = 0

@@ -190,7 +192,7 @@ def eval_func(model):
if args.export and args.export_dtype == 'int8':
    from neural_compressor import PostTrainingQuantConfig
    from neural_compressor import quantization
-    conf = PostTrainingQuantConfig()
+    conf = PostTrainingQuantConfig(approach=args.approach)
    q_model = quantization.fit(model,
                               conf,
                               calib_dataloader=val_loader,
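
The truncated hunk presumably finishes the `fit` call and then hands `q_model` to the export API, as in the docs above; a hedged sketch of that continuation (`args.output_model` is a hypothetical name, and the dummy input mirrors the docs example rather than this file):

```python
# Sketch of the presumed continuation of main.py; not the literal file content.
from neural_compressor.config import Torch2ONNXConfig

int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format=args.quant_format,  # 'QDQ' or 'QOperator'; dynamic pairs with QOperator
    example_inputs=torch.randn(1, 3, 224, 224),
    input_names=["input"],
    output_names=["output"],
)
q_model.export(args.output_model, int8_onnx_config)
```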
@@ -11,7 +11,7 @@ function main {
# init params
function init_params {
  dtype='fp32'
-  quant_format='QDQ' # or QLinear
+  quant_format='QDQ' # or QOperator
  tuned_checkpoint=saved_results
  for var in "$@"
  do
@@ -31,6 +31,9 @@ function init_params {
      --quant_format=*)
          quant_format=$(echo $var |cut -f2 -d=)
      ;;
+      --approach=*)
+          approach=$(echo $var |cut -f2 -d=)
+      ;;
    esac
  done

@@ -48,6 +51,7 @@ function run_tuning {
        --export \
        --export_dtype ${dtype} \
        --quant_format ${quant_format} \
+        --approach ${approach} \
        ${dataset_location}

}
@@ -45,7 +45,7 @@ Please pass in the name of dataset, supported datasets are 'mrpc', 'qqp', 'qnli'
# export fp32 model
bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=fp32 --output_model=bert-fp32.onnx
# export int8 model
-bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=int8 --quant_format=[QDQ/QLinear] --output_model=bert-int8.onnx
+bash run_export.sh --input_model=[model_name_or_path] --dataset_location=[dataset_name] --dtype=int8 --quant_format=[QDQ/QOperator] --output_model=bert-int8.onnx --approach=[static|dynamic]
```

### 2. Get the benchmark results of exported and tuned models, including Batch_size and Throughput:
@@ -11,7 +11,7 @@ function main {
# init params
function init_params {
  dtype='fp32'
-  quant_format='QDQ' # or QLinear
+  quant_format='QDQ' # or QOperator
  for var in "$@"
  do
    case $var in
@@ -30,6 +30,9 @@ function init_params {
      --quant_format=*)
          quant_format=$(echo $var |cut -f2 -d=)
      ;;
+      --approach=*)
+          approach=$(echo $var |cut -f2 -d=)
+      ;;
    esac
  done

@@ -60,6 +63,7 @@ function run_tuning {
        --quant_format ${quant_format} \
        --output_dir ${tuned_checkpoint} \
        --overwrite_output_dir \
+        --approach ${approach} \
        ${extra_cmd}
}

@@ -190,7 +190,7 @@ class ModelArguments:
default="fp32", metadata={"help": "choose the data type [fp32/int8] of PyTorch model to be exported."}
)
quant_format: str = field(
default="QDQ", metadata={"help": "choose the format [QDQ/QLinear] of int8 ONNX model exported."}
default="QDQ", metadata={"help": "choose the format [QDQ/QOperator] of int8 ONNX model exported."}
)
output_model: str = field(
default="model.onnx", metadata={"help": "the name of exported model."}
@@ -210,6 +210,12 @@ class ModelArguments:
"help": "The inference iterations to run for benchmark."
},
)
approach: str = field(
default='static',
metadata={
"help": "Post-Training Quantization method."
},
)


def main():
@@ -541,13 +547,18 @@ def eval_func(model):
            strategy_kwargs={"confidence_batches": 1},
            max_trials=600,
        )
-        conf = PostTrainingQuantConfig(
-            approach="static",
-            quant_level=1,
-            tuning_criterion=tuning_criterion,
-            op_type_dict={"Embedding":FP32},
-            calibration_sampling_size=[300],
-        )
+        if model_args.approach == "static":
+            conf = PostTrainingQuantConfig(
+                approach=model_args.approach,
+                quant_level=1,
+                tuning_criterion=tuning_criterion,
+                op_type_dict={"Embedding":FP32},
+                calibration_sampling_size=[300],
+            )
+        elif model_args.approach == "dynamic":
+            conf = PostTrainingQuantConfig(
+                approach=model_args.approach,
+            )
        q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
        from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
        save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
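
Downstream of this hunk (not shown on this page), the quantized model would be exported through the same `Torch2ONNXConfig` flow; a sketch for the BERT case, where the dummy encodings, sequence length, and axis names are assumptions (`model_args.output_model` and `model_args.quant_format` do appear in the field list above):

```python
import torch
from neural_compressor.config import Torch2ONNXConfig

# Hedged sketch: the dummy inputs and dynamic axes below are illustrative only.
int8_onnx_config = Torch2ONNXConfig(
    dtype="int8",
    opset_version=14,
    quant_format=model_args.quant_format,  # dynamic PTQ pairs with 'QOperator'
    example_inputs=(
        torch.ones(1, 128, dtype=torch.long),  # input_ids
        torch.ones(1, 128, dtype=torch.long),  # attention_mask
    ),
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
    },
)
q_model.export(model_args.output_model, int8_onnx_config)
```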