diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
index 2480fe54c7b..e0b08c699ad 100644
--- a/examples/.config/model_params_onnxrt.json
+++ b/examples/.config/model_params_onnxrt.json
@@ -46,7 +46,7 @@
"new_benchmark": true
},
"bert_base_MRPC_static": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_static.yaml",
@@ -55,7 +55,7 @@
"new_benchmark": true
},
"bert_base_MRPC_dynamic": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_dynamic.yaml",
@@ -64,7 +64,7 @@
"new_benchmark": true
},
"distilbert_base_MRPC": {
- "model_src_dir": "language_translation/distilbert/quantization/ptq",
+ "model_src_dir": "nlp/distilbert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx",
"yaml": "distilbert.yaml",
@@ -73,7 +73,7 @@
"new_benchmark": true
},
"mobilebert_MRPC": {
- "model_src_dir": "language_translation/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx",
"yaml": "mobilebert.yaml",
@@ -82,7 +82,7 @@
"new_benchmark": true
},
"roberta_base_MRPC": {
- "model_src_dir": "language_translation/roberta/quantization/ptq",
+ "model_src_dir": "nlp/roberta/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx",
"yaml": "roberta.yaml",
@@ -118,7 +118,7 @@
"new_benchmark": true
},
"bert_squad_model_zoo": {
- "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx",
"yaml": "bert.yaml",
@@ -127,7 +127,7 @@
"new_benchmark": true
},
"mobilebert_squad_mlperf": {
- "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf.onnx",
"yaml": "mobilebert.yaml",
@@ -136,7 +136,7 @@
"new_benchmark": true
},
"gpt2_lm_head_wikitext_model_zoo": {
- "model_src_dir": "language_translation/onnx_model_zoo/gpt2/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/gpt2/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/wikitext/wikitext-2-raw/",
"input_model": "/tf_dataset2/models/onnx/gpt2/gpt2_lm_head_wikitext_model_zoo.onnx",
"yaml": "gpt2.yaml",
@@ -352,7 +352,7 @@
"new_benchmark": true
},
"bert_base_MRPC_static_qdq": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_qdq.yaml",
@@ -361,7 +361,7 @@
"new_benchmark": true
},
"distilbert_base_MRPC_qdq": {
- "model_src_dir": "language_translation/distilbert/quantization/ptq",
+ "model_src_dir": "nlp/distilbert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx",
"yaml": "distilbert_qdq.yaml",
@@ -370,7 +370,7 @@
"new_benchmark": true
},
"mobilebert_MRPC_qdq": {
- "model_src_dir": "language_translation/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx",
"yaml": "mobilebert_qdq.yaml",
@@ -379,7 +379,7 @@
"new_benchmark": true
},
"roberta_base_MRPC_qdq": {
- "model_src_dir": "language_translation/roberta/quantization/ptq",
+ "model_src_dir": "nlp/roberta/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx",
"yaml": "roberta_qdq.yaml",
@@ -415,7 +415,7 @@
"new_benchmark": true
},
"bert_squad_model_zoo_qdq": {
- "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx",
"yaml": "bert_qdq.yaml",
@@ -424,7 +424,7 @@
"new_benchmark": true
},
"mobilebert_squad_mlperf_qdq": {
- "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf-13.onnx",
"yaml": "mobilebert_qdq.yaml",
@@ -631,13 +631,103 @@
"new_benchmark": true
},
"BiDAF": {
- "model_src_dir": "language_translation/onnx_model_zoo/BiDAF/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/BiDAF/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad/dev-v1.1.json",
"input_model": "/tf_dataset2/models/onnx/BiDAF/bidaf-11.onnx",
"yaml": "bidaf.yaml",
"strategy": "basic",
"batch_size": 1,
"new_benchmark": true
+ },
+ "hf_bert-base-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_bert-base-uncased_dynamic/bert-base-uncased-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_roberta-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_roberta-base_dynamic/roberta-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_xlm-roberta-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_xlm-roberta-base_dynamic/xlm-roberta-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_camembert-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_camembert-base_dynamic/camembert-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_MiniLM-L12-H384-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L12-H384-uncased_dynamic/MiniLM-L12-H384-uncased-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_distilbert-base-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_distilbert-base-uncased_dynamic/distilbert-base-uncased-finetuned-sst-2-english.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_albert-base-v2_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_albert-base-v2_dynamic/albert-base-v2-sst2.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_MiniLM-L6-H384-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L6-H384-uncased_dynamic/MiniLM-L6-H384-uncased-sst2.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_spanbert_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq",
+ "dataset_location": "/tf_dataset2/datasets/squad",
+ "input_model": "/tf_dataset2/models/onnx/hf_spanbert_dynamic/spanbert-finetuned-squadv1.onnx",
+ "yaml": "qa_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_bert-base-multilingual-cased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq",
+ "dataset_location": "/tf_dataset2/datasets/squad",
+ "input_model": "/tf_dataset2/models/onnx/hf_bert-base-multilingual-cased_dynamic/bert-base-multilingual-cased-finetuned-squad.onnx",
+ "yaml": "qa_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
}
}
}
diff --git a/examples/README.md b/examples/README.md
index 7342bbe1e8a..0698d8e7d41 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -855,55 +855,115 @@ Intel® Neural Compressor validated examples with multiple compression technique
BERT base MRPC |
Natural Language Processing |
Post-Training Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BERT base MRPC |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
DistilBERT base MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
Mobile bert MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
Roberta base MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BERT SQuAD |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
GPT2 lm head WikiText |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
MobileBERT SQuAD MLPerf |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BiDAF |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
+
+
+ BERT base uncased MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Roberta base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ XLM Roberta base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Camembert base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ MiniLM L12 H384 uncased MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Distilbert base uncased SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Albert base v2 SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ MiniLM L6 H384 uncased SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Spanbert SQuAD (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Bert base multilingual cased SQuAD (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
SSD MobileNet V1 |
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/README.md b/examples/onnxrt/nlp/bert/quantization/ptq/README.md
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/README.md
rename to examples/onnxrt/nlp/bert/quantization/ptq/README.md
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/export.py b/examples/onnxrt/nlp/bert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/bert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/main.py b/examples/onnxrt/nlp/bert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/bert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md b/examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md
new file mode 100644
index 00000000000..55538ff591c
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md
@@ -0,0 +1,43 @@
+# Evaluate performance of ONNX Runtime (HuggingFace Question Answering)
+> ONNX Runtime quantization is under active development. Please use 1.6.0 or a newer version to get more quantization support.
+
+This example loads a question answering model and confirms its accuracy and speed on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) task.
+
+### Environment
+Please use the latest onnx and onnxruntime versions.
+
+### Prepare dataset
+Download the SQuAD dataset from the [SQuAD dataset link](https://rajpurkar.github.io/SQuAD-explorer/).
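+
+For reference, the v1.1 dev split used for evaluation can be fetched directly (a minimal sketch, assuming the standard download URL on the SQuAD explorer site):
+
+```bash
+# download the SQuAD v1.1 dev set
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+```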
+
+### Prepare model
+
+Supported model identifiers from [huggingface.co](https://huggingface.co/):
+
+| Model Identifier |
+|:-----------------------------------------------:|
+| mrm8488/spanbert-finetuned-squadv1 |
+| salti/bert-base-multilingual-cased-finetuned-squad |
+
+
+```bash
+python export.py --model_name_or_path=mrm8488/spanbert-finetuned-squadv1  # or another supported model identifier
+```
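+
+`export.py` writes the exported model as `<model-name>.onnx` (e.g. `spanbert-finetuned-squadv1.onnx`) in the current working directory; pass that path as `--input_model` in the commands below.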
+
+### Quantization
+
+Dynamic quantization (`--input_model` is the exported *.onnx model):
+
+```bash
+bash run_tuning.sh --input_model=/path/to/model \
+ --output_model=/path/to/model_tune \
+ --config=qa_dynamic.yaml
+```
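+
+`qa_dynamic.yaml` (added alongside this example) selects the `onnxrt_integerops` framework and the `post_training_dynamic_quant` approach, so weights are quantized offline and activations are quantized dynamically at runtime.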
+
+### Benchmark
+
+```bash
+bash run_benchmark.sh --input_model=/path/to/model \
+                      --config=qa_dynamic.yaml \
+                      --mode=performance  # or accuracy
+```
+
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py
new file mode 100644
index 00000000000..08824f90405
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py
@@ -0,0 +1,50 @@
+import argparse
+
+import torch
+from transformers import AutoConfig, AutoModelForQuestionAnswering
+
+def export_onnx_model(args, model):
+ with torch.no_grad():
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['token_type_ids'],
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=11, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'token_type_ids',
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'token_type_ids' : symbolic_names,
+ 'attention_mask' : symbolic_names})
+ print("ONNX Model exported to {0}".format(args.output_model))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Export huggingface onnx model',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['mrm8488/spanbert-finetuned-squadv1',
+ 'salti/bert-base-multilingual-cased-finetuned-squad'],
+ help='pretrained model name or path ')
+ parser.add_argument(
+ '--max_len',
+ type=int,
+ default=512,
+ help='Maximum length of the sentence pairs')
+ args = parser.parse_args()
+ args.output_model = args.model_name_or_path.split('/')[1] + '.onnx'
+
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ args.model_name_or_path,
+ config=AutoConfig.from_pretrained(args.model_name_or_path))
+
+ export_onnx_model(args, model)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py
new file mode 100644
index 00000000000..1866be2d602
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py
@@ -0,0 +1,614 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+from datasets import load_dataset, load_metric
+
+from torch.utils.data import Dataset, DataLoader
+
+import onnx
+import onnxruntime as ort
+import numpy as np
+import transformers
+from trainer_qa import QuestionAnsweringTrainer
+from transformers import (
+ AutoConfig,
+ AutoModelForQuestionAnswering,
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ EvalPrediction,
+ HfArgumentParser,
+ PreTrainedTokenizerFast,
+ TrainingArguments,
+ default_data_collator,
+ set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+from utils_qa import postprocess_qa_predictions
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+# check_min_version("4.22.0.dev0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": (
+ "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ )
+ },
+ )
+ model_path: str = field(
+ default=None,
+ metadata={"help": ("onnx model path")},
+ )
+ tune: bool = field(
+ default=False,
+ metadata={"help": ("INC tune")},
+ )
+ benchmark: bool = field(
+ default=False,
+ metadata={"help": ("INC benchmark")},
+ )
+ mode: str = field(
+ default='performance',
+ metadata={"help": ("INC benchmark mode")},
+ )
+ config: str = field(
+ default='bert-base-multilingual-cased-static.yaml',
+ metadata={"help": ("INC config")},
+ )
+ save_path: str = field(
+ default=None,
+ metadata={"help": ("onnx int8 model path")},
+ )
+ num_heads: int = field(
+ default=12,
+ metadata={"help": ("onnx model optimize num_heads")},
+ )
+ hidden_size: int = field(
+ default=768,
+ metadata={"help": ("onnx model optimize hidden_size")},
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default='squad', metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+ )
+ train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_seq_length: int = field(
+ default=512,
+ metadata={
+ "help": (
+ "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ )
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=True,
+ metadata={
+ "help": (
+ "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when"
+ " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)."
+ )
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ )
+ },
+ )
+ version_2_with_negative: bool = field(
+ default=False, metadata={"help": "If true, some of the examples do not have an answer."}
+ )
+ null_score_diff_threshold: float = field(
+ default=0.0,
+ metadata={
+ "help": (
+ "The threshold used to select the null answer: if the best answer has a score that is less than "
+ "the score of the null answer minus this threshold, the null answer is selected for this example. "
+ "Only useful when `version_2_with_negative=True`."
+ )
+ },
+ )
+ doc_stride: int = field(
+ default=256,
+ metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+ )
+ n_best_size: int = field(
+ default=20,
+ metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
+ )
+ max_answer_length: int = field(
+ default=30,
+ metadata={
+ "help": (
+ "The maximum length of an answer that can be generated. This is needed because the start "
+ "and end predictions are not conditioned on one another."
+ )
+ },
+ )
+
+ def __post_init__(self):
+ if (
+ self.dataset_name is None
+ and self.train_file is None
+ and self.validation_file is None
+ and self.test_file is None
+ ):
+ raise ValueError("Need either a dataset name or a training/validation file/test_file.")
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
+
+class SquadDataset(Dataset):
+ def __init__(self, dataloader, bs=1):
+ self.dataloader = dataloader
+ self.bs = bs
+ self.input_ids = []
+ self.token_type_ids = []
+ self.attention_mask = []
+ for idx, inputs in enumerate(self.dataloader):
+ self.input_ids.append(np.array(inputs['input_ids'], dtype=np.int64))
+ self.token_type_ids.append(np.array(inputs['token_type_ids'], dtype=np.int64))
+ self.attention_mask.append(np.array(inputs['attention_mask'], dtype=np.int64))
+
+ def __getitem__(self, index):
+ return (self.input_ids[index:index + self.bs][0][0], self.token_type_ids[index:index + self.bs][0][0], self.attention_mask[index:index + self.bs][0][0]), 0
+ # return (self.input_ids[index:index + self.bs][0], self.attention_mask[index:index + self.bs][0], self.token_type_ids[index:index + self.bs][0]), 0
+
+ def __len__(self):
+ assert len(self.input_ids) == len(self.attention_mask)
+ assert len(self.input_ids) == len(self.token_type_ids)
+ return len(self.input_ids)
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ training_args.do_eval = True
+ training_args.per_device_eval_batch_size = 1
+
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
+ send_example_telemetry("run_qa", model_args, data_args)
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+ # 'text' is found. You can easily tweak this behavior (see below).
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ raw_datasets = load_dataset(
+ data_args.dataset_name,
+ data_args.dataset_config_name,
+ cache_dir=model_args.cache_dir,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ print(type(raw_datasets))
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ raw_datasets = load_dataset(
+ extension,
+ data_files=data_files,
+ field="data",
+ cache_dir=model_args.cache_dir,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+ config = AutoConfig.from_pretrained(
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=True,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Tokenizer check: this script requires a fast tokenizer.
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
+ raise ValueError(
+ "This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
+ " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
+ " this requirement"
+ )
+
+ # Preprocessing the datasets.
+ # Preprocessing is slightly different for training and evaluation.
+ if training_args.do_train:
+ column_names = raw_datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = raw_datasets["validation"].column_names
+ else:
+ column_names = raw_datasets["test"].column_names
+ question_column_name = "question" if "question" in column_names else column_names[0]
+ context_column_name = "context" if "context" in column_names else column_names[1]
+ answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+ # Padding side determines if we do (question|context) or (context|question).
+ pad_on_right = tokenizer.padding_side == "right"
+
+ if data_args.max_seq_length > tokenizer.model_max_length:
+ logger.warning(
+ f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+ f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+ )
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+ # Validation preprocessing
+ def prepare_validation_features(examples):
+ # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+ # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+ # left whitespace
+ examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+ # in one example possibly giving several features when a context is long, each of those features having a
+ # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+ # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+ # corresponding example_id and we will store the offset mappings.
+ tokenized_examples["example_id"] = []
+
+ for i in range(len(tokenized_examples["input_ids"])):
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+ context_index = 1 if pad_on_right else 0
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+ # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+ # position is part of the context or not.
+ tokenized_examples["offset_mapping"][i] = [
+ (o if sequence_ids[k] == context_index else None)
+ for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+ ]
+
+ return tokenized_examples
+
+ if training_args.do_eval:
+ if "validation" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_examples = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ # We will select sample from whole data
+ max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+ eval_examples = eval_examples.select(range(max_eval_samples))
+ # Validation Feature Creation
+ with training_args.main_process_first(desc="validation dataset map pre-processing"):
+ eval_dataset = eval_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on validation dataset",
+ )
+ if data_args.max_eval_samples is not None:
+ # During Feature creation dataset samples might increase, we will select required samples again
+ max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+ eval_dataset = eval_dataset.select(range(max_eval_samples))
+
+
+ # Data collator
+ # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
+ # collator.
+ data_collator = (
+ default_data_collator
+ if data_args.pad_to_max_length
+ else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
+ )
+
+ # Post-processing:
+ def post_processing_function(examples, features, predictions, stage="eval"):
+ # Post-processing: we match the start logits and end logits to answers in the original context.
+ predictions = postprocess_qa_predictions(
+ examples=examples,
+ features=features,
+ predictions=predictions,
+ version_2_with_negative=data_args.version_2_with_negative,
+ n_best_size=data_args.n_best_size,
+ max_answer_length=data_args.max_answer_length,
+ null_score_diff_threshold=data_args.null_score_diff_threshold,
+ output_dir=training_args.output_dir,
+ log_level=log_level,
+ prefix=stage,
+ )
+ # Format the result to the format the metric expects.
+ if data_args.version_2_with_negative:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+ ]
+ else:
+ formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+ references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+ return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+ metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+ def compute_metrics(p: EvalPrediction):
+ return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+ # Initialize our Trainer
+ trainer = QuestionAnsweringTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ eval_examples=eval_examples if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ post_process_function=post_processing_function,
+ compute_metrics=compute_metrics,
+ )
+
+ eval_dataloader = trainer.get_dataloader(eval_dataset)
+
+ def eval_func(model, *args):
+ logger.info("*** Evaluate ***")
+ metrics = trainer.evaluate(onnx_model=model)
+ print('eval_func', metrics)
+
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+ return metrics['eval_f1']
+
+ if model_args.tune:
+ from onnxruntime.transformers import optimizer
+ from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
+ opt_options = BertOptimizationOptions('bert')
+ opt_options.enable_embed_layer_norm = False
+
+ model_optimizer = optimizer.optimize_model(
+ model_args.model_path,
+ 'bert',
+ num_heads=model_args.num_heads,
+ hidden_size=model_args.hidden_size,
+ optimization_options=opt_options)
+ model = model_optimizer.model
+
+ b_dataloader = SquadDataset(eval_dataloader)
+ b_dataloader = DataLoader(b_dataloader)
+ from neural_compressor.experimental import Quantization, common
+ quantize = Quantization(model_args.config)
+ quantize.model = common.Model(model)
+ quantize.calib_dataloader = b_dataloader
+ quantize.eval_func = eval_func
+ q_model = quantize()
+ q_model.save(model_args.save_path)
+
+ if model_args.benchmark:
+ from neural_compressor.experimental import Benchmark, common
+ model = onnx.load(model_args.model_path)
+ if model_args.mode == 'performance':
+ from neural_compressor.data import DATALOADERS, DATASETS
+ session = ort.InferenceSession(model_args.model_path, None)
+ input_tensors = session.get_inputs()
+ shape = []
+ for i in range(len(input_tensors)):
+ shape.append((1, 512))
+ onnx_datasets = DATASETS('onnxrt_integerops')
+ dummy_dataset = onnx_datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True)
+ evaluator = Benchmark(model_args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = common.DataLoader(dummy_dataset)
+ evaluator(model_args.mode)
+ elif model_args.mode == 'accuracy':
+ b_dataloader = SquadDataset(eval_dataloader)
+ b_dataloader = DataLoader(b_dataloader)
+ evaluator = Benchmark(model_args.config)
+ evaluator.b_dataloader = b_dataloader
+ evaluator.b_func = eval_func
+ evaluator.model = common.Model(model)
+ evaluator(model_args.mode)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml
new file mode 100644
index 00000000000..e76dfbc315c
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 1.0
+
+model: # mandatory. used to specify model specific information.
+ name: question_answering
+ framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.
+
+evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
+ performance: # optional. used to benchmark performance of passing model.
+ warmup: 0
+ iteration: 100
+ configs:
+ cores_per_instance: 28
+ num_of_instance: 1
+
+quantization:
+ approach: post_training_dynamic_quant # optional. default value is post_training_static_quant.
+
+tuning:
+ accuracy_criterion:
+ relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%.
+ random_seed: 9527 # optional. random seed for deterministic tuning.
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt
new file mode 100644
index 00000000000..30412bea132
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt
@@ -0,0 +1,7 @@
+datasets
+onnx
+onnxruntime
+onnxruntime-extensions; python_version < '3.10'
+transformers==4.21.0
+torch
+tensorboard
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh
new file mode 100644
index 00000000000..2eef1e0e4b3
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -x
+
+function main {
+
+ init_params "$@"
+ run_benchmark
+
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --mode=*)
+ mode=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+ if [[ "${input_model}" =~ "spanbert" ]]; then
+ model_name_or_path="mrm8488/spanbert-finetuned-squadv1"
+ elif [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad"
+ fi
+
+ python main.py \
+ --model_path ${input_model} \
+ --config ${config} \
+ --mode=${mode} \
+ --model_name_or_path=${model_name_or_path} \
+ --output_dir './output' \
+ --benchmark
+
+}
+
+main "$@"
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh
new file mode 100644
index 00000000000..9e0eb872250
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ run_tuning
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --output_model=*)
+ output_model=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_tuning
+function run_tuning {
+
+ if [[ "${input_model}" =~ "spanbert" ]]; then
+ model_name_or_path="mrm8488/spanbert-finetuned-squadv1"
+ num_heads=12
+ hidden_size=768
+ elif [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad"
+ num_heads=12
+ hidden_size=768
+ fi
+
+ python main.py \
+ --model_path ${input_model} \
+ --save_path ${output_model} \
+ --config ${config} \
+ --output_dir './output' \
+ --model_name_or_path=${model_name_or_path} \
+ --num_heads ${num_heads} \
+ --hidden_size ${hidden_size} \
+ --tune
+}
+
+main "$@"
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py
new file mode 100644
index 00000000000..2da65c8a9f0
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py
@@ -0,0 +1,489 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A subclass of `Trainer` specific to Question-Answering tasks
+"""
+
+from transformers import Trainer, is_torch_tpu_available
+from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
+from transformers.trainer_utils import (
+ PREFIX_CHECKPOINT_DIR,
+ BestRun,
+ EvalLoopOutput,
+ EvalPrediction,
+ FSDPOption,
+ HPSearchBackend,
+ HubStrategy,
+ IntervalStrategy,
+ PredictionOutput,
+ RemoveColumnsCollator,
+ ShardedDDPOption,
+ TrainerMemoryTracker,
+ TrainOutput,
+ default_compute_objective,
+ default_hp_space,
+ denumpify_detensorize,
+ enable_full_determinism,
+ find_executable_batch_size,
+ get_last_checkpoint,
+ has_length,
+ number_of_arguments,
+ seed_worker,
+ set_seed,
+ speed_metrics,
+)
+from transformers.utils import (
+ is_sagemaker_mp_enabled,
+ is_torch_tpu_available,
+ logging,
+)
+from transformers.trainer_pt_utils import (
+ IterableDatasetShard,
+ LabelSmoother,
+ LengthGroupedSampler,
+ SequentialDistributedSampler,
+ ShardSampler,
+ distributed_broadcast_scalars,
+ distributed_concat,
+ find_batch_size,
+ get_module_class_from_name,
+ get_parameter_names,
+ nested_concat,
+ nested_detach,
+ nested_numpify,
+ nested_truncate,
+ nested_xla_mesh_reduce,
+ reissue_pt_warnings
+)
+import onnxruntime
+import onnx
+from torch.utils.data import DataLoader
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, NamedTuple
+import numpy as np
+
+import torch
+from torch import nn
+from packaging import version  # needed for the SageMaker model parallel version check below
+
+logger = logging.get_logger(__name__)
+
+if is_sagemaker_mp_enabled():
+ import smdistributed.modelparallel.torch as smp
+ from smdistributed.modelparallel import __version__ as SMP_VERSION
+
+ IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
+
+ from transformers.trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat
+
+
+if is_torch_tpu_available(check_device=False):
+ import torch_xla.core.xla_model as xm
+ import torch_xla.debug.metrics as met
+ import torch_xla.distributed.parallel_loader as pl
+
+def has_length(dataset):
+ """
+ Checks if the dataset implements __len__() and it doesn't raise an error
+ """
+ try:
+ return len(dataset) is not None
+ except TypeError:
+ # TypeError: len() of unsized object
+ return False
+
+class EvalLoopOutput(NamedTuple):
+ predictions: Union[np.ndarray, Tuple[np.ndarray]]
+ label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
+ metrics: Optional[Dict[str, float]]
+ num_samples: Optional[int]
+
+class QuestionAnsweringTrainer(Trainer):
+ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.eval_examples = eval_examples
+ self.post_process_function = post_process_function
+
+ def get_dataloader(self, eval_dataset):
+ return self.get_eval_dataloader(eval_dataset)
+
+ def evaluate(self, onnx_model, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
+ eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
+ eval_dataloader = self.get_eval_dataloader(eval_dataset)
+ eval_examples = self.eval_examples if eval_examples is None else eval_examples
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = self.evaluation_loop
+ print('eval_dataloader', type(eval_dataloader))
+ print('onnx_model', type(onnx_model))
+ try:
+ output = eval_loop(
+ dataloader=eval_dataloader,
+ description="Evaluation",
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ onnx_model=onnx_model,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is not None and self.compute_metrics is not None:
+ eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
+ metrics = self.compute_metrics(eval_preds)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ self.log(metrics)
+ else:
+ metrics = {}
+
+ if self.args.tpu_metrics_debug or self.args.debug:
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+
+ self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
+ return metrics
+
+ def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
+ predict_dataloader = self.get_test_dataloader(predict_dataset)
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
+ try:
+ output = eval_loop(
+ predict_dataloader,
+ description="Prediction",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is None or self.compute_metrics is None:
+ return output
+
+ predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
+ metrics = self.compute_metrics(predictions)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)
+
+ def evaluation_loop(
+ self,
+ dataloader: DataLoader,
+ description: str,
+ prediction_loss_only: Optional[bool] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ onnx_model: onnx.onnx_ml_pb2.ModelProto = None,
+ ) -> EvalLoopOutput:
+ """
+ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
+ Works both with or without labels.
+ """
+ args = self.args
+
+ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only
+
+ # if eval is called w/o train init deepspeed here
+ if args.deepspeed and not self.deepspeed:
+
+ # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
+ # from the checkpoint eventually
+ deepspeed_engine, _, _ = deepspeed_init(
+ self, num_training_steps=0, resume_from_checkpoint=None, inference=True
+ )
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+
+ model = self._wrap_model(self.model, training=False, dataloader=dataloader)
+
+ # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
+ # while ``train`` is running, cast it to the right dtype first and then put on device
+ if not self.is_in_train:
+ if args.fp16_full_eval:
+ model = model.to(dtype=torch.float16, device=args.device)
+ elif args.bf16_full_eval:
+ model = model.to(dtype=torch.bfloat16, device=args.device)
+
+ batch_size = self.args.eval_batch_size
+
+ logger.info(f"***** Running {description} *****")
+ if has_length(dataloader):
+ logger.info(f" Num examples = {self.num_examples(dataloader)}")
+ else:
+ logger.info(" Num examples: Unknown")
+ logger.info(f" Batch size = {batch_size}")
+
+ model.eval()
+
+ self.callback_handler.eval_dataloader = dataloader
+ # Do this before wrapping.
+ eval_dataset = getattr(dataloader, "dataset", None)
+
+ if is_torch_tpu_available():
+ dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device)
+
+ if args.past_index >= 0:
+ self._past = None
+
+ # Initialize containers
+ # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
+ losses_host = None
+ preds_host = None
+ labels_host = None
+ inputs_host = None
+
+ # losses/preds/labels on CPU (final containers)
+ all_losses = None
+ all_preds = None
+ all_labels = None
+ all_inputs = None
+ # Will be useful when we have an iterable dataset so don't know its length.
+
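+        # Create an ONNX Runtime inference session once from the serialized ONNX model;
+        # prediction_step below feeds each evaluation batch through this session.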
+ onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString(), None)
+ observed_num_examples = 0
+ # Main evaluation loop
+ for step, inputs in enumerate(dataloader):
+ # Update the observed num examples
+ observed_batch_size = find_batch_size(inputs)
+ if observed_batch_size is not None:
+ observed_num_examples += observed_batch_size
+ # For batch samplers, batch_size is not known by the dataloader in advance.
+ if batch_size is None:
+ batch_size = observed_batch_size
+
+ # Prediction step
+ loss, logits, labels = self.prediction_step(onnx_session, model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
+
+ if is_torch_tpu_available():
+ xm.mark_step()
+
+ # Update containers on host
+ if loss is not None:
+ losses = self._nested_gather(loss.repeat(batch_size))
+ losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
+ if labels is not None:
+ labels = self._pad_across_processes(labels)
+ labels = self._nested_gather(labels)
+ labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_decode = self._pad_across_processes(inputs_decode)
+ inputs_decode = self._nested_gather(inputs_decode)
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
+ if logits is not None:
+ logits = self._pad_across_processes(logits)
+ logits = self._nested_gather(logits)
+ if self.preprocess_logits_for_metrics is not None:
+ logits = self.preprocess_logits_for_metrics(logits, labels)
+ preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
+ self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
+
+ # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
+ if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode
+ if all_inputs is None
+ else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = (
+ labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+ )
+
+ # Set back to None to begin a new accumulation
+ losses_host, preds_host, inputs_host, labels_host = None, None, None, None
+
+ if args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of the evaluation loop
+ delattr(self, "_past")
+
+ # Gather all remaining tensors and put them back on the CPU
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+
+ # Number of samples
+ if has_length(eval_dataset):
+ num_samples = len(eval_dataset)
+ # The instance check is weird and does not actually check for the type, but whether the dataset has the right
+ # methods. Therefore we need to make sure it also has the attribute.
+ elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"):
+ num_samples = eval_dataset.num_examples
+ else:
+ if has_length(dataloader):
+ num_samples = self.num_examples(dataloader)
+ else: # both len(dataloader.dataset) and len(dataloader) fail
+ num_samples = observed_num_examples
+
+ # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of
+        # samples has been rounded to a multiple of batch_size, so we truncate.
+ if all_losses is not None:
+ all_losses = all_losses[:num_samples]
+ if all_preds is not None:
+ all_preds = nested_truncate(all_preds, num_samples)
+ if all_labels is not None:
+ all_labels = nested_truncate(all_labels, num_samples)
+ if all_inputs is not None:
+ all_inputs = nested_truncate(all_inputs, num_samples)
+
+ # Metrics!
+ if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
+ else:
+ metrics = {}
+
+ # To be JSON-serializable, we need to remove numpy types or zero-d tensors
+ metrics = denumpify_detensorize(metrics)
+
+ if all_losses is not None:
+ metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
+
+ def prediction_step(
+ self,
+ onnx_session,
+ model: nn.Module,
+ inputs: Dict[str, Union[torch.Tensor, Any]],
+ prediction_loss_only: bool,
+ ignore_keys: Optional[List[str]] = None,
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Perform an evaluation step on `model` using `inputs`.
+ Subclass and override to inject custom behavior.
+ Args:
+ model (`nn.Module`):
+ The model to evaluate.
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+ The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+ argument `labels`. Check your model's documentation for all accepted arguments.
+ prediction_loss_only (`bool`):
+ Whether or not to return the loss only.
+            ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ Return:
+ Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
+ logits and labels (each being optional).
+ """
+ has_labels = all(inputs.get(k) is not None for k in self.label_names)
+ inputs = self._prepare_inputs(inputs)
+ if ignore_keys is None:
+ if hasattr(self.model, "config"):
+ ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
+ else:
+ ignore_keys = []
+
+ # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
+ if has_labels:
+ labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
+ if len(labels) == 1:
+ labels = labels[0]
+ else:
+ labels = None
+
+ with torch.no_grad():
+ if is_sagemaker_mp_enabled():
+ raw_outputs = smp_forward_only(model, inputs)
+ if has_labels:
+ if isinstance(raw_outputs, dict):
+ loss_mb = raw_outputs["loss"]
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ loss_mb = raw_outputs[0]
+ logits_mb = raw_outputs[1:]
+
+ loss = loss_mb.reduce_mean().detach().cpu()
+ logits = smp_nested_concat(logits_mb)
+ else:
+ loss = None
+ if isinstance(raw_outputs, dict):
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
+ else:
+ logits_mb = raw_outputs
+ logits = smp_nested_concat(logits_mb)
+ else:
+ if has_labels:
+ with self.compute_loss_context_manager():
+ loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
+ loss = loss.mean().detach()
+
+ if isinstance(outputs, dict):
+ logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ logits = outputs[1:]
+ else:
+ loss = None
+ with self.compute_loss_context_manager():
+                        # Feed the batch to ONNX Runtime keyed by the graph's input names; each name
+                        # must map to the matching tensor produced by the tokenizer.
+                        data = {"input_ids": np.array(inputs['input_ids'], dtype=np.int64),
+                                "attention_mask": np.array(inputs['attention_mask'], dtype=np.int64),
+                                "token_type_ids": np.array(inputs['token_type_ids'], dtype=np.int64)}
+ outputs2 = onnx_session.run(None, data)
+ logits2 = tuple((torch.from_numpy(outputs2[0]), torch.from_numpy(outputs2[1])))
+ # TODO: this needs to be fixed and made cleaner later.
+ if self.args.past_index >= 0:
+ self._past = outputs[self.args.past_index - 1]
+
+ logits2 = nested_detach(logits2)
+ return (loss, logits2, labels)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py
new file mode 100644
index 00000000000..96af7f1d6bd
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py
@@ -0,0 +1,440 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Post-processing utilities for question answering.
+"""
+import collections
+import json
+import logging
+import os
+from typing import Optional, Tuple
+
+import numpy as np
+from tqdm.auto import tqdm
+
+
+logger = logging.getLogger(__name__)
+
+
+def postprocess_qa_predictions(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ null_score_diff_threshold: float = 0.0,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing function for models that only return start and end logits.
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+ The threshold used to select the null answer: if the best answer has a score that is less than the score of
+ the null answer minus this threshold, the null answer is selected for this example (note that the score of
+ the null answer for an example giving several features is the minimum of the scores for the null answer on
+ each feature: all features must be aligned on the fact they `want` to predict a null answer).
+ Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 2:
+ raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
+ all_start_logits, all_end_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ if version_2_with_negative:
+ scores_diff_json = collections.OrderedDict()
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_prediction = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_logits = all_start_logits[feature_index]
+ end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the
+            # original context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+ # Update minimum null prediction.
+ feature_null_score = start_logits[0] + end_logits[0]
+ if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
+ min_null_prediction = {
+ "offsets": (0, 0),
+ "score": feature_null_score,
+ "start_logit": start_logits[0],
+ "end_logit": end_logits[0],
+ }
+
+ # Go through all possibilities for the `n_best_size` greater start and end logits.
+ start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ for start_index in start_indexes:
+ for end_index in end_indexes:
+ # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+ # to part of the input_ids that are not in the context.
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+ # Don't consider answers with a length that is either < 0 or > max_answer_length.
+ if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+                    # provided).
+ if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+ continue
+
+ prelim_predictions.append(
+ {
+ "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+ "score": start_logits[start_index] + end_logits[end_index],
+ "start_logit": start_logits[start_index],
+ "end_logit": end_logits[end_index],
+ }
+ )
+ if version_2_with_negative and min_null_prediction is not None:
+ # Add the minimum null prediction
+ prelim_predictions.append(min_null_prediction)
+ null_score = min_null_prediction["score"]
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+ # Add back the minimum null prediction if it was removed because of its low score.
+ if (
+ version_2_with_negative
+ and min_null_prediction is not None
+ and not any(p["offsets"] == (0, 0) for p in predictions)
+ ):
+ predictions.append(min_null_prediction)
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
+ predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction. If the null answer is not possible, this is easy.
+ if not version_2_with_negative:
+ all_predictions[example["id"]] = predictions[0]["text"]
+ else:
+ # Otherwise we first need to find the best non-empty prediction.
+ i = 0
+ while predictions[i]["text"] == "":
+ i += 1
+ best_non_null_pred = predictions[i]
+
+ # Then we compare to the null prediction using the threshold.
+ score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
+ scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
+ if score_diff > null_score_diff_threshold:
+ all_predictions[example["id"]] = ""
+ else:
+ all_predictions[example["id"]] = best_non_null_pred["text"]
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+ )
+ nbest_file = os.path.join(
+ output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ start_n_top: int = 5,
+ end_n_top: int = 5,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
+    original contexts. This is the postprocessing function for models that return start and end logits, indices, as well as
+ cls token predictions.
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
+ end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 5:
+ raise ValueError("`predictions` should be a tuple with five elements.")
+ start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_score = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_log_prob = start_top_log_probs[feature_index]
+ start_indexes = start_top_index[feature_index]
+ end_log_prob = end_top_log_probs[feature_index]
+ end_indexes = end_top_index[feature_index]
+ feature_null_score = cls_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the
+            # original context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+ # Update minimum null prediction
+ if min_null_score is None or feature_null_score < min_null_score:
+ min_null_score = feature_null_score
+
+ # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
+ for i in range(start_n_top):
+ for j in range(end_n_top):
+ start_index = int(start_indexes[i])
+ j_index = i * end_n_top + j
+ end_index = int(end_indexes[j_index])
+ # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
+ # p_mask but let's not take any risk)
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+
+ # Don't consider answers with a length negative or > max_answer_length.
+ if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+                    # provided).
+ if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+ continue
+ prelim_predictions.append(
+ {
+ "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+ "score": start_log_prob[i] + end_log_prob[j_index],
+ "start_log_prob": start_log_prob[i],
+ "end_log_prob": end_log_prob[j_index],
+ }
+ )
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0:
+ # Without predictions min_null_score is going to be None and None will cause an exception later
+ min_null_score = -2e-6
+ predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction and set the probability for the null answer.
+ all_predictions[example["id"]] = predictions[0]["text"]
+ if version_2_with_negative:
+ scores_diff_json[example["id"]] = float(min_null_score)
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+ )
+ nbest_file = os.path.join(
+ output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions, scores_diff_json
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md
new file mode 100644
index 00000000000..0afa7d99629
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md
@@ -0,0 +1,56 @@
+# Evaluate performance of ONNX Runtime (Huggingface Text Classification)
+> ONNX Runtime quantization is under active development. Please use version 1.6.0 or newer for broader quantization support.
+
+This example loads a text classification model fine-tuned on a GLUE task and confirms its accuracy and speed based on [GLUE data](https://gluebenchmark.com/).
+
+### Environment
+Please use the latest onnx and onnxruntime versions.
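+
+The Python dependencies for this example are listed in `requirements.txt` in this directory, so a typical setup is:
+
+```shell
+pip install -r requirements.txt
+```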
+
+### Prepare dataset
+Download the GLUE data with the `prepare_data.sh` script.
+
+```shell
+export GLUE_DIR=/path/to/glue_data
+export TASK_NAME=MRPC # or SST
+
+bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME
+```
+
+### Prepare model
+
+Supported model identifiers from [huggingface.co](https://huggingface.co/):
+
+| Model Identifier |
+|:-----------------------------------------------:|
+| Intel/bert-base-uncased-mrpc |
+| Intel/roberta-base-mrpc |
+| Intel/xlm-roberta-base-mrpc |
+| Intel/camembert-base-mrpc |
+| distilbert-base-uncased-finetuned-sst-2-english |
+| Alireza1044/albert-base-v2-sst2 |
+| Intel/MiniLM-L12-H384-uncased-mrpc |
+| philschmid/MiniLM-L6-H384-uncased-sst2 |
+
+```bash
+python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc  # or other supported model identifier
+```
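+
+`export.py` writes the model to the current directory as `<model name>.onnx` (for example, `bert-base-uncased-mrpc.onnx`); pass that file as `--input_model` in the steps below.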
+
+### Quantization
+
+Quantize model with dynamic quantization:
+
+```bash
+# --input_model and --output_model are *.onnx file paths
+bash run_tuning.sh --config=glue_dynamic.yaml \
+                   --input_model=path/to/model \
+                   --output_model=path/to/model_tune \
+                   --data_path=path/to/glue/data
+```
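+
+Note: `run_tuning.sh` (and `run_benchmark.sh` below) picks the model identifier and GLUE task by matching the input model filename against the supported identifiers, so keep the exported `*.onnx` filename unchanged.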
+
+### Benchmark
+
+```bash
+# --input_model is the *.onnx model path
+bash run_benchmark.sh --config=glue_dynamic.yaml \
+                      --input_model=path/to/model \
+                      --data_path=path/to/glue/data \
+                      --mode=performance  # or accuracy
+```
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py
new file mode 100644
index 00000000000..f2a38e747b3
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py
@@ -0,0 +1,72 @@
+import argparse
+
+import torch
+from transformers import AutoConfig, AutoModelForSequenceClassification
+
+def export_onnx_model(args, model):
+ with torch.no_grad():
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+ if args.model_name_or_path in ['Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english']:
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=14, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'attention_mask' : symbolic_names})
+ else:
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['token_type_ids'],
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=14, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'token_type_ids',
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'token_type_ids' : symbolic_names,
+ 'attention_mask' : symbolic_names})
+ print("ONNX Model exported to {0}".format(args.output_model))
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Export huggingface onnx model',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['Intel/bert-base-uncased-mrpc',
+ 'Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english',
+ 'Alireza1044/albert-base-v2-sst2',
+ 'philschmid/MiniLM-L6-H384-uncased-sst2',
+ 'Intel/MiniLM-L12-H384-uncased-mrpc'],
+ help='pretrained model name or path')
+ parser.add_argument(
+ '--max_len',
+ type=int,
+ default=128,
+ help='Maximum length of the sentence pairs')
+ args = parser.parse_args()
+ args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx'
+
+ model = AutoModelForSequenceClassification.from_pretrained(
+ args.model_name_or_path,
+ config=AutoConfig.from_pretrained(args.model_name_or_path))
+
+ export_onnx_model(args, model)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml
new file mode 100644
index 00000000000..fa9a22ce874
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 1.0
+
+model: # mandatory. used to specify model specific information.
+ name: text_classification
+ framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.
+
+evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
+ performance: # optional. used to benchmark performance of passing model.
+ warmup: 10
+ iteration: 100
+ configs:
+ cores_per_instance: 28
+ num_of_instance: 1
+
+quantization:
+ approach: post_training_dynamic_quant # optional. default value is post_training_static_quant.
+
+tuning:
+ accuracy_criterion:
+ relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%.
+ random_seed: 9527 # optional. random seed for deterministic tuning.
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py
new file mode 100644
index 00000000000..d5051af3816
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py
@@ -0,0 +1,422 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+
+import logging
+import argparse
+import onnx
+import onnxruntime as ort
+import transformers
+import os
+import torch
+import numpy as np
+from dataclasses import dataclass
+from typing import List, Optional, Union
+import sys
+from neural_compressor.data import DATALOADERS, DATASETS
+
+
+class ONNXRTBertDataset:
+ """Dataset used for model Bert.
+ Args: data_dir (str): The input data dir.
+ model_name_or_path (str): Path to pre-trained student model or shortcut name,
+ selected in the list:
+ max_seq_length (int, default=128): The maximum length after tokenization.
+ Sequences longer than this will be truncated,
+ sequences shorter will be padded.
+ do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing.
+ task (str, default=mrpc): The name of the task to fine-tune.
+ Choices include mrpc, qqp, qnli, rte,
+ sts-b, cola, mnli, wnli.
+ model_type (str, default='bert'): model type, support 'distilbert', 'bert',
+ 'mobilebert', 'roberta'.
+        dynamic_length (bool, default=False): Whether to use dynamic sequence length.
+        evaluate (bool, default=True): Whether to do evaluation or training.
+ transform (transform object, default=None): transform to process input data.
+ filter (Filter objects, default=None): filter out examples according
+ to specific conditions.
+ """
+ def __init__(self, data_dir, model_name_or_path, max_seq_length=128,\
+ do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\
+ evaluate=True, transform=None, filter=None):
+ task = task.lower()
+ model_type = model_type.lower()
+ assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'], 'Unsupported task type'
+ assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \
+ model type'
+ self.dynamic_length = dynamic_length
+ self.model_type = model_type
+ self.max_seq_length = max_seq_length
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path,
+ do_lower_case=do_lower_case)
+ self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \
+ max_seq_length, task, model_type, tokenizer, evaluate)
+
+ def __len__(self):
+ return len(self.dataset)
+
+    def __getitem__(self, index):
+        # Convert tensors to numpy arrays and return ((input_ids, attention_mask, token_type_ids), label).
+        batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index])
+        return batch[:3], batch[-1]
+
+def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \
+ model_type, tokenizer, evaluate):
+ from torch.utils.data import TensorDataset
+
+ processor = transformers.glue_processors[task]()
+ output_mode = transformers.glue_output_modes[task]
+ # Load data features from cache or dataset file
+ if not os.path.exists("./dataset_cached"):
+ os.makedirs("./dataset_cached")
+ cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format(
+ 'dev' if evaluate else 'train',
+ list(filter(None, model_name_or_path.split('/'))).pop(),
+ str(max_seq_length),
+ str(task)))
+ if os.path.exists(cached_features_file):
+ logger.info("Load features from cached file {}.".format(cached_features_file))
+ features = torch.load(cached_features_file)
+ else:
+ logger.info("Create features from dataset file at {}.".format(data_dir))
+ label_list = processor.get_labels()
+ examples = processor.get_dev_examples(data_dir) if evaluate else \
+ processor.get_train_examples(data_dir)
+ features = convert_examples_to_features(examples,
+ tokenizer,
+ task=task,
+ label_list=label_list,
+ max_length=max_seq_length,
+ output_mode=output_mode,
+ )
+ logger.info("Save features into cached file {}.".format(cached_features_file))
+ torch.save(features, cached_features_file)
+ # Convert to Tensors and build dataset
+ all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+ all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+ all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+ all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
+ if output_mode == "classification":
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+ elif output_mode == "regression":
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+ dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \
+ all_seq_lengths, all_labels)
+ return dataset
+
+def convert_examples_to_features(
+ examples,
+ tokenizer,
+ max_length=128,
+ task=None,
+ label_list=None,
+ output_mode="classification",
+ pad_token=0,
+ pad_token_segment_id=0,
+ mask_padding_with_zero=True,
+):
+ processor = transformers.glue_processors[task]()
+ if label_list is None:
+ label_list = processor.get_labels()
+ logger.info("Use label list {} for task {}.".format(label_list, task))
+ label_map = {label: i for i, label in enumerate(label_list)}
+ features = []
+ for (ex_index, example) in enumerate(examples):
+ inputs = tokenizer.encode_plus(
+ example.text_a,
+ example.text_b,
+ add_special_tokens=True,
+ max_length=max_length,
+ return_token_type_ids=True,
+ truncation=True,
+ )
+ input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
+ # tokens are attended to.
+ attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+ # Zero-pad up to the sequence length.
+ seq_length = len(input_ids)
+ padding_length = max_length - len(input_ids)
+
+ input_ids = input_ids + ([pad_token] * padding_length)
+ attention_mask = attention_mask + \
+ ([0 if mask_padding_with_zero else 1] * padding_length)
+ token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+ assert len(input_ids) == max_length, \
+ "Error with input_ids length {} vs {}".format(
+ len(input_ids), max_length)
+ assert len(attention_mask) == max_length, \
+ "Error with attention_mask length {} vs {}".format(
+ len(attention_mask), max_length
+ )
+ assert len(token_type_ids) == max_length, \
+ "Error with token_type_ids length {} vs {}".format(
+ len(token_type_ids), max_length
+ )
+ if output_mode == "classification":
+ label = label_map[example.label]
+ elif output_mode == "regression":
+ label = float(example.label)
+ else:
+ raise KeyError(output_mode)
+
+ feats = InputFeatures(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ label=label,
+ seq_length=seq_length,
+ )
+ features.append(feats)
+ return features
+
+@dataclass(frozen=True)
+class InputFeatures:
+ """
+ A single set of features of data.
+ Property names are the same names as the corresponding inputs to a model.
+ Args:
+ input_ids: Indices of input sequence tokens in the vocabulary.
+ attention_mask: Mask to avoid performing attention on padding token indices.
+ Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED,
+ ``0`` for MASKED (padded) tokens.
+ token_type_ids: (Optional) Segment token indices to indicate first and second
+ portions of the inputs. Only some models use them.
+ label: (Optional) Label corresponding to the input. Int for classification problems,
+ float for regression problems.
+ seq_length: (Optional) The length of input sequence before padding.
+ """
+
+ input_ids: List[int]
+ attention_mask: Optional[List[int]] = None
+ token_type_ids: Optional[List[int]] = None
+ label: Optional[Union[int, float]] = None
+ seq_length: Optional[List[int]] = None
+
+class ONNXRTGLUE:
+ """Computes GLUE score.
+
+ Args:
+ task (str, default=mrpc): The name of the task.
+ Choices include mrpc, qqp, qnli, rte,
+ sts-b, cola, mnli, wnli.
+
+ """
+ def __init__(self, task='mrpc'):
+ assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'], 'Unsupported task type'
+ self.pred_list = None
+ self.label_list = None
+ self.task = task
+ self.return_key = {
+ "cola": "mcc",
+ "mrpc": "f1",
+ "sts-b": "corr",
+ "qqp": "acc",
+ "mnli": "mnli/acc",
+ "qnli": "acc",
+ "rte": "acc",
+ "wnli": "acc",
+ "sst-2": "acc"
+ }
+
+ def update(self, preds, labels):
+ """add preds and labels to storage"""
+ if isinstance(preds, list) and len(preds) == 1:
+ preds = preds[0]
+ if isinstance(labels, list) and len(labels) == 1:
+ labels = labels[0]
+ if self.pred_list is None:
+ self.pred_list = preds
+ self.label_list = labels
+ else:
+ self.pred_list = np.append(self.pred_list, preds, axis=0)
+ self.label_list = np.append(self.label_list, labels, axis=0)
+
+ def reset(self):
+ """clear preds and labels storage"""
+ self.pred_list = None
+ self.label_list = None
+
+ def result(self):
+ """calculate metric"""
+ output_mode = transformers.glue_output_modes[self.task]
+
+ if output_mode == "classification":
+ processed_preds = np.argmax(self.pred_list, axis=1)
+ elif output_mode == "regression":
+ processed_preds = np.squeeze(self.pred_list)
+ result = transformers.glue_compute_metrics(\
+ self.task, processed_preds, self.label_list)
+ return result[self.return_key[self.task]]
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+ datefmt = '%m/%d/%Y %H:%M:%S',
+ level = logging.WARN)
+
+if __name__ == "__main__":
+ logger.info('Evaluating ONNXRuntime full precision accuracy and performance:')
+ parser = argparse.ArgumentParser(
+ description='BERT fine-tune examples for classification/regression tasks.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_path',
+ type=str,
+        help="Pre-trained model in ONNX format"
+ )
+ parser.add_argument(
+ '--benchmark',
+ action='store_true', \
+ default=False
+ )
+ parser.add_argument(
+ '--tune',
+ action='store_true', \
+ default=False,
+ help="whether quantize the model"
+ )
+ parser.add_argument(
+ '--config',
+ type=str,
+ help="config yaml path"
+ )
+ parser.add_argument(
+ '--output_model',
+ type=str,
+ default=None,
+ help="output model path"
+ )
+ parser.add_argument(
+ '--mode',
+ type=str,
+ help="benchmark mode of performance or accuracy"
+ )
+ parser.add_argument(
+ '--data_path',
+ type=str,
+ help="input data path"
+ )
+ parser.add_argument(
+ '--batch_size',
+ default=8,
+ type=int,
+ )
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['Intel/bert-base-uncased-mrpc',
+ 'Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english',
+ 'Alireza1044/albert-base-v2-sst2',
+ 'philschmid/MiniLM-L6-H384-uncased-sst2',
+ 'Intel/MiniLM-L12-H384-uncased-mrpc'],
+ help="pretrained model name or path"
+ )
+ parser.add_argument(
+ '--task',
+ type=str,
+ choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'],
+ help="GLUE task name"
+ )
+ parser.add_argument(
+ '--num_heads',
+ default=12,
+ type=int,
+ )
+ parser.add_argument(
+ '--hidden_size',
+ default=768,
+ type=int,
+ )
+
+ args = parser.parse_args()
+
+ dataset = ONNXRTBertDataset(data_dir=args.data_path,
+ model_name_or_path=args.model_name_or_path,
+ task=args.task)
+ dataloader = DATALOADERS['onnxrt_integerops'](dataset, batch_size=args.batch_size)
+ metric = ONNXRTGLUE(args.task)
+
+ def eval_func(model, *args):
+ metric.reset()
+ import tqdm
+ session = ort.InferenceSession(model.SerializeToString(), None)
+ ort_inputs = {}
+ len_inputs = len(session.get_inputs())
+ inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
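+        # The dataset above yields ((input_ids, attention_mask, token_type_ids), label); only the first
+        # len_inputs tensors are fed to the session, matched positionally to its declared input names.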
+ for idx, (inputs, labels) in enumerate(dataloader):
+ if not isinstance(labels, list):
+ labels = [labels]
+ inputs = inputs[:len_inputs]
+ for i in range(len_inputs):
+ ort_inputs.update({inputs_names[i]: inputs[i]})
+ predictions = session.run(None, ort_inputs)
+ metric.update(predictions[0], labels)
+ return metric.result()
+
+ if args.benchmark:
+ from neural_compressor.experimental import Benchmark, common
+ model = onnx.load(args.model_path)
+ if args.mode == 'performance':
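+            # Performance mode only measures speed, so a dummy int64 dataset shaped (1, 128) per graph
+            # input is generated instead of loading real GLUE data.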
+ session = ort.InferenceSession(args.model_path, None)
+ input_tensors = session.get_inputs()
+ shape = []
+ for i in range(len(input_tensors)):
+ shape.append((1, 128))
+ datasets = DATASETS('onnxrt_integerops')
+ dummy_dataset = datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True)
+ evaluator = Benchmark(args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = common.DataLoader(dummy_dataset)
+ evaluator(args.mode)
+ elif args.mode == 'accuracy':
+ evaluator = Benchmark(args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = dataloader
+ evaluator.metric = metric
+ evaluator.b_func = eval_func
+ evaluator(args.mode)
+
+ if args.tune:
+ from onnxruntime.transformers import optimizer
+ from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
+ opt_options = BertOptimizationOptions('bert')
+ opt_options.enable_embed_layer_norm = False
+
+ model_optimizer = optimizer.optimize_model(
+ args.model_path,
+ 'bert',
+ num_heads=args.num_heads,
+ hidden_size=args.hidden_size,
+ optimization_options=opt_options)
+ model = model_optimizer.model
+
+ from neural_compressor import options
+ from neural_compressor.experimental import Quantization, common
+ options.onnxrt.graph_optimization.level = 'ENABLE_BASIC'
+ quantize = Quantization(args.config)
+ quantize.model = model
+ quantize.eval_func = eval_func
+ q_model = quantize()
+ q_model.save(args.output_model)
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt
new file mode 100644
index 00000000000..a5e81be3aad
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt
@@ -0,0 +1,7 @@
+torch
+transformers==4.16.0
+onnx
+onnxruntime
+coloredlogs
+sympy
+onnxruntime-extensions; python_version < '3.10'
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh
new file mode 100644
index 00000000000..c72b109a530
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+set -x
+
+function main {
+
+ init_params "$@"
+ run_benchmark
+
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --mode=*)
+ mode=$(echo $var |cut -f2 -d=)
+ ;;
+ --data_path=*)
+ data_path=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+ if [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="Intel/bert-base-uncased-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "roberta-base" ]]; then
+ model_name_or_path="Intel/roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then
+ model_name_or_path="Intel/xlm-roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "camembert-base" ]]; then
+ model_name_or_path="Intel/camembert-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "distilbert-base" ]]; then
+ model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "albert-base" ]]; then
+ model_name_or_path="Alireza1044/albert-base-v2-sst2"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L6" ]]; then
+ model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L12" ]]; then
+ model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc"
+ TASK_NAME='mrpc'
+ fi
+
+ python main.py \
+ --model_name_or_path ${model_name_or_path} \
+ --model_path ${input_model} \
+ --config ${config} \
+ --data_path ${data_path} \
+ --task ${TASK_NAME} \
+ --mode=${mode} \
+ --benchmark
+
+}
+
+main "$@"
+
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh
new file mode 100644
index 00000000000..7d141154355
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ run_tuning
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --output_model=*)
+ output_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --data_path=*)
+ data_path=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_tuning
+function run_tuning {
+
+ if [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="Intel/bert-base-uncased-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "roberta-base" ]]; then
+ model_name_or_path="Intel/roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then
+ model_name_or_path="Intel/xlm-roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "camembert-base" ]]; then
+ model_name_or_path="Intel/camembert-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "distilbert-base" ]]; then
+ model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "albert-base" ]]; then
+ model_name_or_path="Alireza1044/albert-base-v2-sst2"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L6" ]]; then
+ model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=384
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L12" ]]; then
+ model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=384
+ fi
+
+ python main.py \
+ --model_name_or_path ${model_name_or_path} \
+ --model_path ${input_model} \
+ --output_model ${output_model} \
+ --config ${config} \
+ --data_path ${data_path} \
+ --task ${TASK_NAME} \
+ --num_heads ${num_heads} \
+ --hidden_size ${hidden_size} \
+ --tune
+}
+
+main "$@"
+
+
+
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/export.py b/examples/onnxrt/nlp/roberta/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/export.py
rename to examples/onnxrt/nlp/roberta/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/main.py b/examples/onnxrt/nlp/roberta/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/main.py
rename to examples/onnxrt/nlp/roberta/quantization/ptq/main.py
diff --git a/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh
new file mode 100644
index 00000000000..8e434a5c521
--- /dev/null
+++ b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ download_data
+
+}
+
+# init params
+function init_params {
+
+ for var in "$@"
+ do
+ case $var in
+ --data_dir=*)
+ data_dir=$(echo $var |cut -f2 -d=)
+ ;;
+ --task_name=*)
+ task_name=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# download data
+function download_data {
+ wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py
+ python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name}
+}
+
+main "$@"
+
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md b/examples/onnxrt/nlp/roberta/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/roberta/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml
rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml
rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh