diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
index 2480fe54c7b..e0b08c699ad 100644
--- a/examples/.config/model_params_onnxrt.json
+++ b/examples/.config/model_params_onnxrt.json
@@ -46,7 +46,7 @@
"new_benchmark": true
},
"bert_base_MRPC_static": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_static.yaml",
@@ -55,7 +55,7 @@
"new_benchmark": true
},
"bert_base_MRPC_dynamic": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_dynamic.yaml",
@@ -64,7 +64,7 @@
"new_benchmark": true
},
"distilbert_base_MRPC": {
- "model_src_dir": "language_translation/distilbert/quantization/ptq",
+ "model_src_dir": "nlp/distilbert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx",
"yaml": "distilbert.yaml",
@@ -73,7 +73,7 @@
"new_benchmark": true
},
"mobilebert_MRPC": {
- "model_src_dir": "language_translation/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx",
"yaml": "mobilebert.yaml",
@@ -82,7 +82,7 @@
"new_benchmark": true
},
"roberta_base_MRPC": {
- "model_src_dir": "language_translation/roberta/quantization/ptq",
+ "model_src_dir": "nlp/roberta/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx",
"yaml": "roberta.yaml",
@@ -118,7 +118,7 @@
"new_benchmark": true
},
"bert_squad_model_zoo": {
- "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx",
"yaml": "bert.yaml",
@@ -127,7 +127,7 @@
"new_benchmark": true
},
"mobilebert_squad_mlperf": {
- "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf.onnx",
"yaml": "mobilebert.yaml",
@@ -136,7 +136,7 @@
"new_benchmark": true
},
"gpt2_lm_head_wikitext_model_zoo": {
- "model_src_dir": "language_translation/onnx_model_zoo/gpt2/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/gpt2/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/wikitext/wikitext-2-raw/",
"input_model": "/tf_dataset2/models/onnx/gpt2/gpt2_lm_head_wikitext_model_zoo.onnx",
"yaml": "gpt2.yaml",
@@ -352,7 +352,7 @@
"new_benchmark": true
},
"bert_base_MRPC_static_qdq": {
- "model_src_dir": "language_translation/bert/quantization/ptq",
+ "model_src_dir": "nlp/bert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx",
"yaml": "bert_qdq.yaml",
@@ -361,7 +361,7 @@
"new_benchmark": true
},
"distilbert_base_MRPC_qdq": {
- "model_src_dir": "language_translation/distilbert/quantization/ptq",
+ "model_src_dir": "nlp/distilbert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx",
"yaml": "distilbert_qdq.yaml",
@@ -370,7 +370,7 @@
"new_benchmark": true
},
"mobilebert_MRPC_qdq": {
- "model_src_dir": "language_translation/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx",
"yaml": "mobilebert_qdq.yaml",
@@ -379,7 +379,7 @@
"new_benchmark": true
},
"roberta_base_MRPC_qdq": {
- "model_src_dir": "language_translation/roberta/quantization/ptq",
+ "model_src_dir": "nlp/roberta/quantization/ptq",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx",
"yaml": "roberta_qdq.yaml",
@@ -415,7 +415,7 @@
"new_benchmark": true
},
"bert_squad_model_zoo_qdq": {
- "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx",
"yaml": "bert_qdq.yaml",
@@ -424,7 +424,7 @@
"new_benchmark": true
},
"mobilebert_squad_mlperf_qdq": {
- "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad",
"input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf-13.onnx",
"yaml": "mobilebert_qdq.yaml",
@@ -631,13 +631,103 @@
"new_benchmark": true
},
"BiDAF": {
- "model_src_dir": "language_translation/onnx_model_zoo/BiDAF/quantization/ptq",
+ "model_src_dir": "nlp/onnx_model_zoo/BiDAF/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/squad/dev-v1.1.json",
"input_model": "/tf_dataset2/models/onnx/BiDAF/bidaf-11.onnx",
"yaml": "bidaf.yaml",
"strategy": "basic",
"batch_size": 1,
"new_benchmark": true
+ },
+ "hf_bert-base-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_bert-base-uncased_dynamic/bert-base-uncased-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_roberta-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_roberta-base_dynamic/roberta-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_xlm-roberta-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_xlm-roberta-base_dynamic/xlm-roberta-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_camembert-base_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_camembert-base_dynamic/camembert-base-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_MiniLM-L12-H384-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
+ "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L12-H384-uncased_dynamic/MiniLM-L12-H384-uncased-mrpc.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_distilbert-base-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_distilbert-base-uncased_dynamic/distilbert-base-uncased-finetuned-sst-2-english.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_albert-base-v2_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_albert-base-v2_dynamic/albert-base-v2-sst2.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_MiniLM-L6-H384-uncased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq",
+ "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/",
+ "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L6-H384-uncased_dynamic/MiniLM-L6-H384-uncased-sst2.onnx",
+ "yaml": "glue_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_spanbert_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq",
+ "dataset_location": "/tf_dataset2/datasets/squad",
+ "input_model": "/tf_dataset2/models/onnx/hf_spanbert_dynamic/spanbert-finetuned-squadv1.onnx",
+ "yaml": "qa_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
+ },
+ "hf_bert-base-multilingual-cased_dynamic": {
+ "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq",
+ "dataset_location": "/tf_dataset2/datasets/squad",
+ "input_model": "/tf_dataset2/models/onnx/hf_bert-base-multilingual-cased_dynamic/bert-base-multilingual-cased-finetuned-squad.onnx",
+ "yaml": "qa_dynamic.yaml",
+ "strategy": "basic",
+ "batch_size": 1,
+ "new_benchmark": true
}
}
}
diff --git a/examples/README.md b/examples/README.md
index 7342bbe1e8a..0698d8e7d41 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -855,55 +855,115 @@ Intel® Neural Compressor validated examples with multiple compression technique
BERT base MRPC |
Natural Language Processing |
Post-Training Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BERT base MRPC |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
DistilBERT base MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
Mobile bert MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
Roberta base MRPC |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BERT SQuAD |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
GPT2 lm head WikiText |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
MobileBERT SQuAD MLPerf |
Natural Language Processing |
Post-Training Dynamic / Static Quantization |
- integerops / qdq |
+ integerops / qdq |
BiDAF |
Natural Language Processing |
Post-Training Dynamic Quantization |
- integerops |
+ integerops |
+
+
+ BERT base uncased MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Roberta base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ XLM Roberta base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Camembert base MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ MiniLM L12 H384 uncased MRPC (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Distilbert base uncased SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Albert base v2 SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ MiniLM L6 H384 uncased SST-2 (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Spanbert SQuAD (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
+
+
+ Bert base multilingual cased SQuAD (HuggingFace) |
+ Natural Language Processing |
+ Post-Training Dynamic Quantization |
+ integerops |
SSD MobileNet V1 |
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/README.md b/examples/onnxrt/nlp/bert/quantization/ptq/README.md
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/README.md
rename to examples/onnxrt/nlp/bert/quantization/ptq/README.md
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml
rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/export.py b/examples/onnxrt/nlp/bert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/bert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/main.py b/examples/onnxrt/nlp/bert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/bert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md b/examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md
new file mode 100644
index 00000000000..55538ff591c
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md
@@ -0,0 +1,43 @@
+# Evaluate performance of ONNX Runtime (HuggingFace Question Answering)
+> ONNX Runtime quantization is under active development. Please use 1.6.0 or a newer version to get more quantization support.
+
+This example loads a question answering model and confirms its accuracy and speed on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) task.
+
+### Environment
+Please use the latest onnx and onnxruntime versions.
+
+### Prepare dataset
+Download the SQuAD dataset from the [SQuAD dataset link](https://rajpurkar.github.io/SQuAD-explorer/).
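+
+For reference, the v1.1 dev split used for evaluation can be fetched directly (a minimal sketch, assuming the standard download URL on the SQuAD explorer site):
+
+```bash
+# download the SQuAD v1.1 dev set
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+```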
+
+### Prepare model
+
+Supported model identifiers from [huggingface.co](https://huggingface.co/):
+
+| Model Identifier |
+|:-----------------------------------------------:|
+| mrm8488/spanbert-finetuned-squadv1 |
+| salti/bert-base-multilingual-cased-finetuned-squad |
+
+
+```bash
+python export.py --model_name_or_path=mrm8488/spanbert-finetuned-squadv1  # or another supported model identifier
+```
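+
+`export.py` writes the exported model as `<model-name>.onnx` (e.g. `spanbert-finetuned-squadv1.onnx`) in the current working directory; pass that path as `--input_model` in the commands below.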
+
+### Quantization
+
+Dynamic quantization (`--input_model` is the exported *.onnx model):
+
+```bash
+bash run_tuning.sh --input_model=/path/to/model \
+ --output_model=/path/to/model_tune \
+ --config=qa_dynamic.yaml
+```
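+
+`qa_dynamic.yaml` (added alongside this example) selects the `onnxrt_integerops` framework and the `post_training_dynamic_quant` approach, so weights are quantized offline and activations are quantized dynamically at runtime.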
+
+### Benchmark
+
+```bash
+bash run_benchmark.sh --input_model=/path/to/model \
+                      --config=qa_dynamic.yaml \
+                      --mode=performance  # or accuracy
+```
+
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py
new file mode 100644
index 00000000000..08824f90405
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py
@@ -0,0 +1,50 @@
+import argparse
+
+import torch
+from transformers import AutoConfig, AutoModelForQuestionAnswering
+
+def export_onnx_model(args, model):
+ with torch.no_grad():
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['token_type_ids'],
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=11, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'token_type_ids',
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'token_type_ids' : symbolic_names,
+ 'attention_mask' : symbolic_names})
+ print("ONNX Model exported to {0}".format(args.output_model))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Export huggingface onnx model',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['mrm8488/spanbert-finetuned-squadv1',
+ 'salti/bert-base-multilingual-cased-finetuned-squad'],
+ help='pretrained model name or path ')
+ parser.add_argument(
+ '--max_len',
+ type=int,
+ default=512,
+ help='Maximum length of the sentence pairs')
+ args = parser.parse_args()
+ args.output_model = args.model_name_or_path.split('/')[1] + '.onnx'
+
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ args.model_name_or_path,
+ config=AutoConfig.from_pretrained(args.model_name_or_path))
+
+ export_onnx_model(args, model)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py
new file mode 100644
index 00000000000..1866be2d602
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py
@@ -0,0 +1,614 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+from datasets import load_dataset, load_metric
+
+from torch.utils.data import Dataset, DataLoader
+
+import onnx
+import onnxruntime as ort
+import numpy as np
+import transformers
+from trainer_qa import QuestionAnsweringTrainer
+from transformers import (
+ AutoConfig,
+ AutoModelForQuestionAnswering,
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ EvalPrediction,
+ HfArgumentParser,
+ PreTrainedTokenizerFast,
+ TrainingArguments,
+ default_data_collator,
+ set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+from utils_qa import postprocess_qa_predictions
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+# check_min_version("4.22.0.dev0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """
+
+ model_name_or_path: str = field(
+ default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+ )
+ config_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+ )
+ tokenizer_name: Optional[str] = field(
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+ )
+ cache_dir: Optional[str] = field(
+ default=None,
+ metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ use_auth_token: bool = field(
+ default=False,
+ metadata={
+ "help": (
+ "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+ "with private models)."
+ )
+ },
+ )
+ model_path: str = field(
+ default=None,
+ metadata={"help": ("onnx model path")},
+ )
+ tune: bool = field(
+ default=False,
+ metadata={"help": ("INC tune")},
+ )
+ benchmark: bool = field(
+ default=False,
+ metadata={"help": ("INC benchmark")},
+ )
+ mode: str = field(
+ default='performance',
+ metadata={"help": ("INC benchmark mode")},
+ )
+ config: str = field(
+ default='bert-base-multilingual-cased-static.yaml',
+ metadata={"help": ("INC config")},
+ )
+ save_path: str = field(
+ default=None,
+ metadata={"help": ("onnx int8 model path")},
+ )
+ num_heads: int = field(
+ default=12,
+ metadata={"help": ("onnx model optimize num_heads")},
+ )
+ hidden_size: int = field(
+ default=768,
+ metadata={"help": ("onnx model optimize hidden_size")},
+ )
+
+
+@dataclass
+class DataTrainingArguments:
+ """
+ Arguments pertaining to what data we are going to input our model for training and eval.
+ """
+
+ dataset_name: Optional[str] = field(
+ default='squad', metadata={"help": "The name of the dataset to use (via the datasets library)."}
+ )
+ dataset_config_name: Optional[str] = field(
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+ )
+ train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+ validation_file: Optional[str] = field(
+ default=None,
+ metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+ )
+ test_file: Optional[str] = field(
+ default=None,
+ metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
+ )
+ overwrite_cache: bool = field(
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+ )
+ preprocessing_num_workers: Optional[int] = field(
+ default=None,
+ metadata={"help": "The number of processes to use for the preprocessing."},
+ )
+ max_seq_length: int = field(
+ default=512,
+ metadata={
+ "help": (
+ "The maximum total input sequence length after tokenization. Sequences longer "
+ "than this will be truncated, sequences shorter will be padded."
+ )
+ },
+ )
+ pad_to_max_length: bool = field(
+ default=True,
+ metadata={
+ "help": (
+ "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when"
+ " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)."
+ )
+ },
+ )
+ max_train_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_eval_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+ "value if set."
+ )
+ },
+ )
+ max_predict_samples: Optional[int] = field(
+ default=None,
+ metadata={
+ "help": (
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+ "value if set."
+ )
+ },
+ )
+ version_2_with_negative: bool = field(
+ default=False, metadata={"help": "If true, some of the examples do not have an answer."}
+ )
+ null_score_diff_threshold: float = field(
+ default=0.0,
+ metadata={
+ "help": (
+ "The threshold used to select the null answer: if the best answer has a score that is less than "
+ "the score of the null answer minus this threshold, the null answer is selected for this example. "
+ "Only useful when `version_2_with_negative=True`."
+ )
+ },
+ )
+ doc_stride: int = field(
+ default=256,
+ metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+ )
+ n_best_size: int = field(
+ default=20,
+ metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
+ )
+ max_answer_length: int = field(
+ default=30,
+ metadata={
+ "help": (
+ "The maximum length of an answer that can be generated. This is needed because the start "
+ "and end predictions are not conditioned on one another."
+ )
+ },
+ )
+
+ def __post_init__(self):
+ if (
+ self.dataset_name is None
+ and self.train_file is None
+ and self.validation_file is None
+ and self.test_file is None
+ ):
+ raise ValueError("Need either a dataset name or a training/validation file/test_file.")
+ else:
+ if self.train_file is not None:
+ extension = self.train_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+ if self.validation_file is not None:
+ extension = self.validation_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
+
+class SquadDataset(Dataset):
+ def __init__(self, dataloader, bs=1):
+ self.dataloader = dataloader
+ self.bs = bs
+ self.input_ids = []
+ self.token_type_ids = []
+ self.attention_mask = []
+ for idx, inputs in enumerate(self.dataloader):
+ self.input_ids.append(np.array(inputs['input_ids'], dtype=np.int64))
+ self.token_type_ids.append(np.array(inputs['token_type_ids'], dtype=np.int64))
+ self.attention_mask.append(np.array(inputs['attention_mask'], dtype=np.int64))
+
+ def __getitem__(self, index):
+ return (self.input_ids[index:index + self.bs][0][0], self.token_type_ids[index:index + self.bs][0][0], self.attention_mask[index:index + self.bs][0][0]), 0
+ # return (self.input_ids[index:index + self.bs][0], self.attention_mask[index:index + self.bs][0], self.token_type_ids[index:index + self.bs][0]), 0
+
+ def __len__(self):
+ assert len(self.input_ids) == len(self.attention_mask)
+ assert len(self.input_ids) == len(self.token_type_ids)
+ return len(self.input_ids)
+
+
+def main():
+ # See all possible arguments in src/transformers/training_args.py
+ # or by passing the --help flag to this script.
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ training_args.do_eval = True
+ training_args.per_device_eval_batch_size = 1
+
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
+ send_example_telemetry("run_qa", model_args, data_args)
+
+ # Setup logging
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ log_level = training_args.get_process_log_level()
+ logger.setLevel(log_level)
+ datasets.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.set_verbosity(log_level)
+ transformers.utils.logging.enable_default_handler()
+ transformers.utils.logging.enable_explicit_format()
+
+ # Log on each process the small summary:
+ logger.warning(
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+ )
+ logger.info(f"Training/evaluation parameters {training_args}")
+
+ # Detecting last checkpoint.
+ last_checkpoint = None
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+ raise ValueError(
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+ "Use --overwrite_output_dir to overcome."
+ )
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+ logger.info(
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+ )
+
+ # Set seed before initializing model.
+ set_seed(training_args.seed)
+
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+ # (the dataset will be downloaded automatically from the datasets Hub).
+ #
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+ # 'text' is found. You can easily tweak this behavior (see below).
+ #
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+ # download the dataset.
+ if data_args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ raw_datasets = load_dataset(
+ data_args.dataset_name,
+ data_args.dataset_config_name,
+ cache_dir=model_args.cache_dir,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ print(type(raw_datasets))
+ else:
+ data_files = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ extension = data_args.train_file.split(".")[-1]
+
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = data_args.validation_file.split(".")[-1]
+ if data_args.test_file is not None:
+ data_files["test"] = data_args.test_file
+ extension = data_args.test_file.split(".")[-1]
+ raw_datasets = load_dataset(
+ extension,
+ data_files=data_files,
+ field="data",
+ cache_dir=model_args.cache_dir,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+ config = AutoConfig.from_pretrained(
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+ cache_dir=model_args.cache_dir,
+ use_fast=True,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ model = AutoModelForQuestionAnswering.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+
+ # Tokenizer check: this script requires a fast tokenizer.
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
+ raise ValueError(
+ "This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
+ " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
+ " this requirement"
+ )
+
+ # Preprocessing the datasets.
+ # Preprocessing is slightly different for training and evaluation.
+ if training_args.do_train:
+ column_names = raw_datasets["train"].column_names
+ elif training_args.do_eval:
+ column_names = raw_datasets["validation"].column_names
+ else:
+ column_names = raw_datasets["test"].column_names
+ question_column_name = "question" if "question" in column_names else column_names[0]
+ context_column_name = "context" if "context" in column_names else column_names[1]
+ answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+ # Padding side determines if we do (question|context) or (context|question).
+ pad_on_right = tokenizer.padding_side == "right"
+
+ if data_args.max_seq_length > tokenizer.model_max_length:
+ logger.warning(
+ f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+ f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+ )
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+ # Validation preprocessing
+ def prepare_validation_features(examples):
+ # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+ # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+ # left whitespace
+ examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+ # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+ # in one example possibly giving several features when a context is long, each of those features having a
+ # context that overlaps a bit with the context of the previous feature.
+ tokenized_examples = tokenizer(
+ examples[question_column_name if pad_on_right else context_column_name],
+ examples[context_column_name if pad_on_right else question_column_name],
+ truncation="only_second" if pad_on_right else "only_first",
+ max_length=max_seq_length,
+ stride=data_args.doc_stride,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ padding="max_length" if data_args.pad_to_max_length else False,
+ )
+
+ # Since one example might give us several features if it has a long context, we need a map from a feature to
+ # its corresponding example. This key gives us just that.
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+ # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+ # corresponding example_id and we will store the offset mappings.
+ tokenized_examples["example_id"] = []
+
+ for i in range(len(tokenized_examples["input_ids"])):
+ # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+ sequence_ids = tokenized_examples.sequence_ids(i)
+ context_index = 1 if pad_on_right else 0
+
+ # One example can give several spans, this is the index of the example containing this span of text.
+ sample_index = sample_mapping[i]
+ tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+ # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+ # position is part of the context or not.
+ tokenized_examples["offset_mapping"][i] = [
+ (o if sequence_ids[k] == context_index else None)
+ for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+ ]
+
+ return tokenized_examples
+
+ if training_args.do_eval:
+ if "validation" not in raw_datasets:
+ raise ValueError("--do_eval requires a validation dataset")
+ eval_examples = raw_datasets["validation"]
+ if data_args.max_eval_samples is not None:
+ # We will select sample from whole data
+ max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+ eval_examples = eval_examples.select(range(max_eval_samples))
+ # Validation Feature Creation
+ with training_args.main_process_first(desc="validation dataset map pre-processing"):
+ eval_dataset = eval_examples.map(
+ prepare_validation_features,
+ batched=True,
+ num_proc=data_args.preprocessing_num_workers,
+ remove_columns=column_names,
+ load_from_cache_file=not data_args.overwrite_cache,
+ desc="Running tokenizer on validation dataset",
+ )
+ if data_args.max_eval_samples is not None:
+ # During Feature creation dataset samples might increase, we will select required samples again
+ max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+ eval_dataset = eval_dataset.select(range(max_eval_samples))
+
+
+ # Data collator
+ # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
+ # collator.
+ data_collator = (
+ default_data_collator
+ if data_args.pad_to_max_length
+ else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
+ )
+
+ # Post-processing:
+ def post_processing_function(examples, features, predictions, stage="eval"):
+ # Post-processing: we match the start logits and end logits to answers in the original context.
+ predictions = postprocess_qa_predictions(
+ examples=examples,
+ features=features,
+ predictions=predictions,
+ version_2_with_negative=data_args.version_2_with_negative,
+ n_best_size=data_args.n_best_size,
+ max_answer_length=data_args.max_answer_length,
+ null_score_diff_threshold=data_args.null_score_diff_threshold,
+ output_dir=training_args.output_dir,
+ log_level=log_level,
+ prefix=stage,
+ )
+ # Format the result to the format the metric expects.
+ if data_args.version_2_with_negative:
+ formatted_predictions = [
+ {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+ ]
+ else:
+ formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+ references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+ return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+ metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+ def compute_metrics(p: EvalPrediction):
+ return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+ # Initialize our Trainer
+ trainer = QuestionAnsweringTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=None,
+ eval_dataset=eval_dataset if training_args.do_eval else None,
+ eval_examples=eval_examples if training_args.do_eval else None,
+ tokenizer=tokenizer,
+ data_collator=data_collator,
+ post_process_function=post_processing_function,
+ compute_metrics=compute_metrics,
+ )
+
+ eval_dataloader = trainer.get_dataloader(eval_dataset)
+
+ def eval_func(model, *args):
+ logger.info("*** Evaluate ***")
+ metrics = trainer.evaluate(onnx_model=model)
+ print('eval_func', metrics)
+
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+ trainer.log_metrics("eval", metrics)
+ trainer.save_metrics("eval", metrics)
+ return metrics['eval_f1']
+
+ if model_args.tune:
+ from onnxruntime.transformers import optimizer
+ from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
+ opt_options = BertOptimizationOptions('bert')
+ opt_options.enable_embed_layer_norm = False
+
+ model_optimizer = optimizer.optimize_model(
+ model_args.model_path,
+ 'bert',
+ num_heads=model_args.num_heads,
+ hidden_size=model_args.hidden_size,
+ optimization_options=opt_options)
+ model = model_optimizer.model
+
+ b_dataloader = SquadDataset(eval_dataloader)
+ b_dataloader = DataLoader(b_dataloader)
+ from neural_compressor.experimental import Quantization, common
+ quantize = Quantization(model_args.config)
+ quantize.model = common.Model(model)
+ quantize.calib_dataloader = b_dataloader
+ quantize.eval_func = eval_func
+ q_model = quantize()
+ q_model.save(model_args.save_path)
+
+ if model_args.benchmark:
+ from neural_compressor.experimental import Benchmark, common
+ model = onnx.load(model_args.model_path)
+ if model_args.mode == 'performance':
+ from neural_compressor.data import DATALOADERS, DATASETS
+ session = ort.InferenceSession(model_args.model_path, None)
+ input_tensors = session.get_inputs()
+ shape = []
+ for i in range(len(input_tensors)):
+ shape.append((1, 512))
+ onnx_datasets = DATASETS('onnxrt_integerops')
+ dummy_dataset = onnx_datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True)
+ evaluator = Benchmark(model_args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = common.DataLoader(dummy_dataset)
+ evaluator(model_args.mode)
+ elif model_args.mode == 'accuracy':
+ b_dataloader = SquadDataset(eval_dataloader)
+ b_dataloader = DataLoader(b_dataloader)
+ evaluator = Benchmark(model_args.config)
+ evaluator.b_dataloader = b_dataloader
+ evaluator.b_func = eval_func
+ evaluator.model = common.Model(model)
+ evaluator(model_args.mode)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml
new file mode 100644
index 00000000000..e76dfbc315c
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 1.0
+
+model: # mandatory. used to specify model specific information.
+ name: question_answering
+ framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.
+
+evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
+ performance: # optional. used to benchmark performance of passing model.
+ warmup: 0
+ iteration: 100
+ configs:
+ cores_per_instance: 28
+ num_of_instance: 1
+
+quantization:
+ approach: post_training_dynamic_quant # optional. default value is post_training_static_quant.
+
+tuning:
+ accuracy_criterion:
+ relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%.
+ random_seed: 9527 # optional. random seed for deterministic tuning.
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt
new file mode 100644
index 00000000000..30412bea132
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt
@@ -0,0 +1,7 @@
+datasets
+onnx
+onnxruntime
+onnxruntime-extensions; python_version < '3.10'
+transformers==4.21.0
+torch
+tensorboard
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh
new file mode 100644
index 00000000000..2eef1e0e4b3
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -x
+
+function main {
+
+ init_params "$@"
+ run_benchmark
+
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --mode=*)
+ mode=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+ if [[ "${input_model}" =~ "spanbert" ]]; then
+ model_name_or_path="mrm8488/spanbert-finetuned-squadv1"
+ elif [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad"
+ fi
+
+ python main.py \
+ --model_path ${input_model} \
+ --config ${config} \
+ --mode=${mode} \
+ --model_name_or_path=${model_name_or_path} \
+ --output_dir './output' \
+ --benchmark
+
+}
+
+main "$@"
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh
new file mode 100644
index 00000000000..9e0eb872250
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ run_tuning
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --output_model=*)
+ output_model=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_tuning
+function run_tuning {
+
+ if [[ "${input_model}" =~ "spanbert" ]]; then
+ model_name_or_path="mrm8488/spanbert-finetuned-squadv1"
+ num_heads=12
+ hidden_size=768
+ elif [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad"
+ num_heads=12
+ hidden_size=768
+ fi
+
+ python main.py \
+ --model_path ${input_model} \
+ --save_path ${output_model} \
+ --config ${config} \
+ --output_dir './output' \
+ --model_name_or_path=${model_name_or_path} \
+ --num_heads ${num_heads} \
+ --hidden_size ${hidden_size} \
+ --tune
+}
+
+main "$@"
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py
new file mode 100644
index 00000000000..2da65c8a9f0
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py
@@ -0,0 +1,489 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A subclass of `Trainer` specific to Question-Answering tasks
+"""
+
+from transformers import Trainer, is_torch_tpu_available
+from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
+from transformers.trainer_utils import (
+ PREFIX_CHECKPOINT_DIR,
+ BestRun,
+ EvalLoopOutput,
+ EvalPrediction,
+ FSDPOption,
+ HPSearchBackend,
+ HubStrategy,
+ IntervalStrategy,
+ PredictionOutput,
+ RemoveColumnsCollator,
+ ShardedDDPOption,
+ TrainerMemoryTracker,
+ TrainOutput,
+ default_compute_objective,
+ default_hp_space,
+ denumpify_detensorize,
+ enable_full_determinism,
+ find_executable_batch_size,
+ get_last_checkpoint,
+ has_length,
+ number_of_arguments,
+ seed_worker,
+ set_seed,
+ speed_metrics,
+)
+from transformers.utils import (
+ is_sagemaker_mp_enabled,
+ is_torch_tpu_available,
+ logging,
+)
+from transformers.trainer_pt_utils import (
+ IterableDatasetShard,
+ LabelSmoother,
+ LengthGroupedSampler,
+ SequentialDistributedSampler,
+ ShardSampler,
+ distributed_broadcast_scalars,
+ distributed_concat,
+ find_batch_size,
+ get_module_class_from_name,
+ get_parameter_names,
+ nested_concat,
+ nested_detach,
+ nested_numpify,
+ nested_truncate,
+ nested_xla_mesh_reduce,
+ reissue_pt_warnings
+)
+import onnxruntime
+import onnx
+from torch.utils.data import DataLoader
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, NamedTuple
+import numpy as np
+
+import torch
+from torch import nn
+from packaging import version  # needed for the SageMaker model parallel version check below
+
+logger = logging.get_logger(__name__)
+
+if is_sagemaker_mp_enabled():
+ import smdistributed.modelparallel.torch as smp
+ from smdistributed.modelparallel import __version__ as SMP_VERSION
+
+ IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
+
+ from transformers.trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat
+
+
+if is_torch_tpu_available(check_device=False):
+ import torch_xla.core.xla_model as xm
+ import torch_xla.debug.metrics as met
+ import torch_xla.distributed.parallel_loader as pl
+
+def has_length(dataset):
+ """
+ Checks if the dataset implements __len__() and it doesn't raise an error
+ """
+ try:
+ return len(dataset) is not None
+ except TypeError:
+ # TypeError: len() of unsized object
+ return False
+
+class EvalLoopOutput(NamedTuple):
+ predictions: Union[np.ndarray, Tuple[np.ndarray]]
+ label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
+ metrics: Optional[Dict[str, float]]
+ num_samples: Optional[int]
+
+class QuestionAnsweringTrainer(Trainer):
+ def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.eval_examples = eval_examples
+ self.post_process_function = post_process_function
+
+ def get_dataloader(self, eval_dataset):
+ return self.get_eval_dataloader(eval_dataset)
+
+ def evaluate(self, onnx_model, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
+ eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
+ eval_dataloader = self.get_eval_dataloader(eval_dataset)
+ eval_examples = self.eval_examples if eval_examples is None else eval_examples
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = self.evaluation_loop
+ print('eval_dataloader', type(eval_dataloader))
+ print('onnx_model', type(onnx_model))
+ try:
+ output = eval_loop(
+ dataloader=eval_dataloader,
+ description="Evaluation",
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ onnx_model=onnx_model,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is not None and self.compute_metrics is not None:
+ eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
+ metrics = self.compute_metrics(eval_preds)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ self.log(metrics)
+ else:
+ metrics = {}
+
+ if self.args.tpu_metrics_debug or self.args.debug:
+ # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+ xm.master_print(met.metrics_report())
+
+ self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
+ return metrics
+
+ def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
+ predict_dataloader = self.get_test_dataloader(predict_dataset)
+
+ # Temporarily disable metric computation, we will do it in the loop here.
+ compute_metrics = self.compute_metrics
+ self.compute_metrics = None
+ eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
+ try:
+ output = eval_loop(
+ predict_dataloader,
+ description="Prediction",
+ # No point gathering the predictions if there are no metrics, otherwise we defer to
+ # self.args.prediction_loss_only
+ prediction_loss_only=True if compute_metrics is None else None,
+ ignore_keys=ignore_keys,
+ )
+ finally:
+ self.compute_metrics = compute_metrics
+
+ if self.post_process_function is None or self.compute_metrics is None:
+ return output
+
+ predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
+ metrics = self.compute_metrics(predictions)
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)
+
+ def evaluation_loop(
+ self,
+ dataloader: DataLoader,
+ description: str,
+ prediction_loss_only: Optional[bool] = None,
+ ignore_keys: Optional[List[str]] = None,
+ metric_key_prefix: str = "eval",
+ onnx_model: onnx.onnx_ml_pb2.ModelProto = None,
+ ) -> EvalLoopOutput:
+ """
+ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
+ Works both with or without labels.
+ """
+ args = self.args
+
+ prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only
+
+ # if eval is called w/o train init deepspeed here
+ if args.deepspeed and not self.deepspeed:
+
+ # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
+ # from the checkpoint eventually
+ deepspeed_engine, _, _ = deepspeed_init(
+ self, num_training_steps=0, resume_from_checkpoint=None, inference=True
+ )
+ self.model = deepspeed_engine.module
+ self.model_wrapped = deepspeed_engine
+ self.deepspeed = deepspeed_engine
+
+ model = self._wrap_model(self.model, training=False, dataloader=dataloader)
+
+ # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
+ # while ``train`` is running, cast it to the right dtype first and then put on device
+ if not self.is_in_train:
+ if args.fp16_full_eval:
+ model = model.to(dtype=torch.float16, device=args.device)
+ elif args.bf16_full_eval:
+ model = model.to(dtype=torch.bfloat16, device=args.device)
+
+ batch_size = self.args.eval_batch_size
+
+ logger.info(f"***** Running {description} *****")
+ if has_length(dataloader):
+ logger.info(f" Num examples = {self.num_examples(dataloader)}")
+ else:
+ logger.info(" Num examples: Unknown")
+ logger.info(f" Batch size = {batch_size}")
+
+ model.eval()
+
+ self.callback_handler.eval_dataloader = dataloader
+ # Do this before wrapping.
+ eval_dataset = getattr(dataloader, "dataset", None)
+
+ if is_torch_tpu_available():
+ dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device)
+
+ if args.past_index >= 0:
+ self._past = None
+
+ # Initialize containers
+ # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
+ losses_host = None
+ preds_host = None
+ labels_host = None
+ inputs_host = None
+
+ # losses/preds/labels on CPU (final containers)
+ all_losses = None
+ all_preds = None
+ all_labels = None
+ all_inputs = None
+ # Will be useful when we have an iterable dataset so don't know its length.
+
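+        # Create an ONNX Runtime inference session once from the serialized ONNX model;
+        # prediction_step below feeds each evaluation batch through this session.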
+ onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString(), None)
+ observed_num_examples = 0
+ # Main evaluation loop
+ for step, inputs in enumerate(dataloader):
+ # Update the observed num examples
+ observed_batch_size = find_batch_size(inputs)
+ if observed_batch_size is not None:
+ observed_num_examples += observed_batch_size
+ # For batch samplers, batch_size is not known by the dataloader in advance.
+ if batch_size is None:
+ batch_size = observed_batch_size
+
+ # Prediction step
+ loss, logits, labels = self.prediction_step(onnx_session, model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
+
+ if is_torch_tpu_available():
+ xm.mark_step()
+
+ # Update containers on host
+ if loss is not None:
+ losses = self._nested_gather(loss.repeat(batch_size))
+ losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
+ if labels is not None:
+ labels = self._pad_across_processes(labels)
+ labels = self._nested_gather(labels)
+ labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
+ if inputs_decode is not None:
+ inputs_decode = self._pad_across_processes(inputs_decode)
+ inputs_decode = self._nested_gather(inputs_decode)
+ inputs_host = (
+ inputs_decode
+ if inputs_host is None
+ else nested_concat(inputs_host, inputs_decode, padding_index=-100)
+ )
+ if logits is not None:
+ logits = self._pad_across_processes(logits)
+ logits = self._nested_gather(logits)
+ if self.preprocess_logits_for_metrics is not None:
+ logits = self.preprocess_logits_for_metrics(logits, labels)
+ preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
+ self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
+
+ # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
+ if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode
+ if all_inputs is None
+ else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = (
+ labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+ )
+
+ # Set back to None to begin a new accumulation
+ losses_host, preds_host, inputs_host, labels_host = None, None, None, None
+
+ if args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of the evaluation loop
+ delattr(self, "_past")
+
+ # Gather all remaining tensors and put them back on the CPU
+ if losses_host is not None:
+ losses = nested_numpify(losses_host)
+ all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
+ if preds_host is not None:
+ logits = nested_numpify(preds_host)
+ all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
+ if inputs_host is not None:
+ inputs_decode = nested_numpify(inputs_host)
+ all_inputs = (
+ inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
+ )
+ if labels_host is not None:
+ labels = nested_numpify(labels_host)
+ all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+
+ # Number of samples
+ if has_length(eval_dataset):
+ num_samples = len(eval_dataset)
+ # The instance check is weird and does not actually check for the type, but whether the dataset has the right
+ # methods. Therefore we need to make sure it also has the attribute.
+ elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"):
+ num_samples = eval_dataset.num_examples
+ else:
+ if has_length(dataloader):
+ num_samples = self.num_examples(dataloader)
+ else: # both len(dataloader.dataset) and len(dataloader) fail
+ num_samples = observed_num_examples
+
+ # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of
+        # samples has been rounded to a multiple of batch_size, so we truncate.
+ if all_losses is not None:
+ all_losses = all_losses[:num_samples]
+ if all_preds is not None:
+ all_preds = nested_truncate(all_preds, num_samples)
+ if all_labels is not None:
+ all_labels = nested_truncate(all_labels, num_samples)
+ if all_inputs is not None:
+ all_inputs = nested_truncate(all_inputs, num_samples)
+
+ # Metrics!
+ if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
+ if args.include_inputs_for_metrics:
+ metrics = self.compute_metrics(
+ EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
+ )
+ else:
+ metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
+ else:
+ metrics = {}
+
+ # To be JSON-serializable, we need to remove numpy types or zero-d tensors
+ metrics = denumpify_detensorize(metrics)
+
+ if all_losses is not None:
+ metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
+
+ # Prefix all keys with metric_key_prefix + '_'
+ for key in list(metrics.keys()):
+ if not key.startswith(f"{metric_key_prefix}_"):
+ metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
+
+ return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
+
+ def prediction_step(
+ self,
+ onnx_session,
+ model: nn.Module,
+ inputs: Dict[str, Union[torch.Tensor, Any]],
+ prediction_loss_only: bool,
+ ignore_keys: Optional[List[str]] = None,
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Perform an evaluation step on `model` using `inputs`.
+ Subclass and override to inject custom behavior.
+ Args:
+ model (`nn.Module`):
+ The model to evaluate.
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+ The inputs and targets of the model.
+ The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+ argument `labels`. Check your model's documentation for all accepted arguments.
+ prediction_loss_only (`bool`):
+ Whether or not to return the loss only.
+            ignore_keys (`List[str]`, *optional*):
+ A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+ gathering predictions.
+ Return:
+ Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
+ logits and labels (each being optional).
+ """
+ has_labels = all(inputs.get(k) is not None for k in self.label_names)
+ inputs = self._prepare_inputs(inputs)
+ if ignore_keys is None:
+ if hasattr(self.model, "config"):
+ ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
+ else:
+ ignore_keys = []
+
+ # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
+ if has_labels:
+ labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
+ if len(labels) == 1:
+ labels = labels[0]
+ else:
+ labels = None
+
+ with torch.no_grad():
+ if is_sagemaker_mp_enabled():
+ raw_outputs = smp_forward_only(model, inputs)
+ if has_labels:
+ if isinstance(raw_outputs, dict):
+ loss_mb = raw_outputs["loss"]
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ loss_mb = raw_outputs[0]
+ logits_mb = raw_outputs[1:]
+
+ loss = loss_mb.reduce_mean().detach().cpu()
+ logits = smp_nested_concat(logits_mb)
+ else:
+ loss = None
+ if isinstance(raw_outputs, dict):
+ logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
+ else:
+ logits_mb = raw_outputs
+ logits = smp_nested_concat(logits_mb)
+ else:
+ if has_labels:
+ with self.compute_loss_context_manager():
+ loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
+ loss = loss.mean().detach()
+
+ if isinstance(outputs, dict):
+ logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
+ else:
+ logits = outputs[1:]
+ else:
+ loss = None
+ with self.compute_loss_context_manager():
+                        # Feed the batch to ONNX Runtime keyed by the graph's input names; each name
+                        # must map to the matching tensor produced by the tokenizer.
+                        data = {"input_ids": np.array(inputs['input_ids'], dtype=np.int64),
+                                "attention_mask": np.array(inputs['attention_mask'], dtype=np.int64),
+                                "token_type_ids": np.array(inputs['token_type_ids'], dtype=np.int64)}
+ outputs2 = onnx_session.run(None, data)
+ logits2 = tuple((torch.from_numpy(outputs2[0]), torch.from_numpy(outputs2[1])))
+ # TODO: this needs to be fixed and made cleaner later.
+ if self.args.past_index >= 0:
+ self._past = outputs[self.args.past_index - 1]
+
+ logits2 = nested_detach(logits2)
+ return (loss, logits2, labels)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py
new file mode 100644
index 00000000000..96af7f1d6bd
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py
@@ -0,0 +1,440 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Post-processing utilities for question answering.
+"""
+import collections
+import json
+import logging
+import os
+from typing import Optional, Tuple
+
+import numpy as np
+from tqdm.auto import tqdm
+
+
+logger = logging.getLogger(__name__)
+
+
+def postprocess_qa_predictions(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ null_score_diff_threshold: float = 0.0,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing function for models that only return start and end logits.
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+ The threshold used to select the null answer: if the best answer has a score that is less than the score of
+ the null answer minus this threshold, the null answer is selected for this example (note that the score of
+ the null answer for an example giving several features is the minimum of the scores for the null answer on
+ each feature: all features must be aligned on the fact they `want` to predict a null answer).
+ Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 2:
+ raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
+ all_start_logits, all_end_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ if version_2_with_negative:
+ scores_diff_json = collections.OrderedDict()
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_prediction = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_logits = all_start_logits[feature_index]
+ end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the
+            # original context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+ # Update minimum null prediction.
+ feature_null_score = start_logits[0] + end_logits[0]
+ if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
+ min_null_prediction = {
+ "offsets": (0, 0),
+ "score": feature_null_score,
+ "start_logit": start_logits[0],
+ "end_logit": end_logits[0],
+ }
+
+ # Go through all possibilities for the `n_best_size` greater start and end logits.
+ start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+ for start_index in start_indexes:
+ for end_index in end_indexes:
+ # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+ # to part of the input_ids that are not in the context.
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+ # Don't consider answers with a length that is either < 0 or > max_answer_length.
+ if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+                    # provided).
+ if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+ continue
+
+ prelim_predictions.append(
+ {
+ "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+ "score": start_logits[start_index] + end_logits[end_index],
+ "start_logit": start_logits[start_index],
+ "end_logit": end_logits[end_index],
+ }
+ )
+ if version_2_with_negative and min_null_prediction is not None:
+ # Add the minimum null prediction
+ prelim_predictions.append(min_null_prediction)
+ null_score = min_null_prediction["score"]
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+ # Add back the minimum null prediction if it was removed because of its low score.
+ if (
+ version_2_with_negative
+ and min_null_prediction is not None
+ and not any(p["offsets"] == (0, 0) for p in predictions)
+ ):
+ predictions.append(min_null_prediction)
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
+ predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction. If the null answer is not possible, this is easy.
+ if not version_2_with_negative:
+ all_predictions[example["id"]] = predictions[0]["text"]
+ else:
+ # Otherwise we first need to find the best non-empty prediction.
+ i = 0
+ while predictions[i]["text"] == "":
+ i += 1
+ best_non_null_pred = predictions[i]
+
+ # Then we compare to the null prediction using the threshold.
+ score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
+ scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
+ if score_diff > null_score_diff_threshold:
+ all_predictions[example["id"]] = ""
+ else:
+ all_predictions[example["id"]] = best_non_null_pred["text"]
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+ )
+ nbest_file = os.path.join(
+ output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+ examples,
+ features,
+ predictions: Tuple[np.ndarray, np.ndarray],
+ version_2_with_negative: bool = False,
+ n_best_size: int = 20,
+ max_answer_length: int = 30,
+ start_n_top: int = 5,
+ end_n_top: int = 5,
+ output_dir: Optional[str] = None,
+ prefix: Optional[str] = None,
+ log_level: Optional[int] = logging.WARNING,
+):
+ """
+ Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
+    original contexts. This is the postprocessing function for models that return start and end logits, indices, as well as
+ cls token predictions.
+ Args:
+ examples: The non-preprocessed dataset (see the main script for more information).
+ features: The processed dataset (see the main script for more information).
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+ The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+ first dimension must match the number of elements of :obj:`features`.
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the underlying dataset contains examples with no answers.
+ n_best_size (:obj:`int`, `optional`, defaults to 20):
+ The total number of n-best predictions to generate when looking for an answer.
+ max_answer_length (:obj:`int`, `optional`, defaults to 30):
+ The maximum length of an answer that can be generated. This is needed because the start and end predictions
+ are not conditioned on one another.
+ start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
+ end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
+ output_dir (:obj:`str`, `optional`):
+ If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+ :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+ answers, are saved in `output_dir`.
+ prefix (:obj:`str`, `optional`):
+ If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+ log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+ ``logging`` log level (e.g., ``logging.WARNING``)
+ """
+ if len(predictions) != 5:
+ raise ValueError("`predictions` should be a tuple with five elements.")
+ start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
+
+ if len(predictions[0]) != len(features):
+ raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
+
+ # Build a map example to its corresponding features.
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+ features_per_example = collections.defaultdict(list)
+ for i, feature in enumerate(features):
+ features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+ # The dictionaries we have to fill.
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
+
+ # Logging.
+ logger.setLevel(log_level)
+ logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+ # Let's loop over all the examples!
+ for example_index, example in enumerate(tqdm(examples)):
+ # Those are the indices of the features associated to the current example.
+ feature_indices = features_per_example[example_index]
+
+ min_null_score = None
+ prelim_predictions = []
+
+ # Looping through all the features associated to the current example.
+ for feature_index in feature_indices:
+ # We grab the predictions of the model for this feature.
+ start_log_prob = start_top_log_probs[feature_index]
+ start_indexes = start_top_index[feature_index]
+ end_log_prob = end_top_log_probs[feature_index]
+ end_indexes = end_top_index[feature_index]
+ feature_null_score = cls_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the
+            # original context.
+ offset_mapping = features[feature_index]["offset_mapping"]
+ # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+ # available in the current feature.
+ token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+ # Update minimum null prediction
+ if min_null_score is None or feature_null_score < min_null_score:
+ min_null_score = feature_null_score
+
+ # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
+ for i in range(start_n_top):
+ for j in range(end_n_top):
+ start_index = int(start_indexes[i])
+ j_index = i * end_n_top + j
+ end_index = int(end_indexes[j_index])
+ # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
+ # p_mask but let's not take any risk)
+ if (
+ start_index >= len(offset_mapping)
+ or end_index >= len(offset_mapping)
+ or offset_mapping[start_index] is None
+ or len(offset_mapping[start_index]) < 2
+ or offset_mapping[end_index] is None
+ or len(offset_mapping[end_index]) < 2
+ ):
+ continue
+
+ # Don't consider answers with a length negative or > max_answer_length.
+ if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+ continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+                    # provided).
+ if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+ continue
+ prelim_predictions.append(
+ {
+ "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+ "score": start_log_prob[i] + end_log_prob[j_index],
+ "start_log_prob": start_log_prob[i],
+ "end_log_prob": end_log_prob[j_index],
+ }
+ )
+
+ # Only keep the best `n_best_size` predictions.
+ predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+ # Use the offsets to gather the answer text in the original context.
+ context = example["context"]
+ for pred in predictions:
+ offsets = pred.pop("offsets")
+ pred["text"] = context[offsets[0] : offsets[1]]
+
+ # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+ # failure.
+ if len(predictions) == 0:
+ # Without predictions min_null_score is going to be None and None will cause an exception later
+ min_null_score = -2e-6
+ predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})
+
+ # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+ # the LogSumExp trick).
+ scores = np.array([pred.pop("score") for pred in predictions])
+ exp_scores = np.exp(scores - np.max(scores))
+ probs = exp_scores / exp_scores.sum()
+
+ # Include the probabilities in our predictions.
+ for prob, pred in zip(probs, predictions):
+ pred["probability"] = prob
+
+ # Pick the best prediction and set the probability for the null answer.
+ all_predictions[example["id"]] = predictions[0]["text"]
+ if version_2_with_negative:
+ scores_diff_json[example["id"]] = float(min_null_score)
+
+ # Make `predictions` JSON-serializable by casting np.float back to float.
+ all_nbest_json[example["id"]] = [
+ {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+ for pred in predictions
+ ]
+
+ # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None:
+ if not os.path.isdir(output_dir):
+ raise EnvironmentError(f"{output_dir} is not a directory.")
+
+ prediction_file = os.path.join(
+ output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+ )
+ nbest_file = os.path.join(
+ output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+ )
+ if version_2_with_negative:
+ null_odds_file = os.path.join(
+ output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+ )
+
+ logger.info(f"Saving predictions to {prediction_file}.")
+ with open(prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+ logger.info(f"Saving nbest_preds to {nbest_file}.")
+ with open(nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+ if version_2_with_negative:
+ logger.info(f"Saving null_odds to {null_odds_file}.")
+ with open(null_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+ return all_predictions, scores_diff_json
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md
new file mode 100644
index 00000000000..0afa7d99629
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md
@@ -0,0 +1,56 @@
+# Evaluate performance of ONNX Runtime (Huggingface Text Classification)
+> ONNX Runtime quantization is under active development. Please use version 1.6.0 or newer for broader quantization support.
+
+This example loads a text classification model fine-tuned on a GLUE task and confirms its accuracy and speed based on [GLUE data](https://gluebenchmark.com/).
+
+### Environment
+Please use the latest onnx and onnxruntime versions.
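+
+The Python dependencies for this example are listed in `requirements.txt` in this directory, so a typical setup is:
+
+```shell
+pip install -r requirements.txt
+```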
+
+### Prepare dataset
+Download the GLUE data with the `prepare_data.sh` script.
+
+```shell
+export GLUE_DIR=/path/to/glue_data
+export TASK_NAME=MRPC # or SST
+
+bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME
+```
+
+### Prepare model
+
+Supported model identifiers from [huggingface.co](https://huggingface.co/):
+
+| Model Identifier |
+|:-----------------------------------------------:|
+| Intel/bert-base-uncased-mrpc |
+| Intel/roberta-base-mrpc |
+| Intel/xlm-roberta-base-mrpc |
+| Intel/camembert-base-mrpc |
+| distilbert-base-uncased-finetuned-sst-2-english |
+| Alireza1044/albert-base-v2-sst2 |
+| Intel/MiniLM-L12-H384-uncased-mrpc |
+| philschmid/MiniLM-L6-H384-uncased-sst2 |
+
+```bash
+python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc  # or other supported model identifier
+```
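+
+`export.py` writes the model to the current directory as `<model name>.onnx` (for example, `bert-base-uncased-mrpc.onnx`); pass that file as `--input_model` in the steps below.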
+
+### Quantization
+
+Quantize model with dynamic quantization:
+
+```bash
+# --input_model and --output_model are *.onnx file paths
+bash run_tuning.sh --config=glue_dynamic.yaml \
+                   --input_model=path/to/model \
+                   --output_model=path/to/model_tune \
+                   --data_path=path/to/glue/data
+```
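+
+Note: `run_tuning.sh` (and `run_benchmark.sh` below) picks the model identifier and GLUE task by matching the input model filename against the supported identifiers, so keep the exported `*.onnx` filename unchanged.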
+
+### Benchmark
+
+```bash
+# --input_model is the *.onnx model path
+bash run_benchmark.sh --config=glue_dynamic.yaml \
+                      --input_model=path/to/model \
+                      --data_path=path/to/glue/data \
+                      --mode=performance  # or accuracy
+```
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py
new file mode 100644
index 00000000000..f2a38e747b3
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py
@@ -0,0 +1,72 @@
+import argparse
+
+import torch
+from transformers import AutoConfig, AutoModelForSequenceClassification
+
+def export_onnx_model(args, model):
+ with torch.no_grad():
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+ if args.model_name_or_path in ['Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english']:
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=14, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'attention_mask' : symbolic_names})
+ else:
+ inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64),
+ 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
+ torch.onnx.export(model, # model being run
+ (inputs['input_ids'], # model input (or a tuple for multiple inputs)
+ inputs['token_type_ids'],
+ inputs['attention_mask']),
+ args.output_model, # where to save the model (can be a file or file-like object)
+ opset_version=14, # the ONNX version to export the model
+ do_constant_folding=True, # whether to execute constant folding
+ input_names=['input_ids', # the model's input names
+ 'token_type_ids',
+ 'attention_mask'],
+ dynamic_axes={'input_ids': symbolic_names, # variable length axes
+ 'token_type_ids' : symbolic_names,
+ 'attention_mask' : symbolic_names})
+ print("ONNX Model exported to {0}".format(args.output_model))
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Export huggingface onnx model',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['Intel/bert-base-uncased-mrpc',
+ 'Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english',
+ 'Alireza1044/albert-base-v2-sst2',
+ 'philschmid/MiniLM-L6-H384-uncased-sst2',
+ 'Intel/MiniLM-L12-H384-uncased-mrpc'],
+ help='pretrained model name or path')
+ parser.add_argument(
+ '--max_len',
+ type=int,
+ default=128,
+ help='Maximum length of the sentence pairs')
+ args = parser.parse_args()
+ args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx'
+
+ model = AutoModelForSequenceClassification.from_pretrained(
+ args.model_name_or_path,
+ config=AutoConfig.from_pretrained(args.model_name_or_path))
+
+ export_onnx_model(args, model)
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml
new file mode 100644
index 00000000000..fa9a22ce874
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 1.0
+
+model: # mandatory. used to specify model specific information.
+ name: text_classification
+ framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.
+
+evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
+ performance: # optional. used to benchmark performance of passing model.
+ warmup: 10
+ iteration: 100
+ configs:
+ cores_per_instance: 28
+ num_of_instance: 1
+
+quantization:
+ approach: post_training_dynamic_quant # optional. default value is post_training_static_quant.
+
+tuning:
+ accuracy_criterion:
+ relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%.
+ random_seed: 9527 # optional. random seed for deterministic tuning.
\ No newline at end of file
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py
new file mode 100644
index 00000000000..d5051af3816
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py
@@ -0,0 +1,422 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+
+import logging
+import argparse
+import onnx
+import onnxruntime as ort
+import transformers
+import os
+import torch
+import numpy as np
+from dataclasses import dataclass
+from typing import List, Optional, Union
+import sys
+from neural_compressor.data import DATALOADERS, DATASETS
+
+
+class ONNXRTBertDataset:
+ """Dataset used for model Bert.
+ Args: data_dir (str): The input data dir.
+ model_name_or_path (str): Path to pre-trained student model or shortcut name,
+ selected in the list:
+ max_seq_length (int, default=128): The maximum length after tokenization.
+ Sequences longer than this will be truncated,
+ sequences shorter will be padded.
+ do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing.
+ task (str, default=mrpc): The name of the task to fine-tune.
+ Choices include mrpc, qqp, qnli, rte,
+ sts-b, cola, mnli, wnli.
+ model_type (str, default='bert'): model type, support 'distilbert', 'bert',
+ 'mobilebert', 'roberta'.
+        dynamic_length (bool, default=False): Whether to use dynamic sequence length.
+        evaluate (bool, default=True): Whether to do evaluation or training.
+ transform (transform object, default=None): transform to process input data.
+ filter (Filter objects, default=None): filter out examples according
+ to specific conditions.
+ """
+ def __init__(self, data_dir, model_name_or_path, max_seq_length=128,\
+ do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\
+ evaluate=True, transform=None, filter=None):
+ task = task.lower()
+ model_type = model_type.lower()
+ assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'], 'Unsupported task type'
+ assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \
+ model type'
+ self.dynamic_length = dynamic_length
+ self.model_type = model_type
+ self.max_seq_length = max_seq_length
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path,
+ do_lower_case=do_lower_case)
+ self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \
+ max_seq_length, task, model_type, tokenizer, evaluate)
+
+ def __len__(self):
+ return len(self.dataset)
+
+    def __getitem__(self, index):
+        # Convert tensors to numpy arrays and return ((input_ids, attention_mask, token_type_ids), label).
+        batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index])
+        return batch[:3], batch[-1]
+
+def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \
+ model_type, tokenizer, evaluate):
+ from torch.utils.data import TensorDataset
+
+ processor = transformers.glue_processors[task]()
+ output_mode = transformers.glue_output_modes[task]
+ # Load data features from cache or dataset file
+ if not os.path.exists("./dataset_cached"):
+ os.makedirs("./dataset_cached")
+ cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format(
+ 'dev' if evaluate else 'train',
+ list(filter(None, model_name_or_path.split('/'))).pop(),
+ str(max_seq_length),
+ str(task)))
+ if os.path.exists(cached_features_file):
+ logger.info("Load features from cached file {}.".format(cached_features_file))
+ features = torch.load(cached_features_file)
+ else:
+ logger.info("Create features from dataset file at {}.".format(data_dir))
+ label_list = processor.get_labels()
+ examples = processor.get_dev_examples(data_dir) if evaluate else \
+ processor.get_train_examples(data_dir)
+ features = convert_examples_to_features(examples,
+ tokenizer,
+ task=task,
+ label_list=label_list,
+ max_length=max_seq_length,
+ output_mode=output_mode,
+ )
+ logger.info("Save features into cached file {}.".format(cached_features_file))
+ torch.save(features, cached_features_file)
+ # Convert to Tensors and build dataset
+ all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+ all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+ all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+ all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
+ if output_mode == "classification":
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+ elif output_mode == "regression":
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+ dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \
+ all_seq_lengths, all_labels)
+ return dataset
+
+def convert_examples_to_features(
+ examples,
+ tokenizer,
+ max_length=128,
+ task=None,
+ label_list=None,
+ output_mode="classification",
+ pad_token=0,
+ pad_token_segment_id=0,
+ mask_padding_with_zero=True,
+):
+ processor = transformers.glue_processors[task]()
+ if label_list is None:
+ label_list = processor.get_labels()
+ logger.info("Use label list {} for task {}.".format(label_list, task))
+ label_map = {label: i for i, label in enumerate(label_list)}
+ features = []
+ for (ex_index, example) in enumerate(examples):
+ inputs = tokenizer.encode_plus(
+ example.text_a,
+ example.text_b,
+ add_special_tokens=True,
+ max_length=max_length,
+ return_token_type_ids=True,
+ truncation=True,
+ )
+ input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
+ # tokens are attended to.
+ attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+ # Zero-pad up to the sequence length.
+ seq_length = len(input_ids)
+ padding_length = max_length - len(input_ids)
+
+ input_ids = input_ids + ([pad_token] * padding_length)
+ attention_mask = attention_mask + \
+ ([0 if mask_padding_with_zero else 1] * padding_length)
+ token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+ assert len(input_ids) == max_length, \
+ "Error with input_ids length {} vs {}".format(
+ len(input_ids), max_length)
+ assert len(attention_mask) == max_length, \
+ "Error with attention_mask length {} vs {}".format(
+ len(attention_mask), max_length
+ )
+ assert len(token_type_ids) == max_length, \
+ "Error with token_type_ids length {} vs {}".format(
+ len(token_type_ids), max_length
+ )
+ if output_mode == "classification":
+ label = label_map[example.label]
+ elif output_mode == "regression":
+ label = float(example.label)
+ else:
+ raise KeyError(output_mode)
+
+ feats = InputFeatures(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ label=label,
+ seq_length=seq_length,
+ )
+ features.append(feats)
+ return features
+
+@dataclass(frozen=True)
+class InputFeatures:
+ """
+ A single set of features of data.
+ Property names are the same names as the corresponding inputs to a model.
+ Args:
+ input_ids: Indices of input sequence tokens in the vocabulary.
+ attention_mask: Mask to avoid performing attention on padding token indices.
+ Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED,
+ ``0`` for MASKED (padded) tokens.
+ token_type_ids: (Optional) Segment token indices to indicate first and second
+ portions of the inputs. Only some models use them.
+ label: (Optional) Label corresponding to the input. Int for classification problems,
+ float for regression problems.
+ seq_length: (Optional) The length of input sequence before padding.
+ """
+
+ input_ids: List[int]
+ attention_mask: Optional[List[int]] = None
+ token_type_ids: Optional[List[int]] = None
+ label: Optional[Union[int, float]] = None
+ seq_length: Optional[List[int]] = None
+
+class ONNXRTGLUE:
+ """Computes GLUE score.
+
+ Args:
+ task (str, default=mrpc): The name of the task.
+ Choices include mrpc, qqp, qnli, rte,
+ sts-b, cola, mnli, wnli.
+
+ """
+ def __init__(self, task='mrpc'):
+ assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'], 'Unsupported task type'
+ self.pred_list = None
+ self.label_list = None
+ self.task = task
+ self.return_key = {
+ "cola": "mcc",
+ "mrpc": "f1",
+ "sts-b": "corr",
+ "qqp": "acc",
+ "mnli": "mnli/acc",
+ "qnli": "acc",
+ "rte": "acc",
+ "wnli": "acc",
+ "sst-2": "acc"
+ }
+
+ def update(self, preds, labels):
+ """add preds and labels to storage"""
+ if isinstance(preds, list) and len(preds) == 1:
+ preds = preds[0]
+ if isinstance(labels, list) and len(labels) == 1:
+ labels = labels[0]
+ if self.pred_list is None:
+ self.pred_list = preds
+ self.label_list = labels
+ else:
+ self.pred_list = np.append(self.pred_list, preds, axis=0)
+ self.label_list = np.append(self.label_list, labels, axis=0)
+
+ def reset(self):
+ """clear preds and labels storage"""
+ self.pred_list = None
+ self.label_list = None
+
+ def result(self):
+ """calculate metric"""
+ output_mode = transformers.glue_output_modes[self.task]
+
+ if output_mode == "classification":
+ processed_preds = np.argmax(self.pred_list, axis=1)
+ elif output_mode == "regression":
+ processed_preds = np.squeeze(self.pred_list)
+ result = transformers.glue_compute_metrics(\
+ self.task, processed_preds, self.label_list)
+ return result[self.return_key[self.task]]
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+ datefmt = '%m/%d/%Y %H:%M:%S',
+ level = logging.WARN)
+
+if __name__ == "__main__":
+ logger.info('Evaluating ONNXRuntime full precision accuracy and performance:')
+ parser = argparse.ArgumentParser(
+ description='BERT fine-tune examples for classification/regression tasks.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ '--model_path',
+ type=str,
+        help="Pre-trained model in ONNX format"
+ )
+ parser.add_argument(
+ '--benchmark',
+ action='store_true', \
+ default=False
+ )
+ parser.add_argument(
+ '--tune',
+ action='store_true', \
+ default=False,
+ help="whether quantize the model"
+ )
+ parser.add_argument(
+ '--config',
+ type=str,
+ help="config yaml path"
+ )
+ parser.add_argument(
+ '--output_model',
+ type=str,
+ default=None,
+ help="output model path"
+ )
+ parser.add_argument(
+ '--mode',
+ type=str,
+ help="benchmark mode of performance or accuracy"
+ )
+ parser.add_argument(
+ '--data_path',
+ type=str,
+ help="input data path"
+ )
+ parser.add_argument(
+ '--batch_size',
+ default=8,
+ type=int,
+ )
+ parser.add_argument(
+ '--model_name_or_path',
+ type=str,
+ choices=['Intel/bert-base-uncased-mrpc',
+ 'Intel/roberta-base-mrpc',
+ 'Intel/xlm-roberta-base-mrpc',
+ 'Intel/camembert-base-mrpc',
+ 'distilbert-base-uncased-finetuned-sst-2-english',
+ 'Alireza1044/albert-base-v2-sst2',
+ 'philschmid/MiniLM-L6-H384-uncased-sst2',
+ 'Intel/MiniLM-L12-H384-uncased-mrpc'],
+ help="pretrained model name or path"
+ )
+ parser.add_argument(
+ '--task',
+ type=str,
+ choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \
+ 'mnli', 'wnli', 'sst-2'],
+ help="GLUE task name"
+ )
+ parser.add_argument(
+ '--num_heads',
+ default=12,
+ type=int,
+ )
+ parser.add_argument(
+ '--hidden_size',
+ default=768,
+ type=int,
+ )
+
+ args = parser.parse_args()
+
+ dataset = ONNXRTBertDataset(data_dir=args.data_path,
+ model_name_or_path=args.model_name_or_path,
+ task=args.task)
+ dataloader = DATALOADERS['onnxrt_integerops'](dataset, batch_size=args.batch_size)
+ metric = ONNXRTGLUE(args.task)
+
+ def eval_func(model, *args):
+ metric.reset()
+ import tqdm
+ session = ort.InferenceSession(model.SerializeToString(), None)
+ ort_inputs = {}
+ len_inputs = len(session.get_inputs())
+ inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
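+        # The dataset above yields ((input_ids, attention_mask, token_type_ids), label); only the first
+        # len_inputs tensors are fed to the session, matched positionally to its declared input names.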
+ for idx, (inputs, labels) in enumerate(dataloader):
+ if not isinstance(labels, list):
+ labels = [labels]
+ inputs = inputs[:len_inputs]
+ for i in range(len_inputs):
+ ort_inputs.update({inputs_names[i]: inputs[i]})
+ predictions = session.run(None, ort_inputs)
+ metric.update(predictions[0], labels)
+ return metric.result()
+
+ if args.benchmark:
+ from neural_compressor.experimental import Benchmark, common
+ model = onnx.load(args.model_path)
+ if args.mode == 'performance':
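+            # Performance mode only measures speed, so a dummy int64 dataset shaped (1, 128) per graph
+            # input is generated instead of loading real GLUE data.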
+ session = ort.InferenceSession(args.model_path, None)
+ input_tensors = session.get_inputs()
+ shape = []
+ for i in range(len(input_tensors)):
+ shape.append((1, 128))
+ datasets = DATASETS('onnxrt_integerops')
+ dummy_dataset = datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True)
+ evaluator = Benchmark(args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = common.DataLoader(dummy_dataset)
+ evaluator(args.mode)
+ elif args.mode == 'accuracy':
+ evaluator = Benchmark(args.config)
+ evaluator.model = common.Model(model)
+ evaluator.b_dataloader = dataloader
+ evaluator.metric = metric
+ evaluator.b_func = eval_func
+ evaluator(args.mode)
+
+ if args.tune:
+ from onnxruntime.transformers import optimizer
+ from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
+ opt_options = BertOptimizationOptions('bert')
+ opt_options.enable_embed_layer_norm = False
+
+ model_optimizer = optimizer.optimize_model(
+ args.model_path,
+ 'bert',
+ num_heads=args.num_heads,
+ hidden_size=args.hidden_size,
+ optimization_options=opt_options)
+ model = model_optimizer.model
+
+ from neural_compressor import options
+ from neural_compressor.experimental import Quantization, common
+ options.onnxrt.graph_optimization.level = 'ENABLE_BASIC'
+ quantize = Quantization(args.config)
+ quantize.model = model
+ quantize.eval_func = eval_func
+ q_model = quantize()
+ q_model.save(args.output_model)
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt
new file mode 100644
index 00000000000..a5e81be3aad
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt
@@ -0,0 +1,7 @@
+torch
+transformers==4.16.0
+onnx
+onnxruntime
+coloredlogs
+sympy
+onnxruntime-extensions; python_version < '3.10'
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh
new file mode 100644
index 00000000000..c72b109a530
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+set -x
+
+function main {
+
+ init_params "$@"
+ run_benchmark
+
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --mode=*)
+ mode=$(echo $var |cut -f2 -d=)
+ ;;
+ --data_path=*)
+ data_path=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_benchmark
+function run_benchmark {
+
+ if [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="Intel/bert-base-uncased-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "roberta-base" ]]; then
+ model_name_or_path="Intel/roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then
+ model_name_or_path="Intel/xlm-roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "camembert-base" ]]; then
+ model_name_or_path="Intel/camembert-base-mrpc"
+ TASK_NAME='mrpc'
+ fi
+ if [[ "${input_model}" =~ "distilbert-base" ]]; then
+ model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "albert-base" ]]; then
+ model_name_or_path="Alireza1044/albert-base-v2-sst2"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L6" ]]; then
+ model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2"
+ TASK_NAME='sst-2'
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L12" ]]; then
+ model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc"
+ TASK_NAME='mrpc'
+ fi
+
+ python main.py \
+ --model_name_or_path ${model_name_or_path} \
+ --model_path ${input_model} \
+ --config ${config} \
+ --data_path ${data_path} \
+ --task ${TASK_NAME} \
+ --mode=${mode} \
+ --benchmark
+
+}
+
+main "$@"
+
diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh
new file mode 100644
index 00000000000..7d141154355
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ run_tuning
+}
+
+# init params
+function init_params {
+ for var in "$@"
+ do
+ case $var in
+ --config=*)
+ config=$(echo $var |cut -f2 -d=)
+ ;;
+ --input_model=*)
+ input_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --output_model=*)
+ output_model=$(echo $var |cut -f2 -d=)
+ ;;
+ --data_path=*)
+ data_path=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# run_tuning
+function run_tuning {
+
+ if [[ "${input_model}" =~ "bert-base" ]]; then
+ model_name_or_path="Intel/bert-base-uncased-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "roberta-base" ]]; then
+ model_name_or_path="Intel/roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then
+ model_name_or_path="Intel/xlm-roberta-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "camembert-base" ]]; then
+ model_name_or_path="Intel/camembert-base-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "distilbert-base" ]]; then
+ model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "albert-base" ]]; then
+ model_name_or_path="Alireza1044/albert-base-v2-sst2"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=768
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L6" ]]; then
+ model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2"
+ TASK_NAME='sst-2'
+ num_heads=12
+ hidden_size=384
+ fi
+ if [[ "${input_model}" =~ "MiniLM-L12" ]]; then
+ model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc"
+ TASK_NAME='mrpc'
+ num_heads=12
+ hidden_size=384
+ fi
+
+ python main.py \
+ --model_name_or_path ${model_name_or_path} \
+ --model_path ${input_model} \
+ --output_model ${output_model} \
+ --config ${config} \
+ --data_path ${data_path} \
+ --task ${TASK_NAME} \
+ --num_heads ${num_heads} \
+ --hidden_size ${hidden_size} \
+ --tune
+}
+
+main "$@"
+
+
+
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py
diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
similarity index 100%
rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/export.py b/examples/onnxrt/nlp/roberta/quantization/ptq/export.py
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/export.py
rename to examples/onnxrt/nlp/roberta/quantization/ptq/export.py
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/main.py b/examples/onnxrt/nlp/roberta/quantization/ptq/main.py
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/main.py
rename to examples/onnxrt/nlp/roberta/quantization/ptq/main.py
diff --git a/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh
new file mode 100644
index 00000000000..8e434a5c521
--- /dev/null
+++ b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -x
+
+function main {
+ init_params "$@"
+ download_data
+
+}
+
+# init params
+function init_params {
+
+ for var in "$@"
+ do
+ case $var in
+ --data_dir=*)
+ data_dir=$(echo $var |cut -f2 -d=)
+ ;;
+ --task_name=*)
+ task_name=$(echo $var |cut -f2 -d=)
+ ;;
+ esac
+ done
+
+}
+
+# download data
+function download_data {
+ wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py
+ python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name}
+}
+
+main "$@"
+
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md b/examples/onnxrt/nlp/roberta/quantization/ptq/readme.md
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md
rename to examples/onnxrt/nlp/roberta/quantization/ptq/readme.md
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt
rename to examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml
rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml
rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh
diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh
similarity index 100%
rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh
rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh