Add CodeBert ONNX model (#1121)
Signed-off-by: yiliu30 <[email protected]>
yiliu30 authored Aug 14, 2023
1 parent c83d01d commit 5e584e6
Showing 17 changed files with 1,129 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt
@@ -2702,6 +2702,10 @@ Vanhoucke
ONNXCommunityMeetup
luYBWA
pQ
CodeXGLUE
codebert
codexglue
jsonl
xgb
xgboost
hpo
17 changes: 15 additions & 2 deletions examples/.config/model_params_onnxrt.json
@@ -846,7 +846,20 @@
"input_model": "/tf_dataset2/models/onnx/hf_deberta/deberta-v3-base-mrpc.onnx",
"main_script": "main.py",
"batch_size": 1
}
},
"hf_codebert": {
"model_src_dir": "nlp/huggingface_model/code_detection/quantization/ptq_static",
"dataset_location": "/tf_dataset2/datasets/devign_dataset/valid.jsonl",
"input_model": "/tf_dataset2/models/onnx/hf_codebert/codebert-model.onnx",
"main_script": "main.py",
"batch_size": 1
},
"hf_codebert_dynamic": {
"model_src_dir": "nlp/huggingface_model/code_detection/quantization/ptq_dynamic",
"dataset_location": "/tf_dataset2/datasets/devign_dataset/valid.jsonl",
"input_model": "/tf_dataset2/models/onnx/hf_codebert/codebert-model.onnx",
"main_script": "main.py",
"batch_size": 1
}
}
}

12 changes: 12 additions & 0 deletions examples/README.md
@@ -1108,6 +1108,18 @@ Intel® Neural Compressor validated examples with multiple compression techniques
<td>Post-Training Static Quantization</td>
<td><a href="./onnxrt/body_analysis/onnx_model_zoo/arcface/quantization/ptq_static">qlinearops</a></td>
</tr>
<tr>
<td>CodeBert</td>
<td>Natural Language Processing</td>
<td>Post-Training Static Quantization</td>
<td><a href="./onnxrt/nlp/huggingface_model/code_detection/quantization/ptq_static">qlinearops</a></td>
</tr>
<tr>
<td>CodeBert</td>
<td>Natural Language Processing</td>
<td>Post-Training Dynamic Quantization</td>
<td><a href="./onnxrt/nlp/huggingface_model/code_detection/quantization/ptq_dynamic">integerops</a></td>
</tr>
<tr>
<td>BERT base MRPC</td>
<td>Natural Language Processing</td>
@@ -0,0 +1,54 @@
Step-by-Step
============

This example quantizes the [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base) model fine-tuned on the [code defect detection](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection#codexglue----defect-detection) task.

# Prerequisite

## 1. Environment
```shell
pip install neural-compressor
pip install -r requirements.txt
```
> Note: See the validated ONNX Runtime [versions](/docs/source/installation_guide.md#validated-software-environment).

## 2. Prepare Dataset
Run the `prepare_data.sh` script to download the dataset into the `dataset` folder and pre-process it:

```shell
bash prepare_data.sh
```
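
The script produces `train.jsonl` and `valid.jsonl` under `dataset/`. The snippet below is a minimal sanity check, assuming the CodeXGLUE Devign jsonl layout where each line carries a `func` field (the source code) and a 0/1 `target` label:

```python
# Minimal sanity check of the prepared data (assumes `func`/`target` fields).
import json

with open("dataset/valid.jsonl", "r", encoding="utf-8") as f:
    examples = [json.loads(line) for line in f]

print(f"{len(examples)} validation examples")
print("fields:", sorted(examples[0].keys()))
print("label:", examples[0]["target"])
print("code preview:", examples[0]["func"][:120].replace("\n", " "))
```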
## 3. Prepare Model

Fine-tune the model on the [code defect detection](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection#codexglue----defect-detection) task:
```bash
bash run_fine_tuning.sh --train_dataset_location=./dataset/train.jsonl --dataset_location=./dataset/valid.jsonl --fine_tune
```

Export the model to ONNX format:
```bash
# TODO: replace the model name below once the fine-tuned model is uploaded to the Hugging Face Hub
optimum-cli export onnx --model Intel/TBD-MODEL-NAME --task text-classification onnx_model/
```
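
Once exported, the graph can be smoke-tested directly with ONNX Runtime. The snippet below is a sketch, not part of the example scripts; the file name `onnx_model/model.onnx` and the 256-token padding are assumptions:

```python
# Run one tokenized code snippet through the exported classifier.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
session = ort.InferenceSession("onnx_model/model.onnx",
                               providers=["CPUExecutionProvider"])

encoded = tokenizer("int main() { return 0; }", return_tensors="np",
                    padding="max_length", truncation=True, max_length=256)
# Feed only the inputs the exported graph actually declares.
feed = {inp.name: encoded[inp.name].astype(np.int64)
        for inp in session.get_inputs() if inp.name in encoded}
logits = session.run(None, feed)[0]
print("logits:", logits, "-> predicted class:", int(np.argmax(logits, axis=-1)[0]))
```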

# Run

## 1. Quantization

Static quantization with QOperator format:

```bash
# input_model: model path as *.onnx
bash run_quant.sh --input_model=/path/to/model \
                  --output_model=/path/to/model_tune \
                  --dataset_location=/path/to/valid.jsonl
```
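
For orientation, the sketch below shows roughly what the example does for static QOperator quantization, assuming Neural Compressor's 2.x API; the tiny in-memory calibration set is a hypothetical stand-in for the tokenized Devign samples the real `main.py` reads from `valid.jsonl`:

```python
# Static (QOperator) post-training quantization sketch.
import numpy as np
from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

class CalibDataset:
    """A couple of tokenized snippets; the real example calibrates on valid.jsonl."""
    def __init__(self, snippets, max_length=256):
        self.items = [tokenizer(s, padding="max_length", truncation=True,
                                max_length=max_length, return_tensors="np")
                      for s in snippets]

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        enc = self.items[idx]
        # (inputs, label); the label is ignored during calibration.
        return {k: v[0].astype(np.int64) for k, v in enc.items()}, 0

calib_dataloader = DataLoader(
    framework="onnxruntime",
    dataset=CalibDataset(["int main() { return 0; }",
                          "void f(char *p) { free(p); free(p); }"]),
    batch_size=1,
)

config = PostTrainingQuantConfig(approach="static", quant_format="QOperator")
q_model = quantization.fit(model="onnx_model/model.onnx", conf=config,
                           calib_dataloader=calib_dataloader)
q_model.save("onnx_model/model_int8.onnx")
```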

## 2. Benchmark

```bash
# input_model: model path as *.onnx
bash run_benchmark.sh --input_model=/path/to/model \
                      --dataset_location=/path/to/valid.jsonl \
                      --batch_size=batch_size \
                      --mode=performance # or accuracy
```
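
The sketch below approximates the performance mode, assuming Neural Compressor's 2.x benchmark API; the dummy dataloader is a stand-in for the `valid.jsonl`-backed one the real `main.py` builds, and accuracy mode additionally scores predictions against the labels:

```python
# Performance benchmark sketch for the quantized model.
import numpy as np
from neural_compressor.benchmark import fit
from neural_compressor.config import BenchmarkConfig
from neural_compressor.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

class DummyCodeDataset:
    """Repeats one tokenized snippet; a stand-in for the Devign validation set."""
    def __init__(self, n=32, max_length=256):
        enc = tokenizer("int main() { return 0; }", padding="max_length",
                        truncation=True, max_length=max_length, return_tensors="np")
        self.sample = {k: v[0].astype(np.int64) for k, v in enc.items()}
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return self.sample, 0  # label unused in performance mode

conf = BenchmarkConfig(warmup=10, iteration=100,
                       cores_per_instance=4, num_of_instance=1)
fit(model="onnx_model/model_int8.onnx", conf=conf,
    b_dataloader=DataLoader(framework="onnxruntime",
                            dataset=DummyCodeDataset(), batch_size=1))
```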