Enable ONNXRT NLP example deberta-v3-base (#890)
Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored May 19, 2023
1 parent 53551c2 commit abac54e
Showing 18 changed files with 163 additions and 30 deletions.
4 changes: 3 additions & 1 deletion .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt
@@ -2607,4 +2607,6 @@ layoutlmv
funsd
layoutlmft
nielsr
HYPJUDY
DeBERTa
unilm
14 changes: 14 additions & 0 deletions examples/.config/model_params_onnxrt.json
@@ -811,6 +811,20 @@
"input_model": "/tf_dataset2/models/onnx/hf_layoutlmft/layoutlmft-model.onnx",
"main_script": "main.py",
"batch_size": 1
},
"hf_deberta_dynamic": {
"model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq_dynamic",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/hf_deberta/deberta-v3-base-mrpc.onnx",
"main_script": "main.py",
"batch_size": 1
},
"hf_deberta": {
"model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq_static",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/hf_deberta/deberta-v3-base-mrpc.onnx",
"main_script": "main.py",
"batch_size": 1
}
}
}
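For context, each entry in this file registers an example with the CI runner. A minimal sketch of looking a model up in this registry — the top-level `"onnxrt"` key and the relative file path are assumptions for illustration, not the actual harness code:

```python
import json

# Hypothetical lookup against the registry above; the top-level key and
# the path are assumptions, not part of the actual CI harness.
with open("examples/.config/model_params_onnxrt.json") as f:
    registry = json.load(f)["onnxrt"]

cfg = registry["hf_deberta_dynamic"]
print(cfg["model_src_dir"], cfg["input_model"], cfg["batch_size"])
```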
8 changes: 8 additions & 0 deletions examples/README.md
@@ -1186,6 +1186,14 @@ Intel® Neural Compressor validated examples with multiple compression techniques
<a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_static">qlinearops</a>
</td>
</tr>
<tr>
<td>DeBERTa v3 base MRPC (HuggingFace)</td>
<td>Natural Language Processing</td>
<td>Post-Training Dynamic / Static Quantization</td>
<td>
<a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_static">qlinearops</a>
</td>
</tr>
<tr>
<td>Spanbert SQuAD (HuggingFace)</td>
<td>Natural Language Processing</td>
@@ -31,6 +31,7 @@ Supported model identifier from [huggingface.co](https://huggingface.co/):
| M-FAC/bert-mini-finetuned-mrpc |
| Intel/xlnet-base-cased-mrpc |
| Intel/bart-large-mrpc |
| Intel/deberta-v3-base-mrpc |

```bash
python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc \ # or other supported model identifier
```
@@ -10,9 +10,10 @@ def export_onnx_model(args, model):
'Intel/xlm-roberta-base-mrpc',
'Intel/camembert-base-mrpc',
'distilbert-base-uncased-finetuned-sst-2-english',
'Intel/xlnet-base-cased-mrpc',
'Intel/deberta-v3-base-mrpc']:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask']),
@@ -27,7 +28,7 @@ def export_onnx_model(args, model):
else:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64),
'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask'],
@@ -63,7 +64,9 @@ def export_onnx_model(args, model):
'Intel/electra-small-discriminator-mrpc',
'M-FAC/bert-mini-finetuned-mrpc',
'Intel/xlnet-base-cased-mrpc',
'Intel/bart-large-mrpc',
'Intel/deberta-v3-base-mrpc'
],
help='pretrained model name or path')
parser.add_argument(
'--max_len',
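Because DeBERTa-v3 takes only `input_ids` and `attention_mask` in this example (no `token_type_ids`), the new model is handled by the two-input branch above. A minimal standalone sketch of that export path, assuming `max_len=128` and an illustrative output filename:

```python
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification

# Two-input export, mirroring the branch above; max_len and the output
# filename are illustrative assumptions.
model_id = "Intel/deberta-v3-base-mrpc"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, config=AutoConfig.from_pretrained(model_id))

max_len = 128
inputs = {"input_ids": torch.ones(1, max_len, dtype=torch.int64),
          "attention_mask": torch.ones(1, max_len, dtype=torch.int64)}
symbolic_names = {0: "batch_size", 1: "max_seq_len"}  # variable-length axes

torch.onnx.export(model,
                  (inputs["input_ids"], inputs["attention_mask"]),
                  "deberta-v3-base-mrpc.onnx",
                  opset_version=14,
                  do_constant_folding=True,
                  input_names=["input_ids", "attention_mask"],
                  output_names=["logits"],
                  dynamic_axes={"input_ids": symbolic_names,
                                "attention_mask": symbolic_names})
```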
@@ -6,4 +6,5 @@ onnxruntime
coloredlogs
sympy
onnxruntime-extensions; python_version < '3.10'
numpy==1.23.5
sentencepiece
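`sentencepiece` is added because the DeBERTa-v3 tokenizer is SentencePiece-based. A quick sanity check (assumes Hugging Face Hub access):

```python
# Loading the tokenizer fails without the new sentencepiece dependency.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Intel/deberta-v3-base-mrpc")
print(tok("Hello, world!")["input_ids"])
```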
@@ -85,6 +85,10 @@ function run_benchmark {
model_name_or_path="Intel/bart-large-mrpc"
TASK_NAME='mrpc'
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -106,6 +106,12 @@ function run_tuning {
num_heads=16
hidden_size=4096
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
num_heads=12
hidden_size=768
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
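The hard-coded `num_heads=12` and `hidden_size=768` match the DeBERTa-v3-base architecture and can be cross-checked against the Hugging Face config; a small sanity check (not part of the example scripts, assumes Hub access):

```python
from transformers import AutoConfig

# Verify the tuning script's hard-coded shape parameters.
cfg = AutoConfig.from_pretrained("microsoft/deberta-v3-base")
print(cfg.num_attention_heads, cfg.hidden_size)  # expected: 12 768
```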
@@ -30,7 +30,8 @@ Supported model identifier from [huggingface.co](https://huggingface.co/):
| Intel/electra-small-discriminator-mrpc |
| M-FAC/bert-mini-finetuned-mrpc |
| Intel/xlnet-base-cased-mrpc |
| Intel/bart-large-mrpc |
| Intel/deberta-v3-base-mrpc |

```bash
python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc \ # or other supported model identifier
```
@@ -9,9 +9,11 @@ def export_onnx_model(args, model):
if args.model_name_or_path in ['Intel/roberta-base-mrpc',
'Intel/xlm-roberta-base-mrpc',
'Intel/camembert-base-mrpc',
'distilbert-base-uncased-finetuned-sst-2-english',
'Intel/xlnet-base-cased-mrpc',
'Intel/deberta-v3-base-mrpc']:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask']),
@@ -26,7 +28,7 @@ def export_onnx_model(args, model):
else:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64),
'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask'],
@@ -35,12 +37,12 @@ def export_onnx_model(args, model):
opset_version=14, # the ONNX version to export the model
do_constant_folding=True, # whether to execute constant folding
input_names=['input_ids', # the model's input names
'attention_mask',
'token_type_ids'],
output_names=['logits'],
dynamic_axes={'input_ids': symbolic_names, # variable length axes
'attention_mask' : symbolic_names,
'token_type_ids' : symbolic_names})
print("ONNX Model exported to {0}".format(args.output_model))

if __name__ == "__main__":
@@ -57,7 +59,14 @@ def export_onnx_model(args, model):
'distilbert-base-uncased-finetuned-sst-2-english',
'Alireza1044/albert-base-v2-sst2',
'philschmid/MiniLM-L6-H384-uncased-sst2',
'Intel/MiniLM-L12-H384-uncased-mrpc',
'bert-base-cased-finetuned-mrpc',
'Intel/electra-small-discriminator-mrpc',
'M-FAC/bert-mini-finetuned-mrpc',
'Intel/xlnet-base-cased-mrpc',
'Intel/bart-large-mrpc',
'Intel/deberta-v3-base-mrpc'
],
help='pretrained model name or path')
parser.add_argument(
'--max_len',
@@ -71,4 +80,8 @@ def export_onnx_model(args, model):
args.model_name_or_path,
config=AutoConfig.from_pretrained(args.model_name_or_path))

if args.model_name_or_path == 'Intel/bart-large-mrpc':
import os
os.system('python -m transformers.onnx --model=Intel/bart-large-mrpc --feature=sequence-classification bart-large-mrpc')
else:
export_onnx_model(args, model)
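After export, the resulting graph can be smoke-tested with onnxruntime. An illustrative check — the filename, sequence length, and batch size are assumptions:

```python
import numpy as np
import onnxruntime as ort

# Load the exported model and run one dummy batch through it.
sess = ort.InferenceSession("deberta-v3-base-mrpc.onnx",
                            providers=["CPUExecutionProvider"])
dummy = np.ones((1, 128), dtype=np.int64)
logits = sess.run(["logits"],
                  {"input_ids": dummy, "attention_mask": dummy})[0]
print(logits.shape)  # (1, 2) for the two MRPC labels
```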
@@ -409,15 +409,19 @@ def eval_func(model, *args):

from neural_compressor import quantization, PostTrainingQuantConfig
from neural_compressor.utils.constant import FP32
specific_quant_config = {}
if args.model_name_or_path == 'Intel/bart-large-mrpc':
fp32_op_names = ['/model/(en|de)coder/layers.*/fc(1|2)/MatMul']
specific_quant_config['op_name_dict'] = {op_name:FP32 for op_name in fp32_op_names}
elif args.model_name_or_path == 'Alireza1044/albert-base-v2-sst2':
fp32_op_names = ['Gemm_1410_MatMul', 'MatMul_(259|168)']
specific_quant_config['op_name_dict'] = {op_name:FP32 for op_name in fp32_op_names}
elif args.model_name_or_path == 'Intel/deberta-v3-base-mrpc':
specific_quant_config['op_type_dict'] = {'^((?!(MatMul|Gather)).)*$': FP32}
specific_quant_config['quant_level'] = 1
config = PostTrainingQuantConfig(approach='static',
quant_format=args.quant_format,
**specific_quant_config)
q_model = quantization.fit(model,
config,
eval_func=eval_func,
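The DeBERTa branch pins every op type except `MatMul` and `Gather` to FP32 via a negative-lookahead regex in `op_type_dict`, so only those two op types are quantized. A standalone illustration of how the pattern classifies op type names:

```python
import re

# Matches any op type whose name does NOT contain "MatMul" or "Gather";
# matched types are kept at FP32, so only MatMul and Gather get quantized.
pattern = re.compile(r'^((?!(MatMul|Gather)).)*$')
for op in ["MatMul", "Gather", "Add", "LayerNormalization", "Softmax"]:
    print(op, bool(pattern.match(op)))
# MatMul False, Gather False, Add True, LayerNormalization True, Softmax True
```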
@@ -6,4 +6,5 @@ onnxruntime
coloredlogs
sympy
onnxruntime-extensions; python_version < '3.10'
numpy==1.23.5
sentencepiece
@@ -85,6 +85,10 @@ function run_benchmark {
model_name_or_path="Intel/bart-large-mrpc"
TASK_NAME='mrpc'
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -109,6 +109,12 @@ function run_tuning {
num_heads=16
hidden_size=4096
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
num_heads=12
hidden_size=768
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -14,7 +14,7 @@ bash install_layoutlmft.sh
> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
## 2. Prepare Model
Fine-tune on FUNSD; refer to the [fine-tuning example on FUNSD](https://github.com/microsoft/unilm/tree/master/layoutlm#fine-tuning-example-on-funsd).

```bash
python main.py \
@@ -14,7 +14,7 @@ bash install_layoutlmft.sh
> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
## 2. Prepare Model
Fine-tune on FUNSD; refer to the [fine-tuning example on FUNSD](https://github.com/microsoft/unilm/tree/master/layoutlm#fine-tuning-example-on-funsd).

```bash
python main.py \
27 changes: 26 additions & 1 deletion neural_compressor/adaptor/onnxrt.py
@@ -995,14 +995,16 @@ def query_fw_capability(self, model):

ffn_matmul = []
attention_matmul_optype = [node.op_type for node in attention_matmul]
# find MatMul ops in the feed-forward network (FFN) structure, which mainly appears in transformer-based NLP models
if len(attention_matmul) > 0 and 'Attention' in attention_matmul_optype:
# model is optimized and Attention is fused,
# index of Attention is used as split to find FFN MatMul
first_attention_index = attention_matmul_optype.index('Attention')
attention_matmul_optype = attention_matmul_optype[first_attention_index:]
attention_matmul = attention_matmul[first_attention_index:]
attention_index = list(np.where(np.array(attention_matmul_optype) == 'Attention')[0])
block_len = attention_index[1] - attention_index[0] if len(attention_index) > 2 else 4
for idx in range(len(attention_index)):
# to find matmul in ffn
if idx != len(attention_index) - 1:
index = attention_index[idx + 1]
if index - 2 >= 0 and index - 1 >= 0:
Expand All @@ -1014,6 +1016,29 @@ def query_fw_capability(self, model):
index + block_len - 1 < len(attention_matmul):
ffn_matmul.append([attention_matmul[index + block_len - 2],
attention_matmul[index + block_len - 1]])
else:
# model is not optimized or Attention isn't fused,
# query MatMul, key MatMul and value MatMul are used as split to find FFN MatMul
qkv = self.pre_optimized_model.find_qkv_in_attention(find_all=True)
if len(qkv) != 0:
attention_starts = [nodes[0] for nodes in qkv]
attention_index = [np.where(np.array([n.name for n in attention_matmul]) \
== attention_start)[0].tolist()[0] \
for attention_start in attention_starts]
block_len = attention_index[1] - attention_index[0] if len(attention_index) > 2 else 4
for idx in range(len(attention_index)):
if idx != len(attention_index) - 1:
index = attention_index[idx + 1]
if index - 2 >= 0 and index - 1 >= 0:
ffn_matmul.append([attention_matmul[index - 2],
attention_matmul[index - 1]])
else:
index = attention_index[idx]
if index + block_len - 2 < len(attention_matmul) and \
index + block_len - 1 < len(attention_matmul):
ffn_matmul.append([attention_matmul[index + block_len - 2],
attention_matmul[index + block_len - 1]])

block_wise = []
for block in reversed(ffn_matmul):
node_info = []
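The new `else` branch applies the same windowing heuristic when `Attention` is not fused, using the query/key/value MatMuls returned by `find_qkv_in_attention` as block boundaries. A toy, standalone illustration of the index arithmetic (not the adaptor code; here the block length is taken from the gap between attention starts):

```python
# Ops of a 2-block transformer graph, in topological order.
ops = ["q_MatMul", "k_MatMul", "v_MatMul", "out_MatMul",   # attention block 0
       "ffn0_MatMul", "ffn1_MatMul",                       # FFN of block 0
       "q_MatMul", "k_MatMul", "v_MatMul", "out_MatMul",   # attention block 1
       "ffn2_MatMul", "ffn3_MatMul"]                       # FFN of block 1
attention_index = [0, 6]                             # where each block starts
block_len = attention_index[1] - attention_index[0]  # ops per block

ffn_matmul = []
for idx, start in enumerate(attention_index):
    if idx != len(attention_index) - 1:
        # The two MatMuls just before the next attention block are the FFN.
        nxt = attention_index[idx + 1]
        ffn_matmul.append([ops[nxt - 2], ops[nxt - 1]])
    elif start + block_len - 1 < len(ops):
        # Last block: take the two MatMuls at the end of its window.
        ffn_matmul.append([ops[start + block_len - 2],
                           ops[start + block_len - 1]])

print(ffn_matmul)  # [['ffn0_MatMul', 'ffn1_MatMul'], ['ffn2_MatMul', 'ffn3_MatMul']]
```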