Enable ONNXRT NLP example deberta-v3-base (#890)
Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored May 19, 2023
1 parent 53551c2 commit abac54e
Showing 18 changed files with 163 additions and 30 deletions.
4 changes: 3 additions & 1 deletion .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt
@@ -2607,4 +2607,6 @@ layoutlmv
funsd
layoutlmft
nielsr
HYPJUDY
DeBERTa
unilm
14 changes: 14 additions & 0 deletions examples/.config/model_params_onnxrt.json
@@ -811,6 +811,20 @@
"input_model": "/tf_dataset2/models/onnx/hf_layoutlmft/layoutlmft-model.onnx",
"main_script": "main.py",
"batch_size": 1
},
"hf_deberta_dynamic": {
"model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq_dynamic",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/hf_deberta/deberta-v3-base-mrpc.onnx",
"main_script": "main.py",
"batch_size": 1
},
"hf_deberta": {
"model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq_static",
"dataset_location": "/tf_dataset/pytorch/glue_data/MRPC",
"input_model": "/tf_dataset2/models/onnx/hf_deberta/deberta-v3-base-mrpc.onnx",
"main_script": "main.py",
"batch_size": 1
}
}
}
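For context, each entry in this file registers an example with the CI runner. A minimal sketch of looking a model up in this registry — the top-level `"onnxrt"` key and the relative file path are assumptions for illustration, not the actual harness code:

```python
import json

# Hypothetical lookup against the registry above; the top-level key and
# the path are assumptions, not part of the actual CI harness.
with open("examples/.config/model_params_onnxrt.json") as f:
    registry = json.load(f)["onnxrt"]

cfg = registry["hf_deberta_dynamic"]
print(cfg["model_src_dir"], cfg["input_model"], cfg["batch_size"])
```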
8 changes: 8 additions & 0 deletions examples/README.md
@@ -1186,6 +1186,14 @@ Intel® Neural Compressor validated examples with multiple compression techniques
<a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_static">qlinearops</a>
</td>
</tr>
<tr>
<td>DeBERTa v3 base MRPC (HuggingFace)</td>
<td>Natural Language Processing</td>
<td>Post-Training Dynamic / Static Quantization</td>
<td>
<a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_classification/quantization/ptq_static">qlinearops</a>
</td>
</tr>
<tr>
<td>Spanbert SQuAD (HuggingFace)</td>
<td>Natural Language Processing</td>
@@ -31,6 +31,7 @@ Supported model identifier from [huggingface.co](https://huggingface.co/):
| M-FAC/bert-mini-finetuned-mrpc |
| Intel/xlnet-base-cased-mrpc |
| Intel/bart-large-mrpc |
| Intel/deberta-v3-base-mrpc |

```bash
python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc \ # or other supported model identifier
```
@@ -10,9 +10,10 @@ def export_onnx_model(args, model):
'Intel/xlm-roberta-base-mrpc',
'Intel/camembert-base-mrpc',
'distilbert-base-uncased-finetuned-sst-2-english',
'Intel/xlnet-base-cased-mrpc',
'Intel/deberta-v3-base-mrpc']:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask']),
@@ -27,7 +28,7 @@ def export_onnx_model(args, model):
else:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64),
'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask'],
@@ -63,7 +64,9 @@ def export_onnx_model(args, model):
'Intel/electra-small-discriminator-mrpc',
'M-FAC/bert-mini-finetuned-mrpc',
'Intel/xlnet-base-cased-mrpc',
'Intel/bart-large-mrpc',
'Intel/deberta-v3-base-mrpc'
],
help='pretrained model name or path')
parser.add_argument(
'--max_len',
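Because DeBERTa-v3 takes only `input_ids` and `attention_mask` in this example (no `token_type_ids`), the new model is handled by the two-input branch above. A minimal standalone sketch of that export path, assuming `max_len=128` and an illustrative output filename:

```python
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification

# Two-input export, mirroring the branch above; max_len and the output
# filename are illustrative assumptions.
model_id = "Intel/deberta-v3-base-mrpc"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, config=AutoConfig.from_pretrained(model_id))

max_len = 128
inputs = {"input_ids": torch.ones(1, max_len, dtype=torch.int64),
          "attention_mask": torch.ones(1, max_len, dtype=torch.int64)}
symbolic_names = {0: "batch_size", 1: "max_seq_len"}  # variable-length axes

torch.onnx.export(model,
                  (inputs["input_ids"], inputs["attention_mask"]),
                  "deberta-v3-base-mrpc.onnx",
                  opset_version=14,
                  do_constant_folding=True,
                  input_names=["input_ids", "attention_mask"],
                  output_names=["logits"],
                  dynamic_axes={"input_ids": symbolic_names,
                                "attention_mask": symbolic_names})
```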
@@ -6,4 +6,5 @@ onnxruntime
coloredlogs
sympy
onnxruntime-extensions; python_version < '3.10'
numpy==1.23.5
sentencepiece
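`sentencepiece` is added because the DeBERTa-v3 tokenizer is SentencePiece-based. A quick sanity check (assumes Hugging Face Hub access):

```python
# Loading the tokenizer fails without the new sentencepiece dependency.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Intel/deberta-v3-base-mrpc")
print(tok("Hello, world!")["input_ids"])
```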
@@ -85,6 +85,10 @@ function run_benchmark {
model_name_or_path="Intel/bart-large-mrpc"
TASK_NAME='mrpc'
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -106,6 +106,12 @@ function run_tuning {
num_heads=16
hidden_size=4096
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
num_heads=12
hidden_size=768
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
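The hard-coded `num_heads=12` and `hidden_size=768` match the DeBERTa-v3-base architecture and can be cross-checked against the Hugging Face config; a small sanity check (not part of the example scripts, assumes Hub access):

```python
from transformers import AutoConfig

# Verify the tuning script's hard-coded shape parameters.
cfg = AutoConfig.from_pretrained("microsoft/deberta-v3-base")
print(cfg.num_attention_heads, cfg.hidden_size)  # expected: 12 768
```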
@@ -30,7 +30,8 @@ Supported model identifier from [huggingface.co](https://huggingface.co/):
| Intel/electra-small-discriminator-mrpc |
| M-FAC/bert-mini-finetuned-mrpc |
| Intel/xlnet-base-cased-mrpc |
| Intel/bart-large-mrpc |
| Intel/deberta-v3-base-mrpc |

```bash
python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc \ # or other supported model identifier
```
@@ -9,9 +9,11 @@ def export_onnx_model(args, model):
if args.model_name_or_path in ['Intel/roberta-base-mrpc',
'Intel/xlm-roberta-base-mrpc',
'Intel/camembert-base-mrpc',
'distilbert-base-uncased-finetuned-sst-2-english',
'Intel/xlnet-base-cased-mrpc',
'Intel/deberta-v3-base-mrpc']:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask']),
@@ -26,7 +28,7 @@ def export_onnx_model(args, model):
else:
inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64),
'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64),
'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64)}
torch.onnx.export(model, # model being run
(inputs['input_ids'], # model input (or a tuple for multiple inputs)
inputs['attention_mask'],
@@ -35,12 +37,12 @@ def export_onnx_model(args, model):
opset_version=14, # the ONNX version to export the model
do_constant_folding=True, # whether to execute constant folding
input_names=['input_ids', # the model's input names
'attention_mask',
'token_type_ids'],
output_names=['logits'],
dynamic_axes={'input_ids': symbolic_names, # variable length axes
'attention_mask' : symbolic_names,
'token_type_ids' : symbolic_names})
print("ONNX Model exported to {0}".format(args.output_model))

if __name__ == "__main__":
@@ -57,7 +59,14 @@ def export_onnx_model(args, model):
'distilbert-base-uncased-finetuned-sst-2-english',
'Alireza1044/albert-base-v2-sst2',
'philschmid/MiniLM-L6-H384-uncased-sst2',
'Intel/MiniLM-L12-H384-uncased-mrpc',
'bert-base-cased-finetuned-mrpc',
'Intel/electra-small-discriminator-mrpc',
'M-FAC/bert-mini-finetuned-mrpc',
'Intel/xlnet-base-cased-mrpc',
'Intel/bart-large-mrpc',
'Intel/deberta-v3-base-mrpc'
],
help='pretrained model name or path')
parser.add_argument(
'--max_len',
@@ -71,4 +80,8 @@ def export_onnx_model(args, model):
args.model_name_or_path,
config=AutoConfig.from_pretrained(args.model_name_or_path))

if args.model_name_or_path == 'Intel/bart-large-mrpc':
import os
os.system('python -m transformers.onnx --model=Intel/bart-large-mrpc --feature=sequence-classification bart-large-mrpc')
else:
export_onnx_model(args, model)
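After export, the resulting graph can be smoke-tested with onnxruntime. An illustrative check — the filename, sequence length, and batch size are assumptions:

```python
import numpy as np
import onnxruntime as ort

# Load the exported model and run one dummy batch through it.
sess = ort.InferenceSession("deberta-v3-base-mrpc.onnx",
                            providers=["CPUExecutionProvider"])
dummy = np.ones((1, 128), dtype=np.int64)
logits = sess.run(["logits"],
                  {"input_ids": dummy, "attention_mask": dummy})[0]
print(logits.shape)  # (1, 2) for the two MRPC labels
```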
@@ -409,15 +409,19 @@ def eval_func(model, *args):

from neural_compressor import quantization, PostTrainingQuantConfig
from neural_compressor.utils.constant import FP32
specific_quant_config = {}
if args.model_name_or_path == 'Intel/bart-large-mrpc':
fp32_op_names = ['/model/(en|de)coder/layers.*/fc(1|2)/MatMul']
specific_quant_config['op_name_dict'] = {op_name:FP32 for op_name in fp32_op_names}
elif args.model_name_or_path == 'Alireza1044/albert-base-v2-sst2':
fp32_op_names = ['Gemm_1410_MatMul', 'MatMul_(259|168)']
specific_quant_config['op_name_dict'] = {op_name:FP32 for op_name in fp32_op_names}
elif args.model_name_or_path == 'Intel/deberta-v3-base-mrpc':
specific_quant_config['op_type_dict'] = {'^((?!(MatMul|Gather)).)*$': FP32}
specific_quant_config['quant_level'] = 1
config = PostTrainingQuantConfig(approach='static',
quant_format=args.quant_format,
**specific_quant_config)
q_model = quantization.fit(model,
config,
eval_func=eval_func,
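The DeBERTa branch pins every op type except `MatMul` and `Gather` to FP32 via a negative-lookahead regex in `op_type_dict`, so only those two op types are quantized. A standalone illustration of how the pattern classifies op type names:

```python
import re

# Matches any op type whose name does NOT contain "MatMul" or "Gather";
# matched types are kept at FP32, so only MatMul and Gather get quantized.
pattern = re.compile(r'^((?!(MatMul|Gather)).)*$')
for op in ["MatMul", "Gather", "Add", "LayerNormalization", "Softmax"]:
    print(op, bool(pattern.match(op)))
# MatMul False, Gather False, Add True, LayerNormalization True, Softmax True
```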
@@ -6,4 +6,5 @@ onnxruntime
coloredlogs
sympy
onnxruntime-extensions; python_version < '3.10'
numpy==1.23.5
sentencepiece
@@ -85,6 +85,10 @@ function run_benchmark {
model_name_or_path="Intel/bart-large-mrpc"
TASK_NAME='mrpc'
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -109,6 +109,12 @@ function run_tuning {
num_heads=16
hidden_size=4096
fi
if [[ "${input_model}" =~ "deberta" ]]; then
model_name_or_path="microsoft/deberta-v3-base"
TASK_NAME='mrpc'
num_heads=12
hidden_size=768
fi

python main.py \
--model_name_or_path ${model_name_or_path} \
@@ -14,7 +14,7 @@ bash install_layoutlmft.sh
> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
## 2. Prepare Model
Fine-tune on FUNSD; refer to the [fine-tuning example on FUNSD](https://github.com/microsoft/unilm/tree/master/layoutlm#fine-tuning-example-on-funsd).

```bash
python main.py \
@@ -14,7 +14,7 @@ bash install_layoutlmft.sh
> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
## 2. Prepare Model
Fine-tune on FUNSD; refer to the [fine-tuning example on FUNSD](https://github.com/microsoft/unilm/tree/master/layoutlm#fine-tuning-example-on-funsd).

```bash
python main.py \
27 changes: 26 additions & 1 deletion neural_compressor/adaptor/onnxrt.py
@@ -995,14 +995,16 @@ def query_fw_capability(self, model):

ffn_matmul = []
attention_matmul_optype = [node.op_type for node in attention_matmul]
# find MatMul ops in the feed-forward network (FFN) structure, which mainly appears in transformer-based NLP models
if len(attention_matmul) > 0 and 'Attention' in attention_matmul_optype:
# model is optimized and Attention is fused,
# index of Attention is used as split to find FFN MatMul
first_attention_index = attention_matmul_optype.index('Attention')
attention_matmul_optype = attention_matmul_optype[first_attention_index:]
attention_matmul = attention_matmul[first_attention_index:]
attention_index = list(np.where(np.array(attention_matmul_optype) == 'Attention')[0])
block_len = attention_index[1] - attention_index[0] if len(attention_index) > 2 else 4
for idx in range(len(attention_index)):
# to find matmul in ffn
if idx != len(attention_index) - 1:
index = attention_index[idx + 1]
if index - 2 >= 0 and index - 1 >= 0:
Expand All @@ -1014,6 +1016,29 @@ def query_fw_capability(self, model):
index + block_len - 1 < len(attention_matmul):
ffn_matmul.append([attention_matmul[index + block_len - 2],
attention_matmul[index + block_len - 1]])
else:
# model is not optimized or Attention isn't fused,
# query MatMul, key MatMul and value MatMul are used as split to find FFN MatMul
qkv = self.pre_optimized_model.find_qkv_in_attention(find_all=True)
if len(qkv) != 0:
attention_starts = [nodes[0] for nodes in qkv]
attention_index = [np.where(np.array([n.name for n in attention_matmul]) \
== attention_start)[0].tolist()[0] \
for attention_start in attention_starts]
block_len = attention_index[1] - attention_index[0] if len(attention_index) > 2 else 4
for idx in range(len(attention_index)):
if idx != len(attention_index) - 1:
index = attention_index[idx + 1]
if index - 2 >= 0 and index - 1 >= 0:
ffn_matmul.append([attention_matmul[index - 2],
attention_matmul[index - 1]])
else:
index = attention_index[idx]
if index + block_len - 2 < len(attention_matmul) and \
index + block_len - 1 < len(attention_matmul):
ffn_matmul.append([attention_matmul[index + block_len - 2],
attention_matmul[index + block_len - 1]])

block_wise = []
for block in reversed(ffn_matmul):
node_info = []
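The new `else` branch applies the same windowing heuristic when `Attention` is not fused, using the query/key/value MatMuls returned by `find_qkv_in_attention` as block boundaries. A toy, standalone illustration of the index arithmetic (not the adaptor code; here the block length is taken from the gap between attention starts):

```python
# Ops of a 2-block transformer graph, in topological order.
ops = ["q_MatMul", "k_MatMul", "v_MatMul", "out_MatMul",   # attention block 0
       "ffn0_MatMul", "ffn1_MatMul",                       # FFN of block 0
       "q_MatMul", "k_MatMul", "v_MatMul", "out_MatMul",   # attention block 1
       "ffn2_MatMul", "ffn3_MatMul"]                       # FFN of block 1
attention_index = [0, 6]                             # where each block starts
block_len = attention_index[1] - attention_index[0]  # ops per block

ffn_matmul = []
for idx, start in enumerate(attention_index):
    if idx != len(attention_index) - 1:
        # The two MatMuls just before the next attention block are the FFN.
        nxt = attention_index[idx + 1]
        ffn_matmul.append([ops[nxt - 2], ops[nxt - 1]])
    elif start + block_len - 1 < len(ops):
        # Last block: take the two MatMuls at the end of its window.
        ffn_matmul.append([ops[start + block_len - 2],
                           ops[start + block_len - 1]])

print(ffn_matmul)  # [['ffn0_MatMul', 'ffn1_MatMul'], ['ffn2_MatMul', 'ffn3_MatMul']]
```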