diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json index 2480fe54c7b..e0b08c699ad 100644 --- a/examples/.config/model_params_onnxrt.json +++ b/examples/.config/model_params_onnxrt.json @@ -46,7 +46,7 @@ "new_benchmark": true }, "bert_base_MRPC_static": { - "model_src_dir": "language_translation/bert/quantization/ptq", + "model_src_dir": "nlp/bert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", "yaml": "bert_static.yaml", @@ -55,7 +55,7 @@ "new_benchmark": true }, "bert_base_MRPC_dynamic": { - "model_src_dir": "language_translation/bert/quantization/ptq", + "model_src_dir": "nlp/bert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", "yaml": "bert_dynamic.yaml", @@ -64,7 +64,7 @@ "new_benchmark": true }, "distilbert_base_MRPC": { - "model_src_dir": "language_translation/distilbert/quantization/ptq", + "model_src_dir": "nlp/distilbert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx", "yaml": "distilbert.yaml", @@ -73,7 +73,7 @@ "new_benchmark": true }, "mobilebert_MRPC": { - "model_src_dir": "language_translation/mobilebert/quantization/ptq", + "model_src_dir": "nlp/mobilebert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx", "yaml": "mobilebert.yaml", @@ -82,7 +82,7 @@ "new_benchmark": true }, "roberta_base_MRPC": { - "model_src_dir": "language_translation/roberta/quantization/ptq", + "model_src_dir": "nlp/roberta/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx", "yaml": "roberta.yaml", @@ -118,7 +118,7 @@ "new_benchmark": true }, "bert_squad_model_zoo": { - "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/squad", "input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx", "yaml": "bert.yaml", @@ -127,7 +127,7 @@ "new_benchmark": true }, "mobilebert_squad_mlperf": { - "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/squad", "input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf.onnx", "yaml": "mobilebert.yaml", @@ -136,7 +136,7 @@ "new_benchmark": true }, "gpt2_lm_head_wikitext_model_zoo": { - "model_src_dir": "language_translation/onnx_model_zoo/gpt2/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/gpt2/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/wikitext/wikitext-2-raw/", "input_model": "/tf_dataset2/models/onnx/gpt2/gpt2_lm_head_wikitext_model_zoo.onnx", "yaml": "gpt2.yaml", @@ -352,7 +352,7 @@ "new_benchmark": true }, "bert_base_MRPC_static_qdq": { - "model_src_dir": "language_translation/bert/quantization/ptq", + "model_src_dir": "nlp/bert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/bert_base_MRPC/bert.onnx", "yaml": "bert_qdq.yaml", @@ -361,7 +361,7 @@ "new_benchmark": true }, 
"distilbert_base_MRPC_qdq": { - "model_src_dir": "language_translation/distilbert/quantization/ptq", + "model_src_dir": "nlp/distilbert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/distilbert_base_MRPC/distilbert-base-uncased.onnx", "yaml": "distilbert_qdq.yaml", @@ -370,7 +370,7 @@ "new_benchmark": true }, "mobilebert_MRPC_qdq": { - "model_src_dir": "language_translation/mobilebert/quantization/ptq", + "model_src_dir": "nlp/mobilebert/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/mobilebert_MRPC/mobilebert-uncased.onnx", "yaml": "mobilebert_qdq.yaml", @@ -379,7 +379,7 @@ "new_benchmark": true }, "roberta_base_MRPC_qdq": { - "model_src_dir": "language_translation/roberta/quantization/ptq", + "model_src_dir": "nlp/roberta/quantization/ptq", "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", "input_model": "/tf_dataset2/models/onnx/roberta_base_MRPC/roberta-base.onnx", "yaml": "roberta_qdq.yaml", @@ -415,7 +415,7 @@ "new_benchmark": true }, "bert_squad_model_zoo_qdq": { - "model_src_dir": "language_translation/onnx_model_zoo/bert-squad/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/bert-squad/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/squad", "input_model": "/tf_dataset2/models/onnx/bert_squad/bert_squad_model_zoo.onnx", "yaml": "bert_qdq.yaml", @@ -424,7 +424,7 @@ "new_benchmark": true }, "mobilebert_squad_mlperf_qdq": { - "model_src_dir": "language_translation/onnx_model_zoo/mobilebert/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/mobilebert/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/squad", "input_model": "/tf_dataset2/models/onnx/mobilebert_squad/mobilebert_squad_mlperf-13.onnx", "yaml": "mobilebert_qdq.yaml", @@ -631,13 +631,103 @@ "new_benchmark": true }, "BiDAF": { - "model_src_dir": "language_translation/onnx_model_zoo/BiDAF/quantization/ptq", + "model_src_dir": "nlp/onnx_model_zoo/BiDAF/quantization/ptq", "dataset_location": "/tf_dataset2/datasets/squad/dev-v1.1.json", "input_model": "/tf_dataset2/models/onnx/BiDAF/bidaf-11.onnx", "yaml": "bidaf.yaml", "strategy": "basic", "batch_size": 1, "new_benchmark": true + }, + "hf_bert-base-uncased_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/hf_bert-base-uncased_dynamic/bert-base-uncased-mrpc.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_roberta-base_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/hf_roberta-base_dynamic/roberta-base-mrpc.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_xlm-roberta-base_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/hf_xlm-roberta-base_dynamic/xlm-roberta-base-mrpc.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_camembert-base_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": 
"/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/hf_camembert-base_dynamic/camembert-base-mrpc.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_MiniLM-L12-H384-uncased_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/MRPC", + "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L12-H384-uncased_dynamic/MiniLM-L12-H384-uncased-mrpc.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_distilbert-base-uncased_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/", + "input_model": "/tf_dataset2/models/onnx/hf_distilbert-base-uncased_dynamic/distilbert-base-uncased-finetuned-sst-2-english.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_albert-base-v2_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/", + "input_model": "/tf_dataset2/models/onnx/hf_albert-base-v2_dynamic/albert-base-v2-sst2.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_MiniLM-L6-H384-uncased_dynamic": { + "model_src_dir": "nlp/huggingface_model/text_classification/quantization/ptq", + "dataset_location": "/tf_dataset/pytorch/glue_data/SST-2/", + "input_model": "/tf_dataset2/models/onnx/hf_MiniLM-L6-H384-uncased_dynamic/MiniLM-L6-H384-uncased-sst2.onnx", + "yaml": "glue_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_spanbert_dynamic": { + "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq", + "dataset_location": "/tf_dataset2/datasets/squad", + "input_model": "/tf_dataset2/models/onnx/hf_spanbert_dynamic/spanbert-finetuned-squadv1.onnx", + "yaml": "qa_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true + }, + "hf_bert-base-multilingual-cased_dynamic": { + "model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq", + "dataset_location": "/tf_dataset2/datasets/squad", + "input_model": "/tf_dataset2/models/onnx/hf_bert-base-multilingual-cased_dynamic/bert-base-multilingual-cased-finetuned-squad.onnx", + "yaml": "qa_dynamic.yaml", + "strategy": "basic", + "batch_size": 1, + "new_benchmark": true } } } diff --git a/examples/README.md b/examples/README.md index 7342bbe1e8a..0698d8e7d41 100644 --- a/examples/README.md +++ b/examples/README.md @@ -855,55 +855,115 @@ IntelĀ® Neural Compressor validated examples with multiple compression technique BERT base MRPC Natural Language Processing Post-Training Static Quantization - integerops / qdq + integerops / qdq BERT base MRPC Natural Language Processing Post-Training Dynamic Quantization - integerops + integerops DistilBERT base MRPC Natural Language Processing Post-Training Dynamic / Static Quantization - integerops / qdq + integerops / qdq Mobile bert MRPC Natural Language Processing Post-Training Dynamic / Static Quantization - integerops / qdq + integerops / qdq Roberta base MRPC Natural Language Processing Post-Training Dynamic / Static Quantization - integerops / qdq + integerops / qdq BERT SQuAD Natural Language Processing Post-Training Dynamic / Static Quantization - integerops / qdq + 
integerops / qdq GPT2 lm head WikiText Natural Language Processing Post-Training Dynamic Quantization - integerops + integerops MobileBERT SQuAD MLPerf Natural Language Processing Post-Training Dynamic / Static Quantization - integerops / qdq + integerops / qdq BiDAF Natural Language Processing Post-Training Dynamic Quantization - integerops + integerops + + + BERT base uncased MRPC (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Roberta base MRPC (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + XLM Roberta base MRPC (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Camembert base MRPC (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + MiniLM L12 H384 uncased MRPC (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Distilbert base uncased SST-2 (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Albert base v2 SST-2 (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + MiniLM L6 H384 uncased SST-2 (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Spanbert SQuAD (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq + + + Bert base multilingual cased SQuAD (HuggingFace) + Natural Language Processing + Post-Training Static Quantization + qdq SSD MobileNet V1 diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/README.md b/examples/onnxrt/nlp/bert/quantization/ptq/README.md similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/README.md rename to examples/onnxrt/nlp/bert/quantization/ptq/README.md diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_dynamic.yaml rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_dynamic.yaml diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_qdq.yaml rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_qdq.yaml diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml b/examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/bert_static.yaml rename to examples/onnxrt/nlp/bert/quantization/ptq/bert_static.yaml diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/export.py b/examples/onnxrt/nlp/bert/quantization/ptq/export.py similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/export.py rename to examples/onnxrt/nlp/bert/quantization/ptq/export.py diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/main.py b/examples/onnxrt/nlp/bert/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/main.py rename to examples/onnxrt/nlp/bert/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh 
b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_data.sh rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_data.sh diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/prepare_model.sh rename to examples/onnxrt/nlp/bert/quantization/ptq/prepare_model.sh diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/bert/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/bert/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/bert/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/bert/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert.yaml rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert.yaml diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml b/examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/distilbert_qdq.yaml rename to examples/onnxrt/nlp/distilbert/quantization/ptq/distilbert_qdq.yaml diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/export.py similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/export.py rename to examples/onnxrt/nlp/distilbert/quantization/ptq/export.py diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py b/examples/onnxrt/nlp/distilbert/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/main.py rename to examples/onnxrt/nlp/distilbert/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_data.sh rename to examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_data.sh diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/prepare_model.sh rename to 
examples/onnxrt/nlp/distilbert/quantization/ptq/prepare_model.sh diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md b/examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/readme.md rename to examples/onnxrt/nlp/distilbert/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/distilbert/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/distilbert/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/distilbert/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md new file mode 100644 index 00000000000..55538ff591c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/README.md @@ -0,0 +1,43 @@ +# Evaluate performance of ONNX Runtime (Huggingface Question Answering) +>ONNX Runtime quantization is under active development. Please use 1.6.0+ to get more quantization support. + +This example loads a Huggingface question answering model and confirms its accuracy and speed on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) task. + +### Environment +Please use the latest onnx and onnxruntime versions. + +### Prepare dataset +Download the SQuAD dataset from the [SQuAD dataset link](https://rajpurkar.github.io/SQuAD-explorer/).
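As a quick sanity check, note that the example's main.py defaults to `dataset_name='squad'` and reads the evaluation split through the `datasets` library; the snippet below is a minimal sketch of that lookup (the split and field names follow the standard SQuAD v1.1 schema and are shown for illustration only).

```python
# Minimal sketch: main.py defaults to dataset_name='squad', so the same
# validation split can be pulled through the datasets library for inspection.
from datasets import load_dataset

squad = load_dataset("squad", split="validation")
sample = squad[0]
print(sample["question"])          # question text
print(sample["context"][:80])      # beginning of the supporting context
print(sample["answers"]["text"])   # gold answer spans
```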
+ +### Prepare model + +Supported model identifiers from [huggingface.co](https://huggingface.co/): + +| Model Identifier | +|:-----------------------------------------------:| +| mrm8488/spanbert-finetuned-squadv1 | +| salti/bert-base-multilingual-cased-finetuned-squad | + + +```bash +python export.py --model_name_or_path=mrm8488/spanbert-finetuned-squadv1 \ # or other supported model identifier +``` + +### Quantization + +Dynamic quantization: + +```bash +bash run_tuning.sh --input_model=/path/to/model \ # model path as *.onnx + --output_model=/path/to/model_tune \ + --config=qa_dynamic.yaml +``` + +### Benchmark + +```bash +bash run_benchmark.sh --input_model=/path/to/model \ # model path as *.onnx + --config=qa_dynamic.yaml \ + --mode=performance # or accuracy +``` + diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py new file mode 100644 index 00000000000..08824f90405 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/export.py @@ -0,0 +1,50 @@ +import argparse + +import torch +from transformers import AutoConfig, AutoModelForQuestionAnswering + +def export_onnx_model(args, model): + with torch.no_grad(): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['token_type_ids'], + inputs['attention_mask']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=11, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'token_type_ids', + 'attention_mask'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'token_type_ids' : symbolic_names, + 'attention_mask' : symbolic_names}) + print("ONNX Model exported to {0}".format(args.output_model)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Export huggingface onnx model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['mrm8488/spanbert-finetuned-squadv1', + 'salti/bert-base-multilingual-cased-finetuned-squad'], + help='pretrained model name or path ') + parser.add_argument( + '--max_len', + type=int, + default=512, + help='Maximum length of the sentence pairs') + args = parser.parse_args() + args.output_model = args.model_name_or_path.split('/')[1] + '.onnx' + + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + config=AutoConfig.from_pretrained(args.model_name_or_path)) + + export_onnx_model(args, model) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py new file mode 100644 index 00000000000..1866be2d602 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/main.py @@ -0,0 +1,614 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for question answering using a slightly adapted version of the šŸ¤— Trainer. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +from datasets import load_dataset, load_metric + +from torch.utils.data import Dataset, DataLoader + +import sys +import onnx +import onnxruntime as ort +import numpy as np +import transformers +from trainer_qa import QuestionAnsweringTrainer +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PreTrainedTokenizerFast, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version +from utils_qa import postprocess_qa_predictions + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.22.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") + +logger = logging.getLogger(__name__) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": ( + "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." 
+ ) + }, + ) + model_path: str = field( + default=None, + metadata={"help": ("onnx model path")}, + ) + tune: bool = field( + default=False, + metadata={"help": ("INC tune")}, + ) + benchmark: bool = field( + default=False, + metadata={"help": ("INC benchmark")}, + ) + mode: str = field( + default='performance', + metadata={"help": ("INC benchmark mode")}, + ) + config: str = field( + default='bert-base-multilingual-cased-static.yaml', + metadata={"help": ("INC config")}, + ) + save_path: str = field( + default=None, + metadata={"help": ("onnx int8 model path")}, + ) + num_heads: int = field( + default=12, + metadata={"help": ("onnx model optimize num_heads")}, + ) + hidden_size: int = field( + default=768, + metadata={"help": ("onnx model optimize hidden_size")}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default='squad', metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=512, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + ) + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": ( + "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when" + " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)." + ) + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + ) + }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": ( + "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. 
" + "Only useful when `version_2_with_negative=True`." + ) + }, + ) + doc_stride: int = field( + default=256, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": ( + "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + ) + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + +class SquadDataset(Dataset): + def __init__(self, dataloader, bs=1): + self.dataloader = dataloader + self.bs = bs + self.input_ids = [] + self.token_type_ids = [] + self.attention_mask = [] + for idx, inputs in enumerate(self.dataloader): + self.input_ids.append(np.array(inputs['input_ids'], dtype=np.int64)) + self.token_type_ids.append(np.array(inputs['token_type_ids'], dtype=np.int64)) + self.attention_mask.append(np.array(inputs['attention_mask'], dtype=np.int64)) + + def __getitem__(self, index): + return (self.input_ids[index:index + self.bs][0][0], self.token_type_ids[index:index + self.bs][0][0], self.attention_mask[index:index + self.bs][0][0]), 0 + # return (self.input_ids[index:index + self.bs][0], self.attention_mask[index:index + self.bs][0], self.token_type_ids[index:index + self.bs][0]), 0 + + def __len__(self): + assert len(self.input_ids) == len(self.attention_mask) + assert len(self.input_ids) == len(self.token_type_ids) + return len(self.input_ids) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + training_args.do_eval = True + training_args.per_device_eval_batch_size = 1 + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. 
+ send_example_telemetry("run_qa", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + print(type(raw_datasets)) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + raw_datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. 
+ + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForQuestionAnswering.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" + " this requirement" + ) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names + else: + column_names = raw_datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + # Validation preprocessing + def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) + eval_examples = eval_examples.select(range(max_eval_samples)) + # Validation Feature Creation + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + + # Data collator + # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data + # collator. + data_collator = ( + default_data_collator + if data_args.pad_to_max_length + else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. 
+ predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + log_level=log_level, + prefix=stage, + ) + # Format the result to the format the metric expects. + if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + # Initialize our Trainer + trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + + eval_dataloader = trainer.get_dataloader(eval_dataset) + + def eval_func(model, *args): + logger.info("*** Evaluate ***") + metrics = trainer.evaluate(onnx_model=model) + print('eval_func', metrics) + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + return metrics['eval_f1'] + + if model_args.tune: + from onnxruntime.transformers import optimizer + from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions + opt_options = BertOptimizationOptions('bert') + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + model_args.model_path, + 'bert', + num_heads=model_args.num_heads, + hidden_size=model_args.hidden_size, + optimization_options=opt_options) + model = model_optimizer.model + + b_dataloader = SquadDataset(eval_dataloader) + b_dataloader = DataLoader(b_dataloader) + from neural_compressor.experimental import Quantization, common + quantize = Quantization(model_args.config) + quantize.model = common.Model(model) + quantize.calib_dataloader = b_dataloader + quantize.eval_func = eval_func + q_model = quantize() + q_model.save(model_args.save_path) + + if model_args.benchmark: + from neural_compressor.experimental import Benchmark, common + model = onnx.load(model_args.model_path) + if model_args.mode == 'performance': + from neural_compressor.data import DATALOADERS, DATASETS + session = ort.InferenceSession(model_args.model_path, None) + input_tensors = session.get_inputs() + shape = [] + for i in range(len(input_tensors)): + shape.append((1, 512)) + onnx_datasets = DATASETS('onnxrt_integerops') + dummy_dataset = onnx_datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True) + evaluator = Benchmark(model_args.config) + evaluator.model = common.Model(model) + evaluator.b_dataloader = common.DataLoader(dummy_dataset) + 
evaluator(model_args.mode) + elif model_args.mode == 'accuracy': + b_dataloader = SquadDataset(eval_dataloader) + b_dataloader = DataLoader(b_dataloader) + evaluator = Benchmark(model_args.config) + evaluator.b_dataloader = b_dataloader + evaluator.b_func = eval_func + evaluator.model = common.Model(model) + evaluator(model_args.mode) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml new file mode 100644 index 00000000000..e76dfbc315c --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/qa_dynamic.yaml @@ -0,0 +1,36 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: 1.0 + +model: # mandatory. used to specify model specific information. + name: question_answering + framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. + +evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. + performance: # optional. used to benchmark performance of passing model. + warmup: 0 + iteration: 100 + configs: + cores_per_instance: 28 + num_of_instance: 1 + +quantization: + approach: post_training_dynamic_quant # optional. default value is post_training_static_quant. + +tuning: + accuracy_criterion: + relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. + random_seed: 9527 # optional. random seed for deterministic tuning. 
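The config above selects the `onnxrt_integerops` backend with `approach: post_training_dynamic_quant` and allows a 1% relative accuracy drop. The sketch below condenses how main.py hands this yaml to the `neural_compressor.experimental` API; the model path and the evaluation callback are placeholders, not part of the example itself.

```python
# Condensed sketch of the tuning path in main.py (placeholder paths/eval function).
import onnx
from neural_compressor.experimental import Quantization, common

def eval_func(model):
    # Placeholder: main.py evaluates the ONNX model with the SQuAD F1 metric here.
    return 1.0

fp32_model = onnx.load("spanbert-finetuned-squadv1.onnx")  # produced by export.py
quantizer = Quantization("qa_dynamic.yaml")    # the config shown above
quantizer.model = common.Model(fp32_model)
quantizer.eval_func = eval_func                # drives the accuracy_criterion check
q_model = quantizer()                          # returns the dynamically quantized model
q_model.save("spanbert-finetuned-squadv1-int8.onnx")
```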
\ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt new file mode 100644 index 00000000000..30412bea132 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/requirements.txt @@ -0,0 +1,7 @@ +datasets +onnx +onnxruntime +onnxruntime-extensions; python_version < '3.10' +transformers==4.21.0 +torch +tensorboard \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh new file mode 100644 index 00000000000..2eef1e0e4b3 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_benchmark.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --config=*) + config=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + + if [[ "${input_model}" =~ "spanbert" ]]; then + model_name_or_path="mrm8488/spanbert-finetuned-squadv1" + elif [[ "${input_model}" =~ "bert-base" ]]; then + model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad" + fi + + python main.py \ + --model_path ${input_model} \ + --config ${config} \ + --mode=${mode} \ + --model_name_or_path=${model_name_or_path} \ + --output_dir './output' \ + --benchmark + +} + +main "$@" \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh new file mode 100644 index 00000000000..9e0eb872250 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/run_tuning.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --config=*) + config=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "spanbert" ]]; then + model_name_or_path="mrm8488/spanbert-finetuned-squadv1" + num_heads=12 + hidden_size=768 + elif [[ "${input_model}" =~ "bert-base" ]]; then + model_name_or_path="salti/bert-base-multilingual-cased-finetuned-squad" + num_heads=12 + hidden_size=768 + fi + + python main.py \ + --model_path ${input_model} \ + --save_path ${output_model} \ + --config ${config} \ + --output_dir './output' \ + --model_name_or_path=${model_name_or_path} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} \ + --tune +} + +main "$@" \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py new file mode 100644 index 00000000000..2da65c8a9f0 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/trainer_qa.py @@ -0,0 +1,489 @@ +# coding=utf-8 +# Copyright 
2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A subclass of `Trainer` specific to Question-Answering tasks +""" + +from transformers import Trainer, is_torch_tpu_available +from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled +from transformers.trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + BestRun, + EvalLoopOutput, + EvalPrediction, + FSDPOption, + HPSearchBackend, + HubStrategy, + IntervalStrategy, + PredictionOutput, + RemoveColumnsCollator, + ShardedDDPOption, + TrainerMemoryTracker, + TrainOutput, + default_compute_objective, + default_hp_space, + denumpify_detensorize, + enable_full_determinism, + find_executable_batch_size, + get_last_checkpoint, + has_length, + number_of_arguments, + seed_worker, + set_seed, + speed_metrics, +) +from transformers.utils import ( + is_sagemaker_mp_enabled, + is_torch_tpu_available, + logging, +) +from transformers.trainer_pt_utils import ( + IterableDatasetShard, + LabelSmoother, + LengthGroupedSampler, + SequentialDistributedSampler, + ShardSampler, + distributed_broadcast_scalars, + distributed_concat, + find_batch_size, + get_module_class_from_name, + get_parameter_names, + nested_concat, + nested_detach, + nested_numpify, + nested_truncate, + nested_xla_mesh_reduce, + reissue_pt_warnings +) +import onnxruntime +import onnx +from torch.utils.data import DataLoader +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, NamedTuple +import numpy as np + +import torch +from torch import nn + +logger = logging.get_logger(__name__) + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + from smdistributed.modelparallel import __version__ as SMP_VERSION + + IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") + + from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat + + +if is_torch_tpu_available(check_device=False): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + import torch_xla.distributed.parallel_loader as pl + +def has_length(dataset): + """ + Checks if the dataset implements __len__() and it doesn't raise an error + """ + try: + return len(dataset) is not None + except TypeError: + # TypeError: len() of unsized object + return False + +class EvalLoopOutput(NamedTuple): + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]] + metrics: Optional[Dict[str, float]] + num_samples: Optional[int] + +class QuestionAnsweringTrainer(Trainer): + def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def get_dataloader(self, eval_dataset): + return self.get_eval_dataloader(eval_dataset) + + def evaluate(self, onnx_model, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = 
"eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.evaluation_loop + print('eval_dataloader', type(eval_dataloader)) + print('onnx_model', type(onnx_model)) + try: + output = eval_loop( + dataloader=eval_dataloader, + description="Evaluation", + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + onnx_model=onnx_model, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) + return metrics + + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + onnx_model: onnx.onnx_ml_pb2.ModelProto = None, + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + Works both with or without labels. 
+ """ + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + # if eval is called w/o train init deepspeed here + if args.deepspeed and not self.deepspeed: + + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval + # from the checkpoint eventually + deepspeed_engine, _, _ = deepspeed_init( + self, num_training_steps=0, resume_from_checkpoint=None, inference=True + ) + self.model = deepspeed_engine.module + self.model_wrapped = deepspeed_engine + self.deepspeed = deepspeed_engine + + model = self._wrap_model(self.model, training=False, dataloader=dataloader) + + # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called + # while ``train`` is running, cast it to the right dtype first and then put on device + if not self.is_in_train: + if args.fp16_full_eval: + model = model.to(dtype=torch.float16, device=args.device) + elif args.bf16_full_eval: + model = model.to(dtype=torch.bfloat16, device=args.device) + + batch_size = self.args.eval_batch_size + + logger.info(f"***** Running {description} *****") + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = getattr(dataloader, "dataset", None) + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device) + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + inputs_host = None + + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + all_inputs = None + # Will be useful when we have an iterable dataset so don't know its length. + + onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString(), None) + observed_num_examples = 0 + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. 
+ if batch_size is None: + batch_size = observed_batch_size + + # Prediction step + loss, logits, labels = self.prediction_step(onnx_session, model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None + + if is_torch_tpu_available(): + xm.mark_step() + + # Update containers on host + if loss is not None: + losses = self._nested_gather(loss.repeat(batch_size)) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_decode = self._pad_across_processes(inputs_decode) + inputs_decode = self._nested_gather(inputs_decode) + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, inputs_host, labels_host = None, None, None, None + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check 
for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"): + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. + if all_losses is not None: + all_losses = all_losses[:num_samples] + if all_preds is not None: + all_preds = nested_truncate(all_preds, num_samples) + if all_labels is not None: + all_labels = nested_truncate(all_labels, num_samples) + if all_inputs is not None: + all_inputs = nested_truncate(all_inputs, num_samples) + + # Metrics! + if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) + + def prediction_step( + self, + onnx_session, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + Subclass and override to inject custom behavior. + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = all(inputs.get(k) is not None for k in self.label_names) + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
+ if has_labels: + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with torch.no_grad(): + if is_sagemaker_mp_enabled(): + raw_outputs = smp_forward_only(model, inputs) + if has_labels: + if isinstance(raw_outputs, dict): + loss_mb = raw_outputs["loss"] + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"]) + else: + loss_mb = raw_outputs[0] + logits_mb = raw_outputs[1:] + + loss = loss_mb.reduce_mean().detach().cpu() + logits = smp_nested_concat(logits_mb) + else: + loss = None + if isinstance(raw_outputs, dict): + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys) + else: + logits_mb = raw_outputs + logits = smp_nested_concat(logits_mb) + else: + if has_labels: + with self.compute_loss_context_manager(): + loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) + else: + logits = outputs[1:] + else: + loss = None + with self.compute_loss_context_manager(): + # Feed each ONNX graph input with the matching field from the batch. + data = {"input_ids": np.array(inputs['input_ids'], dtype=np.int64), + "attention_mask": np.array(inputs['attention_mask'], dtype=np.int64), + "token_type_ids": np.array(inputs['token_type_ids'], dtype=np.int64)} + outputs2 = onnx_session.run(None, data) + logits2 = tuple((torch.from_numpy(outputs2[0]), torch.from_numpy(outputs2[1]))) + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + logits2 = nested_detach(logits2) + return (loss, logits2, labels) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py new file mode 100644 index 00000000000..96af7f1d6bd --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq/utils_qa.py @@ -0,0 +1,440 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. +""" +import collections +import json +import logging +import os +from typing import Optional, Tuple + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts.
This is the base postprocessing functions for models that only return start and end logits. + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. 
+ start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative and min_null_prediction is not None: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if ( + version_2_with_negative + and min_null_prediction is not None + and not any(p["offsets"] == (0, 0) for p in predictions) + ): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). 
+ scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + +def postprocess_qa_predictions_with_beam_search( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + start_n_top: int = 5, + end_n_top: int = 5, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the + original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as + cls token predictions. + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. 
+ version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + start_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. + end_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. 
+ token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. + for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + + # Don't consider answers with a length negative or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + # Without predictions min_null_score is going to be None and None will cause an exception later + min_null_score = -2e-6 + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. 
+ if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md new file mode 100644 index 00000000000..0afa7d99629 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/README.md @@ -0,0 +1,56 @@ +# Evaluate performance of ONNX Runtime (Huggingface Text Classification) +>ONNX Runtime quantization is under active development. Please use 1.6.0+ to get more quantization support. + +This example loads a Huggingface text classification model and confirms its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). + +### Environment +Please use the latest onnx and onnxruntime versions. + +### Prepare dataset +Download the GLUE data with the `prepare_data.sh` script.
+ +```shell +export GLUE_DIR=/path/to/glue_data +export TASK_NAME=MRPC # or SST + +bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME +``` + +### Prepare model + +Supported model identifier from [huggingface.co](https://huggingface.co/): + +| Model Identifier | +|:-----------------------------------------------:| +| Intel/bert-base-uncased-mrpc | +| Intel/roberta-base-mrpc | +| Intel/xlm-roberta-base-mrpc | +| Intel/camembert-base-mrpc | +| distilbert-base-uncased-finetuned-sst-2-english | +| Alireza1044/albert-base-v2-sst2 | +| Intel/MiniLM-L12-H384-uncased-mrpc | +| philschmid/MiniLM-L6-H384-uncased-sst2 | + +```bash +python export.py --model_name_or_path=Intel/bert-base-uncased-mrpc \ # or other supported model identifier +``` + +### Quantization + +Quantize model with dynamic quantization: + +```bash +bash run_tuning.sh --config=glue_dynamic.yaml \ + --input_model=path/to/model \ # model path as *.onnx + --output_model=path/to/model_tune \ # model path as *.onnx + --data_path=path/to/glue/data +``` + +### Benchmark + +```bash +bash run_benchmark.sh --config=glue_dynamic.yaml \ + --input_model=path/to/model \ # model path as *.onnx + --data_path=path/to/glue/data \ + --mode=performance # or accuracy +``` diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py new file mode 100644 index 00000000000..f2a38e747b3 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/export.py @@ -0,0 +1,72 @@ +import argparse + +import torch +from transformers import AutoConfig, AutoModelForSequenceClassification + +def export_onnx_model(args, model): + with torch.no_grad(): + symbolic_names = {0: 'batch_size', 1: 'max_seq_len'} + if args.model_name_or_path in ['Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english']: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['attention_mask']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'attention_mask'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'attention_mask' : symbolic_names}) + else: + inputs = {'input_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'token_type_ids': torch.ones(1, args.max_len, dtype=torch.int64), + 'attention_mask': torch.ones(1, args.max_len, dtype=torch.int64)} + torch.onnx.export(model, # model being run + (inputs['input_ids'], # model input (or a tuple for multiple inputs) + inputs['token_type_ids'], + inputs['attention_mask']), + args.output_model, # where to save the model (can be a file or file-like object) + opset_version=14, # the ONNX version to export the model + do_constant_folding=True, # whether to execute constant folding + input_names=['input_ids', # the model's input names + 'token_type_ids', + 'attention_mask'], + dynamic_axes={'input_ids': symbolic_names, # variable length axes + 'token_type_ids' : symbolic_names, + 'attention_mask' : symbolic_names}) + print("ONNX Model exported to 
{0}".format(args.output_model)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Export huggingface onnx model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 'Intel/MiniLM-L12-H384-uncased-mrpc'], + help='pretrained model name or path') + parser.add_argument( + '--max_len', + type=int, + default=128, + help='Maximum length of the sentence pairs') + args = parser.parse_args() + args.output_model = args.model_name_or_path.split('/')[-1] + '.onnx' + + model = AutoModelForSequenceClassification.from_pretrained( + args.model_name_or_path, + config=AutoConfig.from_pretrained(args.model_name_or_path)) + + export_onnx_model(args, model) \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml new file mode 100644 index 00000000000..fa9a22ce874 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/glue_dynamic.yaml @@ -0,0 +1,36 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: 1.0 + +model: # mandatory. used to specify model specific information. + name: text_classification + framework: onnxrt_integerops # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. + +evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization. + performance: # optional. used to benchmark performance of passing model. + warmup: 10 + iteration: 100 + configs: + cores_per_instance: 28 + num_of_instance: 1 + +quantization: + approach: post_training_dynamic_quant # optional. default value is post_training_static_quant. + +tuning: + accuracy_criterion: + relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. + random_seed: 9527 # optional. random seed for deterministic tuning. \ No newline at end of file diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py new file mode 100644 index 00000000000..d5051af3816 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/main.py @@ -0,0 +1,422 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import logging +import argparse +import onnx +import onnxruntime as ort +import transformers +import os +import torch +import numpy as np +from dataclasses import dataclass +from typing import List, Optional, Union +import sys +from neural_compressor.data import DATALOADERS, DATASETS + + +class ONNXRTBertDataset: + """Dataset used for model Bert. + Args: data_dir (str): The input data dir. + model_name_or_path (str): Path to pre-trained student model or shortcut name, + selected in the list: + max_seq_length (int, default=128): The maximum length after tokenization. + Sequences longer than this will be truncated, + sequences shorter will be padded. + do_lower_case (bool, default=True): Whether to lowercase the input when tokenizing. + task (str, default=mrpc): The name of the task to fine-tune. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. + model_type (str, default='bert'): model type, support 'distilbert', 'bert', + 'mobilebert', 'roberta'. + dynamic_length (bool, default=False): Whether to use fixed sequence length. + evaluate (bool, default=True): Whether do evaluation or training. + transform (transform object, default=None): transform to process input data. + filter (Filter objects, default=None): filter out examples according + to specific conditions. 
+ """ + def __init__(self, data_dir, model_name_or_path, max_seq_length=128,\ + do_lower_case=True, task='mrpc', model_type='bert', dynamic_length=False,\ + evaluate=True, transform=None, filter=None): + task = task.lower() + model_type = model_type.lower() + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + assert model_type in ['distilbert', 'bert', 'mobilebert', 'roberta'], 'Unsupported \ + model type' + self.dynamic_length = dynamic_length + self.model_type = model_type + self.max_seq_length = max_seq_length + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, + do_lower_case=do_lower_case) + self.dataset = load_and_cache_examples(data_dir, model_name_or_path, \ + max_seq_length, task, model_type, tokenizer, evaluate) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + # return self.dataset[index] + batch = tuple(t.detach().cpu().numpy() if not isinstance(t, np.ndarray) else t for t in self.dataset[index]) + return batch[:3], batch[-1] + +def load_and_cache_examples(data_dir, model_name_or_path, max_seq_length, task, \ + model_type, tokenizer, evaluate): + from torch.utils.data import TensorDataset + + processor = transformers.glue_processors[task]() + output_mode = transformers.glue_output_modes[task] + # Load data features from cache or dataset file + if not os.path.exists("./dataset_cached"): + os.makedirs("./dataset_cached") + cached_features_file = os.path.join("./dataset_cached", 'cached_{}_{}_{}_{}'.format( + 'dev' if evaluate else 'train', + list(filter(None, model_name_or_path.split('/'))).pop(), + str(max_seq_length), + str(task))) + if os.path.exists(cached_features_file): + logger.info("Load features from cached file {}.".format(cached_features_file)) + features = torch.load(cached_features_file) + else: + logger.info("Create features from dataset file at {}.".format(data_dir)) + label_list = processor.get_labels() + examples = processor.get_dev_examples(data_dir) if evaluate else \ + processor.get_train_examples(data_dir) + features = convert_examples_to_features(examples, + tokenizer, + task=task, + label_list=label_list, + max_length=max_seq_length, + output_mode=output_mode, + ) + logger.info("Save features into cached file {}.".format(cached_features_file)) + torch.save(features, cached_features_file) + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, \ + all_seq_lengths, all_labels) + return dataset + +def convert_examples_to_features( + examples, + tokenizer, + max_length=128, + task=None, + label_list=None, + output_mode="classification", + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): + processor = transformers.glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Use label list {} for task {}.".format(label_list, task)) + 
label_map = {label: i for i, label in enumerate(label_list)} + features = [] + for (ex_index, example) in enumerate(examples): + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + return_token_type_ids=True, + truncation=True, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + seq_length = len(input_ids) + padding_length = max_length - len(input_ids) + + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + \ + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, \ + "Error with input_ids length {} vs {}".format( + len(input_ids), max_length) + assert len(attention_mask) == max_length, \ + "Error with attention_mask length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, \ + "Error with token_type_ids length {} vs {}".format( + len(token_type_ids), max_length + ) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + feats = InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + seq_length=seq_length, + ) + features.append(feats) + return features + +@dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, + ``0`` for MASKED (padded) tokens. + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. + seq_length: (Optional) The length of input sequence before padding. + """ + + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None + seq_length: Optional[List[int]] = None + +class ONNXRTGLUE: + """Computes GLUE score. + + Args: + task (str, default=mrpc): The name of the task. + Choices include mrpc, qqp, qnli, rte, + sts-b, cola, mnli, wnli. 
+ + """ + def __init__(self, task='mrpc'): + assert task in ['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], 'Unsupported task type' + self.pred_list = None + self.label_list = None + self.task = task + self.return_key = { + "cola": "mcc", + "mrpc": "f1", + "sts-b": "corr", + "qqp": "acc", + "mnli": "mnli/acc", + "qnli": "acc", + "rte": "acc", + "wnli": "acc", + "sst-2": "acc" + } + + def update(self, preds, labels): + """add preds and labels to storage""" + if isinstance(preds, list) and len(preds) == 1: + preds = preds[0] + if isinstance(labels, list) and len(labels) == 1: + labels = labels[0] + if self.pred_list is None: + self.pred_list = preds + self.label_list = labels + else: + self.pred_list = np.append(self.pred_list, preds, axis=0) + self.label_list = np.append(self.label_list, labels, axis=0) + + def reset(self): + """clear preds and labels storage""" + self.pred_list = None + self.label_list = None + + def result(self): + """calculate metric""" + output_mode = transformers.glue_output_modes[self.task] + + if output_mode == "classification": + processed_preds = np.argmax(self.pred_list, axis=1) + elif output_mode == "regression": + processed_preds = np.squeeze(self.pred_list) + result = transformers.glue_compute_metrics(\ + self.task, processed_preds, self.label_list) + return result[self.return_key[self.task]] + +logger = logging.getLogger(__name__) +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.WARN) + +if __name__ == "__main__": + logger.info('Evaluating ONNXRuntime full precision accuracy and performance:') + parser = argparse.ArgumentParser( + description='BERT fine-tune examples for classification/regression tasks.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--model_path', + type=str, + help="Pre-trained resnet50 model on onnx file" + ) + parser.add_argument( + '--benchmark', + action='store_true', \ + default=False + ) + parser.add_argument( + '--tune', + action='store_true', \ + default=False, + help="whether quantize the model" + ) + parser.add_argument( + '--config', + type=str, + help="config yaml path" + ) + parser.add_argument( + '--output_model', + type=str, + default=None, + help="output model path" + ) + parser.add_argument( + '--mode', + type=str, + help="benchmark mode of performance or accuracy" + ) + parser.add_argument( + '--data_path', + type=str, + help="input data path" + ) + parser.add_argument( + '--batch_size', + default=8, + type=int, + ) + parser.add_argument( + '--model_name_or_path', + type=str, + choices=['Intel/bert-base-uncased-mrpc', + 'Intel/roberta-base-mrpc', + 'Intel/xlm-roberta-base-mrpc', + 'Intel/camembert-base-mrpc', + 'distilbert-base-uncased-finetuned-sst-2-english', + 'Alireza1044/albert-base-v2-sst2', + 'philschmid/MiniLM-L6-H384-uncased-sst2', + 'Intel/MiniLM-L12-H384-uncased-mrpc'], + help="pretrained model name or path" + ) + parser.add_argument( + '--task', + type=str, + choices=['mrpc', 'qqp', 'qnli', 'rte', 'sts-b', 'cola', \ + 'mnli', 'wnli', 'sst-2'], + help="GLUE task name" + ) + parser.add_argument( + '--num_heads', + default=12, + type=int, + ) + parser.add_argument( + '--hidden_size', + default=768, + type=int, + ) + + args = parser.parse_args() + + dataset = ONNXRTBertDataset(data_dir=args.data_path, + model_name_or_path=args.model_name_or_path, + task=args.task) + dataloader = DATALOADERS['onnxrt_integerops'](dataset, batch_size=args.batch_size) + metric = 
ONNXRTGLUE(args.task) + + def eval_func(model, *args): + metric.reset() + import tqdm + session = ort.InferenceSession(model.SerializeToString(), None) + ort_inputs = {} + len_inputs = len(session.get_inputs()) + inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)] + for idx, (inputs, labels) in enumerate(dataloader): + if not isinstance(labels, list): + labels = [labels] + inputs = inputs[:len_inputs] + for i in range(len_inputs): + ort_inputs.update({inputs_names[i]: inputs[i]}) + predictions = session.run(None, ort_inputs) + metric.update(predictions[0], labels) + return metric.result() + + if args.benchmark: + from neural_compressor.experimental import Benchmark, common + model = onnx.load(args.model_path) + if args.mode == 'performance': + session = ort.InferenceSession(args.model_path, None) + input_tensors = session.get_inputs() + shape = [] + for i in range(len(input_tensors)): + shape.append((1, 128)) + datasets = DATASETS('onnxrt_integerops') + dummy_dataset = datasets['dummy'](shape=shape, low=1, high=1, dtype='int64', label=True) + evaluator = Benchmark(args.config) + evaluator.model = common.Model(model) + evaluator.b_dataloader = common.DataLoader(dummy_dataset) + evaluator(args.mode) + elif args.mode == 'accuracy': + evaluator = Benchmark(args.config) + evaluator.model = common.Model(model) + evaluator.b_dataloader = dataloader + evaluator.metric = metric + evaluator.b_func = eval_func + evaluator(args.mode) + + if args.tune: + from onnxruntime.transformers import optimizer + from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions + opt_options = BertOptimizationOptions('bert') + opt_options.enable_embed_layer_norm = False + + model_optimizer = optimizer.optimize_model( + args.model_path, + 'bert', + num_heads=args.num_heads, + hidden_size=args.hidden_size, + optimization_options=opt_options) + model = model_optimizer.model + + from neural_compressor import options + from neural_compressor.experimental import Quantization, common + options.onnxrt.graph_optimization.level = 'ENABLE_BASIC' + quantize = Quantization(args.config) + quantize.model = model + quantize.eval_func = eval_func + q_model = quantize() + q_model.save(args.output_model) diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_data.sh rename to examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/prepare_data.sh diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt new file mode 100644 index 00000000000..a5e81be3aad --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/requirements.txt @@ -0,0 +1,7 @@ +torch +transformers==4.16.0 +onnx +onnxruntime +coloredlogs +sympy +onnxruntime-extensions; python_version < '3.10' diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh new file mode 100644 index 00000000000..c72b109a530 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_benchmark.sh @@ -0,0 +1,81 @@ +#!/bin/bash +set -x + +function main { + 
+ init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --config=*) + config=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --data_path=*) + data_path=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_benchmark +function run_benchmark { + + if [[ "${input_model}" =~ "bert-base" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + fi + if [[ "${input_model}" =~ "MiniLM-L6" ]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --config ${config} \ + --data_path ${data_path} \ + --task ${TASK_NAME} \ + --mode=${mode} \ + --benchmark + +} + +main "$@" + diff --git a/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh new file mode 100644 index 00000000000..7d141154355 --- /dev/null +++ b/examples/onnxrt/nlp/huggingface_model/text_classification/quantization/ptq/run_tuning.sh @@ -0,0 +1,98 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --config=*) + config=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + --data_path=*) + data_path=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + + if [[ "${input_model}" =~ "bert-base" ]]; then + model_name_or_path="Intel/bert-base-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "roberta-base" ]]; then + model_name_or_path="Intel/roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "xlm-roberta-base" ]]; then + model_name_or_path="Intel/xlm-roberta-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "camembert-base" ]]; then + model_name_or_path="Intel/camembert-base-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "distilbert-base" ]]; then + model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "albert-base" ]]; then + model_name_or_path="Alireza1044/albert-base-v2-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=768 + fi + if [[ "${input_model}" =~ "MiniLM-L6" 
]]; then + model_name_or_path="philschmid/MiniLM-L6-H384-uncased-sst2" + TASK_NAME='sst-2' + num_heads=12 + hidden_size=384 + fi + if [[ "${input_model}" =~ "MiniLM-L12" ]]; then + model_name_or_path="Intel/MiniLM-L12-H384-uncased-mrpc" + TASK_NAME='mrpc' + num_heads=12 + hidden_size=384 + fi + + python main.py \ + --model_name_or_path ${model_name_or_path} \ + --model_path ${input_model} \ + --output_model ${output_model} \ + --config ${config} \ + --data_path ${data_path} \ + --task ${TASK_NAME} \ + --num_heads ${num_heads} \ + --hidden_size ${hidden_size} \ + --tune +} + +main "$@" + + + diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/export.py rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/export.py diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/main.py rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert.yaml rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert.yaml diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/mobilebert_qdq.yaml rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/mobilebert_qdq.yaml diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_data.sh rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_data.sh diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/prepare_model.sh rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/prepare_model.sh diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/readme.md rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh similarity index 100% rename from 
examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/mobilebert/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/mobilebert/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/README.md rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/README.md diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/bidaf.yaml diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/main.py rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/onnx_model_zoo/BiDAF/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert.yaml diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/bert_qdq.yaml diff --git 
a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/main.py rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/readme.md rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_onnx_squad.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/squad_evaluate.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py rename to examples/onnxrt/nlp/onnx_model_zoo/bert-squad/quantization/ptq/tokenization.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/export.py rename to 
examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/export.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/gpt2.yaml diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/readme.md rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/onnx_model_zoo/gpt2/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/main.py rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/main.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert.yaml diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/mobilebert_qdq.yaml diff --git 
a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/readme.md rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_onnx_squad.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/run_tuning.sh diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/squad_evaluate.py diff --git a/examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py b/examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py similarity index 100% rename from examples/onnxrt/language_translation/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py rename to examples/onnxrt/nlp/onnx_model_zoo/mobilebert/quantization/ptq/tokenization.py diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/export.py b/examples/onnxrt/nlp/roberta/quantization/ptq/export.py similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/export.py rename to examples/onnxrt/nlp/roberta/quantization/ptq/export.py diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/main.py b/examples/onnxrt/nlp/roberta/quantization/ptq/main.py similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/main.py rename to examples/onnxrt/nlp/roberta/quantization/ptq/main.py diff --git a/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh 
b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh new file mode 100644 index 00000000000..8e434a5c521 --- /dev/null +++ b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + download_data + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --data_dir=*) + data_dir=$(echo $var |cut -f2 -d=) + ;; + --task_name=*) + task_name=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function download_data { + wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py + python download_glue_data.py --data_dir=${data_dir} --tasks=${task_name} +} + +main "$@" + diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/prepare_model.sh rename to examples/onnxrt/nlp/roberta/quantization/ptq/prepare_model.sh diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md b/examples/onnxrt/nlp/roberta/quantization/ptq/readme.md similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/readme.md rename to examples/onnxrt/nlp/roberta/quantization/ptq/readme.md diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt b/examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/requirements.txt rename to examples/onnxrt/nlp/roberta/quantization/ptq/requirements.txt diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta.yaml rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta.yaml diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml b/examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/roberta_qdq.yaml rename to examples/onnxrt/nlp/roberta/quantization/ptq/roberta_qdq.yaml diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_benchmark.sh rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_benchmark.sh diff --git a/examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh b/examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh similarity index 100% rename from examples/onnxrt/language_translation/roberta/quantization/ptq/run_tuning.sh rename to examples/onnxrt/nlp/roberta/quantization/ptq/run_tuning.sh
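For reference, a minimal invocation of the new HuggingFace text-classification scripts introduced above might look like the following sketch. The angle-bracket paths are placeholders, not files taken from this change; only the flag names (--config, --input_model, --output_model, --data_path, --mode) and the behavior of deriving the HF model card and GLUE task from substrings of the input model file name (for example, a name containing "bert-base" selects Intel/bert-base-uncased-mrpc and the mrpc task) come from the run_tuning.sh and run_benchmark.sh scripts shown earlier in this diff. The --mode flag accepts performance or accuracy, matching the two branches in main.py.

    # quantization tuning: produces an INT8 model at the --output_model path (placeholder paths)
    bash run_tuning.sh --config=<tuning_config.yaml> \
                       --input_model=<path/to/bert-base-uncased-fp32.onnx> \
                       --output_model=<path/to/bert-base-uncased-int8.onnx> \
                       --data_path=<path/to/glue_data/MRPC>

    # benchmarking: --mode=performance uses dummy int64 inputs of shape (1, 128),
    # --mode=accuracy evaluates the GLUE metric through eval_func in main.py
    bash run_benchmark.sh --config=<tuning_config.yaml> \
                          --input_model=<path/to/bert-base-uncased-int8.onnx> \
                          --data_path=<path/to/glue_data/MRPC> \
                          --mode=performance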
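The prepare_data.sh added for the roberta example simply fetches GLUE data through the pinned download_glue_data.py helper from the transformers repository. A typical call, with the data directory as a placeholder and MRPC as an example task name accepted by that helper, would be:

    bash prepare_data.sh --data_dir=<path/to/glue_data> --task_name=MRPC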