🦙 llama2 optimization #641

Merged (10 commits, Oct 28, 2023). Changes shown are from 3 of the 10 commits.
examples/llama2/README.md (49 additions, 0 deletions)
# Llama2 optimization using ORT toolchain
This folder contains a sample use case of Olive to optimize a [Llama2](https://huggingface.co/meta-llama/Llama-2-7b-hf) model using ONNXRuntime tools.

It performs the following optimization pipelines:
- CPU, FP32: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32*
- CPU, INT8: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32 -> Onnx Dynamic Quantization*
- CPU, INT4: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32 -> Onnx Block-wise int4 Quantization*
- GPU, FP32: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32*
- GPU, FP16: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16 + Grouped Query Attention*
- GPU, INT4: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16 + Grouped Query Attention -> Onnx Block-wise int4 Quantization*

Each pipeline corresponds to an entry in the `pass_flows` list of the corresponding config file. The workflow outputs the final optimized model and its latency results.

## Prerequisites
### Clone the repository and install Olive

Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive.

### Pip requirements
Install the necessary Python packages:
```bash
python -m pip install -r requirements.txt
```

## Run the config to optimize the model
First, install the extra packages required by the passes in the config; the `--setup` commands below handle this.

CPU:
```bash
# setup related packages
python -m olive.workflows.run --config ort_converter_merged_llama2_cpu.json --setup

# run to optimize the model: FP32/INT8/INT4
python -m olive.workflows.run --config ort_converter_merged_llama2_cpu.json
```

GPU:
```bash
# setup related packages
python -m olive.workflows.run --config ort_converter_merged_llama2_gpu.json --setup

# run to optimize the model: FP16/INT4
python -m olive.workflows.run --config ort_converter_merged_llama2_gpu.json
```
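The same workflows can also be launched from Python. A minimal sketch, assuming the `olive-ai` package is installed; `olive.workflows.run` is the module the CLI above invokes, and the `setup` keyword mirrors the `--setup` flag (names may differ across Olive versions):

```python
# Minimal sketch: run the CPU workflow programmatically instead of via the CLI.
from olive.workflows import run as olive_run

# setup=True mirrors the --setup flag: it installs the packages the passes need
olive_run("ort_converter_merged_llama2_cpu.json", setup=True)

# then run the actual optimization (FP32/INT8/INT4 flows for the CPU config)
olive_run("ort_converter_merged_llama2_cpu.json")
```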


## TODO
- [ ] Add generation example of the optimized model.
- [ ] Attach the benchmark results.
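Until the generation example lands, a quick sanity check is to load the optimized model and inspect its I/O. A minimal sketch, assuming `onnxruntime` is installed; the model path is hypothetical and depends on the engine's `output_dir` and run id:

```python
# Minimal sanity check: load the optimized model and list its inputs/outputs.
import onnxruntime as ort

# hypothetical path: the actual file lands under the engine's output_dir
sess = ort.InferenceSession(
    "models/llama2-7b/<run_id>/model.onnx",
    providers=["CPUExecutionProvider"],
)
for i in sess.get_inputs():
    print("input:", i.name, i.shape, i.type)
for o in sess.get_outputs():
    print("output:", o.name, o.shape, o.type)
```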
examples/llama2/ort_converter_merged_llama2_cpu.json (110 additions, 0 deletions)
{
"input_model":{
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_kv_inputs",
"hf_config": {
"model_name" : "meta-llama/Llama-2-7b-hf",
"model_class" : "LlamaForCausalLM"
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics":[
{
"name": "onnx_merged_latency",
"type": "latency",
"sub_types": [
{"name": "avg", "priority": 1}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"batch_size": 1
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 13,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"onnx_dynamic_quant_int8": {
"type": "OnnxDynamicQuantization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"op_types_to_quantize": ["MatMul", "Gemm"],
"per_channel": false,
"reduce_range": false,
"MatMulConstBOnly": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"pass_flows": [
["conversion_merged", "transformers_optimization_fp32"],
["conversion_merged", "transformers_optimization_fp32", "onnx_dynamic_quant_int8"],
["conversion_merged", "transformers_optimization_fp32", "blockwise_quant_int4"]
],
"engine": {
"search_strategy": {
"execution_order": "pass-by-pass",
"search_algorithm": "tpe",
"search_algorithm_config": {
"num_samples": 3,
"seed": 0
}
},
"evaluator": "merged_evaluator",
"cache_dir": "cache",
"output_dir" : "models/llama2-7b"
}
}
examples/llama2/ort_converter_merged_llama2_gpu.json (111 additions, 0 deletions)
{
"input_model":{
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_kv_inputs",
"hf_config": {
"model_name" : "meta-llama/Llama-2-7b-hf",
"model_class" : "LlamaForCausalLM"
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics":[
{
"name": "onnx_merged_latency",
"type": "latency",
"sub_types": [
{"name": "avg", "priority": 1}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"batch_size": 1,
"io_bind": true
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 13,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"onnx_dynamic_quant_int8": {
"type": "OnnxDynamicQuantization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"op_types_to_quantize": ["MatMul", "Gemm"],
"per_channel": false,
"reduce_range": false,
"MatMulConstBOnly": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"pass_flows": [
["conversion_merged", "transformers_optimization_fp16"],
["conversion_merged", "transformers_optimization_fp16", "blockwise_quant_int4"]
],
"engine": {
"search_strategy": {
"execution_order": "pass-by-pass",
"search_algorithm": "tpe",
"search_algorithm_config": {
"num_samples": 3,
"seed": 0
}
},
"evaluator": "merged_evaluator",
"execution_providers": ["CUDAExecutionProvider"],
"cache_dir": "cache",
"output_dir" : "models/llama2-7b"
}
}
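The GPU config sets `"io_bind": true` for the latency metric, which keeps inputs and outputs bound to the device during evaluation instead of copying through host memory on every run. For context, a minimal sketch of I/O binding with the onnxruntime Python API; the tensor names (`input_ids`, `logits`) and shapes are illustrative assumptions, not taken from this model:

```python
# Minimal I/O binding sketch with onnxruntime (illustrative tensor names).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
binding = sess.io_binding()

# bind a CPU-resident input; ORT copies it to the device once per run
input_ids = np.zeros((1, 8), dtype=np.int64)
binding.bind_cpu_input("input_ids", input_ids)

# let ORT allocate the output on the GPU so results stay on-device
binding.bind_output("logits", device_type="cuda")

sess.run_with_iobinding(binding)
logits = binding.copy_outputs_to_cpu()[0]
```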
examples/llama2/requirements.txt (6 additions, 0 deletions)
git+https://github.com/huggingface/optimum.git
transformers>=4.33.2
onnx>=1.14.0
datasets>=2.8.0
protobuf==3.20.2
# pip does not support a per-requirement index URL ("torch -i URL" is invalid);
# the PyTorch nightly cu118 index and pre-release opt-in go on their own lines
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cu118
torch