🦙 llama2 optimization #641

Merged (10 commits, Oct 28, 2023). Changes shown are from 3 of the 10 commits.
examples/llama2/README.md (49 additions, 0 deletions)
# Llama2 optimization using ORT toolchain
This folder contains a sample use case of Olive to optimize a [Llama2](https://huggingface.co/meta-llama/Llama-2-7b-hf) model using ONNXRuntime tools.

It performs the following optimization pipelines:
- CPU, FP32: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32*
- CPU, INT8: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32 -> Onnx Dynamic Quantization*
- CPU, INT4: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32 -> Onnx Block-wise int4 Quantization*
- GPU, FP32: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp32*
- GPU, FP16: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16 + Grouped Query Attention*
- GPU, INT4: *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16 + Grouped Query Attention -> Onnx Block-wise int4 Quantization*

Each pipeline corresponds to an entry in the `pass_flows` list of the corresponding config file. The workflow outputs the final optimized model and its latency results.

## Prerequisites
### Clone the repository and install Olive

Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive.

### Pip requirements
Install the necessary Python packages:
```bash
python -m pip install -r requirements.txt
```

## Run the config to optimize the model
First, install the extra packages required by the passes in the config; the `--setup` commands below handle this.

CPU:
```bash
# setup related packages
python -m olive.workflows.run --config ort_converter_merged_llama2_cpu.json --setup

# run to optimize the model: FP32/INT8/INT4
python -m olive.workflows.run --config ort_converter_merged_llama2_cpu.json
```

GPU:
```bash
# setup related packages
python -m olive.workflows.run --config ort_converter_merged_llama2_gpu.json --setup

# run to optimize the model: FP16/INT4
python -m olive.workflows.run --config ort_converter_merged_llama2_gpu.json
```
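The same workflows can also be launched from Python. A minimal sketch, assuming the `olive-ai` package is installed; `olive.workflows.run` is the module the CLI above invokes, and the `setup` keyword mirrors the `--setup` flag (names may differ across Olive versions):

```python
# Minimal sketch: run the CPU workflow programmatically instead of via the CLI.
from olive.workflows import run as olive_run

# setup=True mirrors the --setup flag: it installs the packages the passes need
olive_run("ort_converter_merged_llama2_cpu.json", setup=True)

# then run the actual optimization (FP32/INT8/INT4 flows for the CPU config)
olive_run("ort_converter_merged_llama2_cpu.json")
```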


## TODO
- [ ] Add generation example of the optimized model.
- [ ] Attach the benchmark results.
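Until the generation example lands, a quick sanity check is to load the optimized model and inspect its I/O. A minimal sketch, assuming `onnxruntime` is installed; the model path is hypothetical and depends on the engine's `output_dir` and run id:

```python
# Minimal sanity check: load the optimized model and list its inputs/outputs.
import onnxruntime as ort

# hypothetical path: the actual file lands under the engine's output_dir
sess = ort.InferenceSession(
    "models/llama2-7b/<run_id>/model.onnx",
    providers=["CPUExecutionProvider"],
)
for i in sess.get_inputs():
    print("input:", i.name, i.shape, i.type)
for o in sess.get_outputs():
    print("output:", o.name, o.shape, o.type)
```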
examples/llama2/ort_converter_merged_llama2_cpu.json (110 additions, 0 deletions)
{
"input_model":{
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_kv_inputs",
"hf_config": {
"model_name" : "meta-llama/Llama-2-7b-hf",
"model_class" : "LlamaForCausalLM"
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics":[
{
"name": "onnx_merged_latency",
"type": "latency",
"sub_types": [
{"name": "avg", "priority": 1}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"batch_size": 1
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 13,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"onnx_dynamic_quant_int8": {
"type": "OnnxDynamicQuantization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"op_types_to_quantize": ["MatMul", "Gemm"],
"per_channel": false,
"reduce_range": false,
"MatMulConstBOnly": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"pass_flows": [
["conversion_merged", "transformers_optimization_fp32"],
["conversion_merged", "transformers_optimization_fp32", "onnx_dynamic_quant_int8"],
["conversion_merged", "transformers_optimization_fp32", "blockwise_quant_int4"]
],
"engine": {
"search_strategy": {
"execution_order": "pass-by-pass",
"search_algorithm": "tpe",
"search_algorithm_config": {
"num_samples": 3,
"seed": 0
}
},
"evaluator": "merged_evaluator",
"cache_dir": "cache",
"output_dir" : "models/llama2-7b"
}
}
examples/llama2/ort_converter_merged_llama2_gpu.json (111 additions, 0 deletions)
{
"input_model":{
"type": "PyTorchModel",
"config": {
"model_script": "user_script.py",
"io_config": "get_merged_decoder_with_past_io_config",
"dummy_inputs_func": "get_merged_decoder_with_past_kv_inputs",
"hf_config": {
"model_name" : "meta-llama/Llama-2-7b-hf",
"model_class" : "LlamaForCausalLM"
}
}
},
"evaluators": {
"merged_evaluator": {
"metrics":[
{
"name": "onnx_merged_latency",
"type": "latency",
"sub_types": [
{"name": "avg", "priority": 1}
],
"user_config": {
"user_script": "user_script.py",
"dataloader_func": "dataloader_func_for_merged",
"batch_size": 1,
"io_bind": true
}
}
]
}
},
"passes": {
"conversion_merged": {
"type": "OnnxConversion",
"config": {
"target_opset": 13,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"transformers_optimization_fp16": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": true,
"use_gqa": true
}
},
"transformers_optimization_fp32": {
"type": "OrtTransformersOptimization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"model_type": "gpt2",
"opt_level": 0,
"only_onnxruntime": false,
"keep_io_types": false,
"float16": false,
"use_gqa": false
}
},
"onnx_dynamic_quant_int8": {
"type": "OnnxDynamicQuantization",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"op_types_to_quantize": ["MatMul", "Gemm"],
"per_channel": false,
"reduce_range": false,
"MatMulConstBOnly": true
}
},
"blockwise_quant_int4": {
"type": "OnnxMatMul4Quantizer",
"disable_search": true,
"config": {
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"block_size": 32,
"is_symmetric": true
}
}
},
"pass_flows": [
["conversion_merged", "transformers_optimization_fp16"],
["conversion_merged", "transformers_optimization_fp16", "blockwise_quant_int4"]
],
"engine": {
"search_strategy": {
"execution_order": "pass-by-pass",
"search_algorithm": "tpe",
"search_algorithm_config": {
"num_samples": 3,
"seed": 0
}
},
"evaluator": "merged_evaluator",
"execution_providers": ["CUDAExecutionProvider"],
"cache_dir": "cache",
"output_dir" : "models/llama2-7b"
}
}
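The GPU config sets `"io_bind": true` for the latency metric, which keeps inputs and outputs bound to the device during evaluation instead of copying through host memory on every run. For context, a minimal sketch of I/O binding with the onnxruntime Python API; the tensor names (`input_ids`, `logits`) and shapes are illustrative assumptions, not taken from this model:

```python
# Minimal I/O binding sketch with onnxruntime (illustrative tensor names).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
binding = sess.io_binding()

# bind a CPU-resident input; ORT copies it to the device once per run
input_ids = np.zeros((1, 8), dtype=np.int64)
binding.bind_cpu_input("input_ids", input_ids)

# let ORT allocate the output on the GPU so results stay on-device
binding.bind_output("logits", device_type="cuda")

sess.run_with_iobinding(binding)
logits = binding.copy_outputs_to_cpu()[0]
```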
examples/llama2/requirements.txt (6 additions, 0 deletions)
git+https://github.com/huggingface/optimum.git
transformers>=4.33.2
onnx>=1.14.0
datasets>=2.8.0
protobuf==3.20.2
# pip does not support a per-requirement index URL ("torch -i URL" is invalid);
# the PyTorch nightly cu118 index and pre-release opt-in go on their own lines
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cu118
torch