diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
index d547f92e41f..a726e1a47d4 100644
--- a/examples/.config/model_params_onnxrt.json
+++ b/examples/.config/model_params_onnxrt.json
@@ -759,21 +759,21 @@
         "llama-7b-rtn": {
             "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
             "dataset_location": "",
-            "input_model": "/tf_dataset2/models/onnx/llama-7b",
+            "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
             "main_script": "main.py",
             "batch_size": 1
         },
         "llama-7b-awq": {
             "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
             "dataset_location": "",
-            "input_model": "/tf_dataset2/models/onnx/llama-7b",
+            "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
             "main_script": "main.py",
             "batch_size": 1
         },
         "llama-7b-gptq": {
             "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
             "dataset_location": "",
-            "input_model": "/tf_dataset2/models/onnx/llama-7b",
+            "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
             "main_script": "main.py",
             "batch_size": 1
         },
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
index bc78e89da11..14367330018 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
@@ -14,10 +14,20 @@ pip install -r requirements.txt
 
 ## 2. Prepare Model
 
+Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for quantization. The following table shows a few models' configurations:
+
+| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
+| --- | --- | --- | --- |
+| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
+| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
+
+Export to ONNX model:
 ```bash
-python prepare_model.py --input_model="decapoda-research/llama-7b-hf" --output_model="./llama_7b"
-# or
-python prepare_model.py --input_model="decapoda-research/llama-13b-hf" --output_model="./llama_13b"
+optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
 ```
 
 # Run
@@ -30,7 +40,7 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
                    --batch_size=batch_size # optional \
                    --dataset NeelNanda/pile-10k \
                    --alpha 0.6 \ # 0.6 for llama-7b, 0.8 for llama-13b
-                   --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                   --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                    --quant_format="QOperator" # or QDQ, optional
 ```
 
@@ -42,7 +52,7 @@ Accuracy:
 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                       --batch_size=batch_size \ # optional
                       --mode=accuracy \
-                      --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                      --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                       --tasks=lambada_openai
 ```
 
@@ -51,6 +61,6 @@ Performance:
 numactl -m 0 -C 0-3 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                                           --mode=performance \
                                           --batch_size=batch_size # optional \
-                                          --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                                          --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                                           --intra_op_num_threads=4
 ```
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
index 1cf19b1873c..c1095c822bd 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -72,7 +72,7 @@
         '--tokenizer',
         type=str,
         help="pretrained model name or path of tokenizer files",
-        default="decapoda-research/llama-7b-hf"
+        default="meta-llama/Llama-2-7b-hf"
     )
     parser.add_argument(
         '--workspace',
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
index d556c20914d..9c7d2ff8c2a 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
@@ -62,7 +62,7 @@ function run_tuning {
     python main.py \
             --quant_format ${quant_format-QOperator} \
             --model_path ${input_model} \
-            --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+            --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
             --output_model ${output_model} \
             --batch_size ${batch_size-1} \
             --smooth_quant_alpha ${alpha-0.6} \
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
index f780509991a..2a278f2a9b4 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
@@ -14,8 +14,20 @@ pip install -r requirements.txt
 
 ## 2. Prepare Model
 
+Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for weight-only quantization. The following table shows a few models' configurations:
+
+| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
+| --- | --- | --- | --- |
+| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
+| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
+
+Export to ONNX model:
 ```bash
-optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation-with-past ./llama_7b
+optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
 ```
 
 # Run
@@ -36,6 +48,6 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
 ```bash
 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                       --batch_size=batch_size \ # optional
-                      --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                      --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                       --tasks=lambada_openai
 ```
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index a4bdba8bc38..233e19b7201 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -68,7 +68,7 @@
         "--tokenizer",
         type=str,
         help="pretrained model name or path of tokenizer files",
-        default="decapoda-research/llama-7b-hf"
+        default="meta-llama/Llama-2-7b-hf"
     )
     parser.add_argument(
         "--workspace",
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
index 7307e2e188e..b6f3c73c016 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
@@ -39,7 +39,7 @@ function run_benchmark {
     python main.py \
             --model_path ${input_model} \
             --batch_size=${batch_size-1} \
-            --tokenizer=${tokenizer-decapoda-research/llama-7b-hf} \
+            --tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
             --tasks=${tasks-lambada_openai} \
             --benchmark
 
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
index 706215dee96..b385c3628d4 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
@@ -58,7 +58,7 @@ function run_tuning {
     python main.py \
             --model_path ${input_model} \
-            --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+            --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
             --output_model ${output_model} \
             --batch_size ${batch_size-1} \
             --dataset ${dataset-NeelNanda/pile-10k} \
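
One practical consequence of this model swap is worth noting: unlike the old decapoda-research checkpoints (which are no longer hosted on the Hugging Face Hub), the meta-llama/Llama-2-* repositories are gated, so the export step only succeeds for an authenticated account that has been granted access. The sketch below strings the ptq_static commands from this diff into one end-to-end run; the pip install line, the transformers version pin, and the ./Llama-2-7b-hf-int8 output path are illustrative assumptions, not part of the patch.

```bash
# Prerequisites (assumed versions; Llama-2 support landed in transformers 4.31
# to the best of our knowledge).
pip install "optimum[onnxruntime]" "transformers>=4.31"
# Request access to meta-llama/Llama-2-7b-hf on its model page, then authenticate.
huggingface-cli login

# Export the FP32 checkpoint to ONNX with past key/values, as in the updated README.
optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf

# Static PTQ with SmoothQuant; the README recommends alpha 0.6 for the 7B model.
bash run_quant.sh --input_model=./Llama-2-7b-hf \
                  --output_model=./Llama-2-7b-hf-int8 \
                  --dataset NeelNanda/pile-10k \
                  --alpha 0.6 \
                  --tokenizer=meta-llama/Llama-2-7b-hf \
                  --quant_format="QOperator"

# Sanity-check the quantized model's accuracy on lambada_openai.
bash run_benchmark.sh --input_model=./Llama-2-7b-hf-int8 \
                      --mode=accuracy \
                      --tokenizer=meta-llama/Llama-2-7b-hf \
                      --tasks=lambada_openai
```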