From 7f2063f79f190e1fdfe3edcae0504ef9025f2c7c Mon Sep 17 00:00:00 2001
From: yuwenzho <yuwen.zhou@intel.com>
Date: Tue, 31 Oct 2023 21:46:44 +0800
Subject: [PATCH] update llama model

Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
---
 examples/.config/model_params_onnxrt.json     |  6 ++---
 .../llama/quantization/ptq_static/README.md   | 22 ++++++++++++++-----
 .../llama/quantization/ptq_static/main.py     |  2 +-
 .../quantization/ptq_static/run_quant.sh      |  2 +-
 .../llama/quantization/weight_only/README.md  | 16 ++++++++++++--
 .../llama/quantization/weight_only/main.py    |  2 +-
 .../quantization/weight_only/run_benchmark.sh |  2 +-
 .../quantization/weight_only/run_quant.sh     |  2 +-
 8 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
index d547f92e41f..a726e1a47d4 100644
--- a/examples/.config/model_params_onnxrt.json
+++ b/examples/.config/model_params_onnxrt.json
@@ -759,21 +759,21 @@
     "llama-7b-rtn": {
       "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
       "dataset_location": "",
-      "input_model": "/tf_dataset2/models/onnx/llama-7b",
+      "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
       "main_script": "main.py",
       "batch_size": 1
     },
     "llama-7b-awq": {
       "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
       "dataset_location": "",
-      "input_model": "/tf_dataset2/models/onnx/llama-7b",
+      "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
       "main_script": "main.py",
       "batch_size": 1
     },
     "llama-7b-gptq": {
       "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
       "dataset_location": "",
-      "input_model": "/tf_dataset2/models/onnx/llama-7b",
+      "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
       "main_script": "main.py",
       "batch_size": 1
     },
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
index bc78e89da11..14367330018 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
@@ -14,10 +14,20 @@ pip install -r requirements.txt
 
 ## 2. Prepare Model
 
+Note that this README uses meta-llama/Llama-2-7b-hf as an example; the same recipe applies to other Llama models. The following table lists the configurations of several candidate models:
+
+| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
+| --- | --- | --- | --- |
+| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
+| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
+
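+The meta-llama model repositories are gated on the Hugging Face Hub: request access on the model card page, then authenticate locally before exporting, for example:
+
+```bash
+huggingface-cli login
+```
+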
+Export the model to ONNX format:
 ```bash
-python prepare_model.py  --input_model="decapoda-research/llama-7b-hf" --output_model="./llama_7b"
-# or
-python prepare_model.py  --input_model="decapoda-research/llama-13b-hf" --output_model="./llama_13b"
+optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
 ```
 
 # Run
@@ -30,7 +40,7 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
                   --batch_size=batch_size # optional \
                   --dataset NeelNanda/pile-10k \
                   --alpha 0.6 \ # 0.6 for llama-7b, 0.8 for llama-13b
-                  --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                  --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                   --quant_format="QOperator" # or QDQ, optional
 ```
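+
+For instance, a complete invocation for the model exported in the previous step might look like this (the output path is illustrative):
+
+```bash
+bash run_quant.sh --input_model=./Llama-2-7b-hf \
+                  --output_model=./Llama-2-7b-hf-int8 \
+                  --dataset=NeelNanda/pile-10k \
+                  --alpha=0.6 \
+                  --tokenizer=meta-llama/Llama-2-7b-hf \
+                  --quant_format="QOperator"
+```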
 
@@ -42,7 +52,7 @@ Accuracy:
 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                       --batch_size=batch_size \ # optional 
                       --mode=accuracy \
-                      --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                      --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                       --tasks=lambada_openai
 ```
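+
+For instance, to evaluate the quantized model produced by the step above (the path is illustrative):
+
+```bash
+bash run_benchmark.sh --input_model=./Llama-2-7b-hf-int8 \
+                      --mode=accuracy \
+                      --tokenizer=meta-llama/Llama-2-7b-hf \
+                      --tasks=lambada_openai
+```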
 
@@ -51,6 +61,6 @@ Performance:
 numactl -m 0 -C 0-3 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                                           --mode=performance \
                                           --batch_size=batch_size # optional \
-                                          --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                                          --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                                           --intra_op_num_threads=4
 ```
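+
+Here `-m 0 -C 0-3` binds the run to NUMA node 0 and CPU cores 0-3; inspect your machine's topology first to choose values that match it:
+
+```bash
+numactl --hardware
+```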
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
index 1cf19b1873c..c1095c822bd 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -72,7 +72,7 @@
     '--tokenizer',
     type=str,
     help="pretrained model name or path of tokenizer files",
-    default="decapoda-research/llama-7b-hf"
+    default="meta-llama/Llama-2-7b-hf"
 )
 parser.add_argument(
     '--workspace',
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
index d556c20914d..9c7d2ff8c2a 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/run_quant.sh
@@ -62,7 +62,7 @@ function run_tuning {
     python main.py \
             --quant_format ${quant_format-QOperator} \
             --model_path ${input_model} \
-	    --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+	    --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
             --output_model ${output_model} \
             --batch_size ${batch_size-1} \
             --smooth_quant_alpha ${alpha-0.6} \
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
index f780509991a..2a278f2a9b4 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
@@ -14,8 +14,20 @@ pip install -r requirements.txt
 
 ## 2. Prepare Model
 
+Note that this README uses meta-llama/Llama-2-7b-hf as an example; the same recipe applies to other Llama models for weight-only quantization. The following table lists the configurations of several candidate models:
+
+| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
+| --- | --- | --- | --- |
+| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
+| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
+| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
+| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
+
+Export the model to ONNX format:
 ```bash
-optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation-with-past ./llama_7b
+optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
 ```
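+
+The same command exports any of the variants listed in the table above; for example, the 13B chat model (all meta-llama repositories are gated, so request access and run `huggingface-cli login` first):
+
+```bash
+optimum-cli export onnx --model meta-llama/Llama-2-13b-chat-hf --task text-generation-with-past ./Llama-2-13b-chat-hf
+```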
 
 # Run
@@ -36,6 +48,6 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
 ```bash
 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
                       --batch_size=batch_size \ # optional 
-                      --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                      --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                       --tasks=lambada_openai
 ```
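+
+For instance, to evaluate a weight-only quantized model (the path is illustrative):
+
+```bash
+bash run_benchmark.sh --input_model=./Llama-2-7b-hf-woq \
+                      --batch_size=1 \
+                      --tokenizer=meta-llama/Llama-2-7b-hf \
+                      --tasks=lambada_openai
+```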
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index a4bdba8bc38..233e19b7201 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -68,7 +68,7 @@
     "--tokenizer",
     type=str,
     help="pretrained model name or path of tokenizer files",
-    default="decapoda-research/llama-7b-hf"
+    default="meta-llama/Llama-2-7b-hf"
 )
 parser.add_argument(
     "--workspace",
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
index 7307e2e188e..b6f3c73c016 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
@@ -39,7 +39,7 @@ function run_benchmark {
     python main.py \
             --model_path ${input_model} \
             --batch_size=${batch_size-1} \
-            --tokenizer=${tokenizer-decapoda-research/llama-7b-hf} \
+            --tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
             --tasks=${tasks-lambada_openai} \
             --benchmark
             
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
index 706215dee96..b385c3628d4 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
@@ -58,7 +58,7 @@ function run_tuning {
 
     python main.py \
             --model_path ${input_model} \
-	    --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+	    --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
             --output_model ${output_model} \
             --batch_size ${batch_size-1} \
             --dataset ${dataset-NeelNanda/pile-10k} \