Add LLaMa onnx example (#985)
Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 authored Aug 3, 2023
1 parent f0d51c2 commit 7fbcf54
Showing 23 changed files with 542 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt
@@ -419,6 +419,7 @@ DDP
DDR
de
deberta
decapoda
DecodeImage
deepengine
deeplab
@@ -1187,6 +1188,7 @@ nd
ndarray
NDArray
nderlu
NeelNanda
neox
nepoch
ner
@@ -9,11 +9,11 @@ matrix:
sources:
- ${REPO_DIR}/docs/source/*.md
- ${REPO_DIR}/*.md
- ${REPO_DIR}/examples/**/*.md|!${REPO_DIR}/examples/pytorch/**/huggingface_models/**/*.md
- ${REPO_DIR}/examples/**/*.md|!${REPO_DIR}/examples/pytorch/**/huggingface_models/**/*.md|!${REPO_DIR}/examples/README.md
- ${REPO_DIR}/neural_compressor/**/*.md
- ${REPO_DIR}/neural_coder/**/*.md
- ${REPO_DIR}/neural_coder/*.md
- ${REPO_DIR}/neural_solution/*.md
- ${REPO_DIR}/neural_solution/docs/source/*.md
- ${REPO_DIR}/neural_solution/examples/**/*.md
- ${REPO_DIR}/neural_insights/*.md
11 changes: 9 additions & 2 deletions examples/.config/model_params_onnxrt.json
@@ -736,19 +736,26 @@
"batch_size": 1
},
"gpt-j-6B": {
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/ptq_static",
"model_src_dir": "nlp/huggingface_model/text_generation/gptj/quantization/ptq_static",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/gpt-j-6b/model.onnx",
"main_script": "main.py",
"batch_size": 1
},
"gpt-j-6B_dynamic": {
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/ptq_dynamic",
"model_src_dir": "nlp/huggingface_model/text_generation/gptj/quantization/ptq_dynamic",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/gpt-j-6b/model.onnx",
"main_script": "main.py",
"batch_size": 1
},
"llama-7b": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/ptq_static",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/llama-7b",
"main_script": "main.py",
"batch_size": 1
},
"hf_roberta-large": {
"model_src_dir": "nlp/huggingface_model/question_answering/quantization/ptq_static",
"dataset_location": "/tf_dataset2/datasets/squad",
10 changes: 9 additions & 1 deletion examples/README.md
@@ -1431,7 +1431,15 @@ Intel® Neural Compressor validated examples with multiple compression techniques
<td>Text Generation</td>
<td>Post-Training Dynamic / Static Quantization</td>
<td>
<a href="./onnxrt/nlp/huggingface_model/text_generation/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_generation/quantization/ptq_static">qlinearops</a>
<a href="./onnxrt/nlp/huggingface_model/text_generation/gptj/quantization/ptq_dynamic">integerops</a> / <a href="./onnxrt/nlp/huggingface_model/text_generation/gptj/quantization/ptq_static">qlinearops</a>
</td>
</tr>
<tr>
<td>Llama-7B (HuggingFace)</td>
<td>Text Generation</td>
<td>Post-Training Static Quantization</td>
<td>
<a href="./onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static">qlinearops</a>
</td>
</tr>
</tbody>
@@ -0,0 +1,55 @@
Step-by-Step
============

This example quantizes LLaMA and validates the accuracy and speed of the quantized model on the [lambada](https://huggingface.co/datasets/lambada) dataset.

# Prerequisite

## 1. Environment
```shell
pip install neural-compressor
pip install -r requirements.txt
```
> Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
## 2. Prepare Model

```bash
optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation-with-past ./llama_7b
optimum-cli export onnx --model decapoda-research/llama-13b-hf --task text-generation-with-past ./llama_13b
```
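
Before quantizing, it can help to sanity-check the export. The snippet below is a minimal sketch, assuming `optimum[onnxruntime]` and `transformers` are installed; it reloads the exported folder with ONNX Runtime and runs a short generation (optimum-cli typically saves the tokenizer files alongside the model).

```python
# Optional sanity check for the exported model (a sketch, not part of this example).
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./llama_7b")        # tokenizer exported with the model
model = ORTModelForCausalLM.from_pretrained("./llama_7b")      # folder written by optimum-cli

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```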

# Run

## 1. Quantization

```bash
# --input_model:  folder path of the ONNX model
# --output_model: folder path to save the quantized ONNX model
# --batch_size:   optional
# --dataset:      calibration dataset
# --alpha:        0.6 for llama-7b, 0.8 for llama-13b
# --tokenizer:    model name or folder path containing all relevant files for the model's tokenizer
# --quant_format: "QOperator" or "QDQ", optional
bash run_quant.sh --input_model=/path/to/model \
                  --output_model=/path/to/model_tune \
                  --batch_size=batch_size \
                  --dataset=NeelNanda/pile-10k \
                  --alpha=0.6 \
                  --tokenizer=decapoda-research/llama-7b-hf \
                  --quant_format="QOperator"
```
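
For reference, `run_quant.sh` drives Intel Neural Compressor's post-training static quantization. The sketch below is illustrative rather than the example's actual `main.py`: it assumes the INC 2.x `PostTrainingQuantConfig` API, treats `--alpha` as the SmoothQuant alpha, substitutes a dummy calibration loader for the real NeelNanda/pile-10k pipeline, and guesses the exported file name `decoder_model.onnx`.

```python
# Illustrative sketch of the quantization flow (assumptions noted above).
import numpy as np
from neural_compressor import PostTrainingQuantConfig, quantization

class DummyCalibDataLoader:
    """Stand-in calibration loader; a real run feeds tokenized NeelNanda/pile-10k batches."""
    batch_size = 1
    def __iter__(self):
        ids = np.ones((1, 32), dtype=np.int64)
        yield {"input_ids": ids, "attention_mask": np.ones_like(ids)}, 0  # (inputs, label)

config = PostTrainingQuantConfig(
    approach="static",
    quant_format="QOperator",  # or "QDQ"
    recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.6}},
)
q_model = quantization.fit("/path/to/model/decoder_model.onnx",  # file name is an assumption
                           config,
                           calib_dataloader=DummyCalibDataLoader())
q_model.save("/path/to/model_tune")
```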

## 2. Benchmark

Accuracy:

```bash
# --input_model: folder path of the ONNX model
# --batch_size:  optional
# --tokenizer:   model name or folder path containing all relevant files for the model's tokenizer
bash run_benchmark.sh --input_model=/path/to/model \
                      --batch_size=batch_size \
                      --mode=accuracy \
                      --tokenizer=decapoda-research/llama-7b-hf \
                      --tasks=lambada_openai
```

Performance:

```bash
# --input_model: folder path of the ONNX model
# --batch_size:  optional
# --tokenizer:   model name or folder path containing all relevant files for the model's tokenizer
numactl -m 0 -C 0-3 bash run_benchmark.sh --input_model=/path/to/model \
                                          --mode=performance \
                                          --batch_size=batch_size \
                                          --tokenizer=decapoda-research/llama-7b-hf \
                                          --intra_op_num_threads=4
```
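
`numactl -m 0 -C 0-3` binds the benchmark to NUMA node 0 and cores 0-3, which pairs naturally with `--intra_op_num_threads=4`. The sketch below shows what that flag controls, assuming `run_benchmark.sh` forwards it to the ONNX Runtime session it creates (the file name `decoder_model.onnx` inside the model folder is also an assumption).

```python
# Sketch: how an --intra_op_num_threads value maps onto ONNX Runtime session options.
import onnxruntime as ort

so = ort.SessionOptions()
so.intra_op_num_threads = 4  # matches --intra_op_num_threads=4 and the 4 cores pinned by numactl
session = ort.InferenceSession("/path/to/model_tune/decoder_model.onnx", so,
                               providers=["CPUExecutionProvider"])
```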