This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Update python API and reorg scripts (#16)
zhenwei-intel authored Dec 27, 2023
1 parent f57d4e1 commit 40663ec
Showing 31 changed files with 92 additions and 54 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -13,7 +13,7 @@ function main() {
model="$2"
compiler_version="$3"
working_dir="${WORKING_DIR}"
scripts_dir="${working_dir}/neural_speed/scripts"
scripts_dir="${working_dir}/neural_speed/convert"
# init params
if [[ "${model}" == "llama-2-7b-chat" ]]; then
convert_script="${scripts_dir}/convert_llama.py"
24 changes: 12 additions & 12 deletions README.md
@@ -262,7 +262,7 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
### 1. Run LLM with Python Script
You can run an LLM with a one-click Python script that covers conversion, quantization and inference.
```
python neural_speed/scripts/run.py model-path --weight_dtype int4 -p "She opened the door and see"
python scripts/run.py model-path --weight_dtype int4 -p "She opened the door and see"
```

Argument description of run.py ([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)):
@@ -296,24 +296,24 @@ Neural Speed assumes the compatible model format as [llama.cpp](https://github.c
```bash

# convert the model directly use model id in Hugging Face. (recommended)
python neural_speed/scripts/convert.py --outtype f32 --outfile ne-f32.bin EleutherAI/gpt-j-6b
python scripts/convert.py --outtype f32 --outfile ne-f32.bin EleutherAI/gpt-j-6b

# or you can first download the fp32 model (e.g., LLAMA2) from Hugging Face, then convert the PyTorch model to ggml format.
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
python neural_speed/scripts/convert.py --outtype f32 --outfile ne-f32.bin model_path
python scripts/convert.py --outtype f32 --outfile ne-f32.bin model_path

# To convert a model with a PEFT (Parameter-Efficient Fine-Tuning) adapter, you need to merge the PEFT adapter into the model first. Use the command below to merge the adapter and save the merged model; afterwards you can use 'neural_speed/scripts/convert.py' as described above.
python neural_speed/scripts/load_peft_and_merge.py --model_name_or_path meta-llama/Llama-2-7b-hf --peft_name_or_path dfurman/llama-2-7b-instruct-peft --save_path ./Llama-2-7b-hf-instruct-peft
# To convert a model with a PEFT (Parameter-Efficient Fine-Tuning) adapter, you need to merge the PEFT adapter into the model first. Use the command below to merge the adapter and save the merged model; afterwards you can use 'scripts/convert.py' as described above.
python scripts/load_peft_and_merge.py --model_name_or_path meta-llama/Llama-2-7b-hf --peft_name_or_path dfurman/llama-2-7b-instruct-peft --save_path ./Llama-2-7b-hf-instruct-peft

# quantize weights of fp32 ggml bin
# model_name: llama, llama2, mpt, falcon, gptj, starcoder, dolly
# optimized INT4 model with group size 128 (recommended)
python neural_speed/scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --group_size 128 --compute_dtype int8
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --group_size 128 --compute_dtype int8

# Alternatively, you could run the ggml q4_0 format as follows
python neural_speed/scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4 --use_ggml
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4 --use_ggml
# optimized INT4 model with group size 32
python neural_speed/scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --group_size 32 --compute_dtype int8
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --group_size 32 --compute_dtype int8

```
Argument description of quantize.py ([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)):
@@ -355,17 +355,17 @@ We provide an LLM inference script to run the quantized model. Please reach [us](ma
# please use a prompt about code when running `StarCoder`, for example, -p "def fibonnaci(".

#Linux and WSL
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python neural_speed/scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see"
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see"

# if you want to generate fixed outputs, please set --seed arg, for example:
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python neural_speed/scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see" --seed 12
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see" --seed 12

# if you want to reduce repeated generated texts, please set --repeat_penalty (value > 1.0, default = 1.0), for example:
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python neural_speed/scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see" --repeat_penalty 1.2
OMP_NUM_THREADS=<physic_cores> numactl -m 0 -C 0-<physic_cores-1> python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores> --color -p "She opened the door and see" --repeat_penalty 1.2

#Windows
# We recommend building and running our project in WSL for better and more stable performance
python neural_speed/scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores|P-cores> --color -p "She opened the door and see"
python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t <physic_cores|P-cores> --color -p "She opened the door and see"
```

Argument description of inference.py:
8 changes: 4 additions & 4 deletions developer_document.md
@@ -84,7 +84,7 @@ The term **"hyperparameters"** describes a value that is used to configure the be
- n_vocab: the size of the model's vocabulary
- n_embd: the size of the model's "embedding layer", which is used during prompt ingestion.
- n_layer: the number of layers in the model; each layer represents a set of weights.
Here we will use [convert_gptneox.py](neural_speed/scripts/convert_gptneox.py#L96) as an example,
Here we will use [convert_gptneox.py](scripts/convert_gptneox.py#L96) as an example,
```python
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams.get("n_head_kv", 0))) # multi-query attention
@@ -96,7 +96,7 @@ The above `fout` is the file we need to get, and the `num_attention`, `n_head_kv
As the name implies, a model's vocabulary comprises components that are used by the model to generate language (text). However, unlike the vocabulary of a human, which consists of words, the vocabulary of a large language model consists of "tokens". A token can be an entire word, but oftentimes they are word fragments. Just like humans can compose millions of words from just a dozen or two letters, large language models use tokens to express a large number of words from a relatively smaller number of components. Consider a vocabulary with the following tokens: `whi`, `ch`, `le`, `who`, and `a`; this vocabulary can be used to create the English words `"which"`, `"while"`, `"who"`, `"a"`, and `"leach"`. How would the behavior change if the model contained the following tokens: `wh`, `ich`, `ile`, `o`, and `leach`? Choices such as these allow model-creators to tune the behavior and performance of their models.

As described above, the model's hyperparameters typically contain a value that specifies the number of tokens in the vocabulary. The vocabulary is encoded as a list of tokens, each of which includes a 32-bit integer that specifies the length of the token. If your model introduces a new tokenizer, we suggest using a Python tokenizer from transformers and feeding the input_ids to the model's Python API (see the Python example in the scripts folder).
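As a minimal illustrative sketch (not the exact convert-script code), this encoding amounts to writing a 32-bit length followed by the raw bytes of each token:
```python
import struct

# hypothetical toy vocabulary, written in the layout described above:
# for each token, a 32-bit length followed by the raw token bytes
tokens = [b"whi", b"ch", b"le", b"who", b"a"]
with open("vocab_demo.bin", "wb") as fout:
    for tok in tokens:
        fout.write(struct.pack("i", len(tok)))
        fout.write(tok)
```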
Here we will use [convert_gptneox.py](neural_speed/scripts/convert_gptneox.py#L122) as an example to process the vocabulary of gptneox and write it into `fout`.
Here we will use [convert_gptneox.py](scripts/convert_gptneox.py#L122) as an example to process the vocabulary of gptneox and write it into `fout`.
```python
encoder = tokenizer.vocab
encoder.update(tokenizer.get_added_vocab())
@@ -108,7 +108,7 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
The final, and largest, component of an ITREX GRAPH file is the weights of the LLM that the file represents. Abstractly, a large language model is software that is used to generate language. Just as image-generation software can be improved by increasing the number of colors it can render, a large language model can be improved by increasing the number of weights in the model. The total number of weights in a model is referred to as the "size" of that model. For example, the dolly-v2-3b implementation of the gpt-neox-20b language model architecture is available in several sizes, like 3B and 20B, which stand for 3 billion and 20 billion, respectively. These numbers refer to the total number of weights in that model.

As described in the hyperparameters section, weights are grouped in sets called "layers", which, like hyperparameters, have structures that are uniquely defined by the model architecture; within a layer, weights are grouped in structures called "tensors". So, for instance, both dolly-v2-3B and gpt-neox-20B use layers that comprise the same tensors, but dolly-v2-3B has relatively fewer layers when compared to gpt-neox-20B.
Here we will use [convert_gptneox.py](neural_speed/scripts/convert_gptneox.py#L149) as an example to convert model weights to `fout`.
Here we will use [convert_gptneox.py](scripts/convert_gptneox.py#L149) as an example to convert model weights to `fout`.
```python
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
for i in range(n_dims):
@@ -412,7 +412,7 @@ Quantizing the model and using the jblas library for inference can lead to better perfor
```bash

# convert the model directly use model path
python neural_speed/scripts/convert_new_model.py --outtype f32 --outfile ne-f32.bin new_model_path
python scripts/convert_new_model.py --outtype f32 --outfile ne-f32.bin new_model_path
# optimized INT4 model with group size 128 (recommended)
./build/bin/quant_new_model --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --group_size 128 --compute_dtype int8
```
2 changes: 1 addition & 1 deletion docs/tensor_parallelism.md
@@ -91,7 +91,7 @@ make -j
First you should download and convert the model to f32 format. You can also quantize the model to q4_0 format, but it is optional.

```shell
python neural_speed/scripts/convert.py --outtype f32 --outfile EleutherAI/gpt-j-6b
python scripts/convert.py --outtype f32 --outfile EleutherAI/gpt-j-6b
```
Then quantize the model to q4_0 format (optional); a minimal sketch of this step is shown below.
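Assuming the same quantize.py interface shown in the README (the model name and flags may need adjusting for your model):

```shell
python scripts/quantize.py --model_name gptj --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4 --use_ggml
```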

23 changes: 14 additions & 9 deletions __init__.py → neural_speed/__init__.py
@@ -17,7 +17,7 @@
import os

import torch
from neural_speed.scripts.convert import convert_model
from neural_speed.convert import convert_model
from transformers import AutoConfig, AutoTokenizer

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
@@ -73,7 +73,9 @@ def get_model_type(model_config):
model_type = "chatglm2"
return model_type

def init(self, model_name, not_quant=False, use_cache=False, **quant_kwargs):
def init(self, model_name, not_quant=False, use_cache=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False):
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model_type = Model.get_model_type(self.config)
@@ -83,15 +85,15 @@ def init(self, model_name, not_quant=False, use_cache=False, **quant_kwargs):
output_path = "runtime_outs"
os.makedirs(output_path, exist_ok=True)
fp32_bin = "{}/ne_{}_f32.bin".format(output_path, model_type)
quant_desc = quant_kwargs['weight_dtype']
if quant_kwargs['use_ggml']:
quant_desc = weight_dtype
if use_ggml:
quant_desc += "_ggml"
else:
quant_desc += "_jblas_c" + quant_kwargs['compute_dtype']
if quant_kwargs['group_size'] == -1:
quant_desc += "_jblas_c" + compute_dtype
if group_size == -1:
quant_desc += "_pc"
else:
quant_desc += "_g{}".format(quant_kwargs['group_size'])
quant_desc += "_g{}".format(group_size)
quant_bin = "{}/ne_{}_q_{}.bin".format(output_path, model_type, quant_desc)

if not_quant:
@@ -108,7 +110,9 @@ def init(self, model_name, not_quant=False, use_cache=False, **quant_kwargs):
if not_quant:
print("FP32 model will be used.")
return
self.module.Model.quant_model(model_path=fp32_bin, out_path=quant_bin, **quant_kwargs)
self.module.Model.quant_model(model_path=fp32_bin, out_path=quant_bin,
weight_dtype=weight_dtype, alg=alg, group_size=group_size,
scale_dtype=scale_dtype, compute_dtype=compute_dtype, use_ggml=use_ggml)
assert os.path.exists(quant_bin), "Fail to quantize model"

# clean
@@ -130,7 +134,8 @@ def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
self.__import_package(model_type)
self.module.Model.quant_model(model_path=model_path, out_path=out_path, **quant_kwargs)

def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False, stopping_criteria=None, **generate_kwargs):
def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
stopping_criteria=None, **generate_kwargs):
max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
if self.model is None:
self.init_from_bin(self.model_type, self.bin_file, batch_size=input_ids.shape[0],
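A minimal sketch of calling the reworked `init` signature after this change; the model id and generation settings are illustrative, and the keyword values shown are simply the new defaults:

```python
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "meta-llama/Llama-2-7b-chat-hf"  # illustrative model id
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("She opened the door and see", return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
# explicit quantization arguments replace the former **quant_kwargs dict
model.init(model_name, weight_dtype="int4", alg="sym", group_size=32,
           scale_dtype="fp32", compute_dtype="int8", use_ggml=False)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```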
41 changes: 41 additions & 0 deletions neural_speed/convert/__init__.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from transformers import AutoConfig
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}


def convert_model(model, outfile, outtype):
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    model_type = model_maps.get(config.model_type, config.model_type)

    gpt_model = 'gptq' in str(model).lower()
    if gpt_model:
        path = Path(Path(__file__).parent.absolute(), "convert_gptq_{}.py".format(model_type))
    else:
        path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))
    cmd = []
    cmd.extend(["python", path])
    cmd.extend(["--outfile", outfile])
    cmd.extend(["--outtype", outtype])
    cmd.extend([model])

    print("cmd:", cmd)
    subprocess.run(cmd)
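A minimal sketch of calling the relocated helper directly (model id and output file are illustrative):

```python
from neural_speed.convert import convert_model

# dispatches to the matching convert_<model_type>.py script via subprocess
convert_model("EleutherAI/gpt-j-6b", "ne-f32.bin", "f32")
```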
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
30 changes: 3 additions & 27 deletions neural_speed/scripts/convert.py → scripts/convert.py
@@ -11,35 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from pathlib import Path

import argparse
from pathlib import Path
from typing import List, Optional
from transformers import AutoConfig
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}


def convert_model(model, outfile, outtype):
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

gpt_model = 'gptq' in str(model).lower()
if gpt_model:
path = Path(Path(__file__).parent.absolute(), "convert_gptq_{}.py".format(model_type))
else:
path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type))
cmd = []
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend([model])

print("cmd:", cmd)
subprocess.run(cmd)

from neural_speed.convert import convert_model

def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a PyTorch model to a NE compatible file")
File renamed without changes.
16 changes: 16 additions & 0 deletions neural_speed/scripts/__init__.py → scripts/python_api_example.py
@@ -14,3 +14,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

if len(sys.argv) != 2:
    print("Usage: python python_api_example.py model_path")
    sys.exit(1)
model_name = sys.argv[1]

prompt = "Once upon a time, a little girl"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
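Assuming the script's new location under scripts/, it can be invoked with a single model path or Hugging Face model id, for example:

```bash
python scripts/python_api_example.py EleutherAI/gpt-j-6b
```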
File renamed without changes.
File renamed without changes.
File renamed without changes.
