From ce0ebc6652bc1c85a873ec6ed58f528b7e9782f1 Mon Sep 17 00:00:00 2001
From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
Date: Tue, 20 Feb 2024 08:02:20 -0800
Subject: [PATCH 1/4] Support for large prompts

---
 scripts/inference.py | 42 ++++++++++++++++++++----------------------
 scripts/run.py       | 14 +++++++++++---
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/scripts/inference.py b/scripts/inference.py
index 4121b1b95..2c547d08a 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -12,21 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import sys
 from pathlib import Path
 import argparse
 from typing import List, Optional
 import subprocess
 from transformers import AutoTokenizer
+import neural_speed
 
 model_maps = {"gpt_neox": "gptneox", "llama2": "llama", "gpt_bigcode": "starcoder"}
 build_path = Path(Path(__file__).parent.absolute(), "../build/")
 
 
-def is_win():
-    return sys.platform.startswith('win')
-
-
 def main(args_in: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser(description="main program llm running")
     parser.add_argument("--model_name", type=str, help="Model name: String", required=True)
@@ -39,6 +35,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         help="Prompt to start generation with: String (default: empty)",
         default="",
     )
+    parser.add_argument(
+        "-f",
+        "--file",
+        type=str,
+        help="Path to a text file containing the prompt (for large prompts)",
+        default=None,
+    )
     parser.add_argument(
         "--tokenizer",
         type=str,
@@ -129,24 +132,18 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     args = parser.parse_args(args_in)
     print(args)
-    model_name = model_maps.get(args.model_name, args.model_name)
-    if is_win():
-        path = Path(args.build_dir, "./Bin/Release/run_{}.exe".format(model_name))
+    if args.file:
+        with open(args.file, 'r') as f:
+            prompt_text = f.read()
     else:
-        if args.one_click_run == "True":
-            import neural_speed
-            package_path = os.path.dirname(neural_speed.__file__)
-            path = Path(package_path, "./run_{}".format(model_name))
-        else:
-            path = Path(args.build_dir, "./bin/run_{}".format(model_name))
-
-    if not path.exists():
-        print("Please build graph first or select the correct model name.")
-        sys.exit(1)
+        prompt_text = args.prompt
 
+    model_name = model_maps.get(args.model_name, args.model_name)
+    package_path = os.path.dirname(neural_speed.__file__)
+    path = Path(package_path, "./run_{}".format(model_name))
     cmd = [path]
     cmd.extend(["--model", args.model])
-    cmd.extend(["--prompt", args.prompt])
+    cmd.extend(["--prompt", prompt_text])
     cmd.extend(["--n-predict", str(args.n_predict)])
     cmd.extend(["--threads", str(args.threads)])
     cmd.extend(["--batch-size-truncate", str(args.batch_size_truncate)])
@@ -167,7 +164,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     if (args.model_name == "chatglm"):
         tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
-        token_ids_list = tokenizer.encode(args.prompt)
+        token_ids_list = tokenizer.encode(prompt_text)
         token_ids_list = map(str, token_ids_list)
         token_ids_str = ', '.join(token_ids_list)
         cmd.extend(["--ids", token_ids_str])
@@ -207,14 +204,14 @@ def encode_history(history, max_length=4096):
             else:
                 ids.append(ASSISTANT_TOKEN_ID)
 
-        content_ids = tokenizer.encode(args.prompt)
+        content_ids = tokenizer.encode(prompt_text)
         ids.extend(content_ids)
         ids.append(ASSISTANT_TOKEN_ID)
 
         truncate(ids, max_length)
         return ids
 
-    history = [args.prompt]
+    history = [prompt_text]
     token_ids_list = encode_history(history)
     token_ids_list = map(str, token_ids_list)
     token_ids_str = ', '.join(token_ids_list)
@@ -226,3 +223,4 @@ def encode_history(history, max_length=4096):
 
 if __name__ == "__main__":
     main()
+
diff --git a/scripts/run.py b/scripts/run.py
index 3ef18c96a..6e4f12ce9 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import sys
 from pathlib import Path
 import argparse
 from typing import List, Optional
@@ -65,7 +63,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         "--scale_dtype",
         type=str,
         choices=["fp32", "bf16", "fp8"],
-        help="Data type of scales: fp32/bf16 (dafault fp32)",
+        help="Data type of scales: fp32/bf16 (default fp32)",
         default="fp32",
     )
     parser.add_argument(
@@ -88,6 +86,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         help="Prompt to start generation with: String (default: empty)",
         default="Once upon a time, there existed a ",
     )
+    parser.add_argument(
+        "-f",
+        "--file",
+        type=str,
+        help="Path to a text file containing the prompt (for large prompts)",
+        default=None,
+    )
     parser.add_argument(
         "-n",
         "--n_predict",
@@ -195,6 +200,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     infer_cmd.extend(["--model_name", model_type])
     infer_cmd.extend(["-m", Path(work_path, "ne_{}_{}.bin".format(model_type, args.weight_dtype, args.group_size))])
     infer_cmd.extend(["--prompt", args.prompt])
+    if args.file:  # only forward --file when set, so the subprocess never receives None
+        infer_cmd.extend(["--file", args.file])
     infer_cmd.extend(["--n_predict", str(args.n_predict)])
     infer_cmd.extend(["--threads", str(args.threads)])
     infer_cmd.extend(["--batch_size_truncate", str(args.batch_size_truncate)])
@@ -214,3 +221,4 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
 if __name__ == "__main__":
     main()
+
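A minimal usage sketch for the new `-f`/`--file` flag (illustrative, not part of the patch): the model file and prompt contents below are hypothetical, and it assumes the `neural_speed` package is installed and the script is run from the repository root with `scripts/` placed on `sys.path`.

```python
import sys
from pathlib import Path

sys.path.insert(0, "scripts")  # assumption: invoked from the repository root
from inference import main as run_inference

# Write a prompt that would be unwieldy on the command line to a file.
prompt_path = Path("long_prompt.txt")
prompt_path.write_text("Summarize the following log:\n" + "step ok\n" * 5000,
                       encoding="utf-8")

# Equivalent to:
#   python scripts/inference.py --model_name llama2 -m ne_llama_q4.bin -f long_prompt.txt
run_inference([
    "--model_name", "llama2",
    "-m", "ne_llama_q4.bin",  # hypothetical quantized model file
    "-f", str(prompt_path),
])
```
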
From 2a18c9c356a0434f873a7ea276f47f672bc24590 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 20 Feb 2024 17:08:10 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/inference.py | 1 -
 scripts/run.py       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/scripts/inference.py b/scripts/inference.py
index 2c547d08a..64a62cb0b 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -223,4 +223,3 @@ def encode_history(history, max_length=4096):
 
 if __name__ == "__main__":
     main()
-
diff --git a/scripts/run.py b/scripts/run.py
index 6e4f12ce9..fbdc3a23a 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -221,4 +221,3 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
 if __name__ == "__main__":
     main()
-

From f0d1c16a42a1095c94ad1985229746b64cc6fda8 Mon Sep 17 00:00:00 2001
From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
Date: Thu, 22 Feb 2024 02:34:38 -0800
Subject: [PATCH 3/4] Fix text decoding issue on Windows

---
 scripts/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/inference.py b/scripts/inference.py
index 64a62cb0b..68f523cf3 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -133,7 +133,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     args = parser.parse_args(args_in)
     print(args)
     if args.file:
-        with open(args.file, 'r') as f:
+        with open(args.file, 'r', encoding='utf-8') as f:
             prompt_text = f.read()
     else:
         prompt_text = args.prompt
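Background for the decoding fix above: on Windows, `open()` without an explicit `encoding` falls back to the locale code page (often cp1252), so a UTF-8 prompt file containing non-ASCII text may raise `UnicodeDecodeError` or decode to mojibake. A small reproduction sketch (the file name is hypothetical):

```python
from pathlib import Path

p = Path("prompt_utf8.txt")
p.write_text("Résumé: 天気予報を要約してください。", encoding="utf-8")

# Before the fix: implicit locale encoding, platform-dependent result.
# text = p.read_text()  # may raise UnicodeDecodeError on Windows

# After the fix: decoding is deterministic on every platform.
text = p.read_text(encoding="utf-8")
print(text)
```
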
From 98b362c2ffef65c0fbf735ab135a756870c27550 Mon Sep 17 00:00:00 2001
From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
Date: Fri, 1 Mar 2024 01:48:14 -0800
Subject: [PATCH 4/4] Add missing documentation

---
 docs/advanced_usage.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index b6dd139e3..c82c6b194 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -13,6 +13,7 @@ Argument description of run.py ([supported MatMul combinations](#supported-matri
 | --compute_dtype | Data type of Gemm computation: int8/bf16/fp16/fp32 (default: fp32) |
 | --use_ggml | Enable ggml for quantization and inference |
 | -p / --prompt | Prompt to start generation with: String (default: empty) |
+| -f / --file | Path to a text file containing the prompt (for large prompts) |
 | -n / --n_predict | Number of tokens to predict: Int (default: -1, -1 = infinity) |
 | -t / --threads | Number of threads to use during computation: Int (default: 56) |
 | -b / --batch_size_truncate | Batch size for prompt processing: Int (default: 512) |
@@ -22,6 +23,7 @@ Argument description of run.py ([supported MatMul combinations](#supported-matri
 | --color | Colorise output to distinguish prompt and user input from generations |
 | --keep | Number of tokens to keep from the initial prompt: Int (default: 0, -1 = all) |
 | --shift-roped-k | Use [ring-buffer](./docs/infinite_inference.md#shift-rope-k-and-ring-buffer) and thus do not re-computing after reaching ctx_size (default: False) |
+| --token | Access token ID for models that require it (e.g., LLaMa2) |
 
 ### 1. Conversion and Quantization
 
@@ -108,6 +110,7 @@ Argument description of inference.py:
 | -m / --model | Path to the executed model: String |
 | --build_dir | Path to the build file: String |
 | -p / --prompt | Prompt to start generation with: String (default: empty) |
+| -f / --file | Path to a text file containing the prompt (for large prompts) |
 | -n / --n_predict | Number of tokens to predict: Int (default: -1, -1 = infinity) |
 | -t / --threads | Number of threads to use during computation: Int (default: 56) |
 | -b / --batch_size | Batch size for prompt processing: Int (default: 512) |
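The prompt-resolution behavior documented above, condensed into a standalone sketch (the helper name is illustrative, not part of the scripts): `-f`/`--file` takes precedence over `-p`/`--prompt`, and the file is read as UTF-8 per patch 3.

```python
from typing import Optional


def resolve_prompt(file_path: Optional[str], prompt: str) -> str:
    """Mirrors the logic added to inference.py: prefer the file when given."""
    if file_path:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return prompt


# Falls back to the inline prompt string when no file is supplied.
print(resolve_prompt(None, "Once upon a time, there existed a "))
```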