From 6e387a35eeccd25fec6f5af753d0a2f67323307d Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase
Date: Wed, 13 Sep 2023 15:33:39 -0400
Subject: [PATCH] FlexGen reference (#730)

* FlexGen reference

* Fix DS version and opt issue

* Fix script
---
 .../huggingface/zero_inference/README.md    |  1 +
 .../huggingface/zero_inference/run_model.py | 20 +++++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/inference/huggingface/zero_inference/README.md b/inference/huggingface/zero_inference/README.md
index d76de89cd..ef78e3da1 100644
--- a/inference/huggingface/zero_inference/README.md
+++ b/inference/huggingface/zero_inference/README.md
@@ -172,4 +172,5 @@ In running example above, only two fully connected layers (fc1 and fc2) and the
 
 ## References
 - DeepSpeed [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
+- Sheng, Ying et al. [FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU](https://arxiv.org/abs/2303.06865)
 - Shen, Sheng, et al. "Q-bert: Hessian based ultra low precision quantization of bert." Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 05. 2020.
diff --git a/inference/huggingface/zero_inference/run_model.py b/inference/huggingface/zero_inference/run_model.py
index bc03e7499..5aa28fd7f 100644
--- a/inference/huggingface/zero_inference/run_model.py
+++ b/inference/huggingface/zero_inference/run_model.py
@@ -26,7 +26,7 @@
 from packaging import version
 
-assert version.parse(deepspeed.__version__) >= version.parse("0.10.2"), "ZeRO-Inference with weight quantization and kv cache offloading is available only in DeepSpeed 0.10.3+, please upgrade DeepSpeed"
+assert version.parse(deepspeed.__version__) >= version.parse("0.10.3"), "ZeRO-Inference with weight quantization and kv cache offloading is available only in DeepSpeed 0.10.3+, please upgrade DeepSpeed"
 
 
 def get_model_config(model_name):
     if "175b" in model_name:
@@ -161,11 +161,19 @@ def run_generation(
     return_token_type_ids = True
     padding_side = "left" if config.model_type in ["opt"] else "right"
 
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        return_token_type_ids=return_token_type_ids,
-        padding_side=padding_side
-    )
+    if config.model_type == "opt":
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name.replace("175b", "66b"),
+            return_token_type_ids=return_token_type_ids,
+            padding_side=padding_side
+        )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            return_token_type_ids=return_token_type_ids,
+            padding_side=padding_side
+        )
+    tokenizer.pad_token = tokenizer.eos_token