diff --git a/inference/huggingface/zero_inference/README.md b/inference/huggingface/zero_inference/README.md index d76de89cd..ef78e3da1 100644 --- a/inference/huggingface/zero_inference/README.md +++ b/inference/huggingface/zero_inference/README.md @@ -172,4 +172,5 @@ In running example above, only two fully connected layers (fc1 and fc2) and the ## References - DeepSpeed [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html) +- Sheng, Ying et al. [FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU](https://arxiv.org/abs/2303.06865) - Shen, Sheng, et al. "Q-bert: Hessian based ultra low precision quantization of bert." Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 05. 2020. diff --git a/inference/huggingface/zero_inference/run_model.py b/inference/huggingface/zero_inference/run_model.py index bc03e7499..5aa28fd7f 100644 --- a/inference/huggingface/zero_inference/run_model.py +++ b/inference/huggingface/zero_inference/run_model.py @@ -26,7 +26,7 @@ from packaging import version -assert version.parse(deepspeed.__version__) >= version.parse("0.10.2"), "ZeRO-Inference with weight quantization and kv cache offloading is available only in DeepSpeed 0.10.3+, please upgrade DeepSpeed" +assert version.parse(deepspeed.__version__) >= version.parse("0.10.3"), "ZeRO-Inference with weight quantization and kv cache offloading is available only in DeepSpeed 0.10.3+, please upgrade DeepSpeed" def get_model_config(model_name): if "175b" in model_name: @@ -161,11 +161,19 @@ def run_generation( return_token_type_ids = True padding_side = "left" if config.model_type in ["opt"] else "right" - tokenizer = AutoTokenizer.from_pretrained( - model_name, - return_token_type_ids=return_token_type_ids, - padding_side=padding_side - ) + if config.model_type == "opt": + tokenizer = AutoTokenizer.from_pretrained( + model_name.replace("175b", "66b"), + return_token_type_ids=return_token_type_ids, + padding_side=padding_side + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + return_token_type_ids=return_token_type_ids, + padding_side=padding_side + ) + tokenizer.pad_token = tokenizer.eos_token