diff --git a/vllm/config.yaml b/vllm/config.yaml
index e46fbafb..c74c6d91 100644
--- a/vllm/config.yaml
+++ b/vllm/config.yaml
@@ -1,17 +1,17 @@
-model_name: "Llama 3.1 70B Instruct VLLM"
+model_name: "Llama 3.1 8B Instruct VLLM oom"
 python_version: py311
 model_metadata:
   example_model_input: {"prompt": "what is the meaning of life"}
-  repo_id: meta-llama/Meta-Llama-3.1-70B-Instruct
-  openai_compatible: true
+  repo_id: meta-llama/Meta-Llama-3.1-8B-Instruct
+  openai_compatible: false
 vllm_config:
-  tensor_parallel_size: 4
+  tensor_parallel_size: 1
   max_model_len: 4096
   enable_prefix_caching: true
 requirements:
 - vllm==0.5.4
 resources:
-  accelerator: A100:4
+  accelerator: A100
   use_gpu: true
 runtime:
   predict_concurrency: 128
diff --git a/vllm/model/model.py b/vllm/model/model.py
index ad30ddcc..0bf5227f 100644
--- a/vllm/model/model.py
+++ b/vllm/model/model.py
@@ -122,6 +122,8 @@ def load(self):
             logger.info(result.stdout)
         except subprocess.CalledProcessError as e:
             logger.error(f"Command failed with code {e.returncode}: {e.stderr}")
+        # testing
+        raise RuntimeError("Oops!")
 
     async def predict(self, model_input):
         if "messages" not in model_input and "prompt" not in model_input: