basetenlabs · tianshuc0731 · Aug 20, 2024
diff --git a/vllm/config.yaml b/vllm/config.yaml
@@ -1,17 +1,17 @@
-model_name: "Llama 3.1 70B Instruct VLLM"
+model_name: "Llama 3.1 8B Instruct VLLM oom"
 python_version: py311
 model_metadata:
   example_model_input: {"prompt": "what is the meaning of life"}
-  repo_id: meta-llama/Meta-Llama-3.1-70B-Instruct
-  openai_compatible: true
+  repo_id: meta-llama/Meta-Llama-3.1-8B-Instruct
+  openai_compatible: false
   vllm_config:
-    tensor_parallel_size: 4
+    tensor_parallel_size: 1
     max_model_len: 4096
     enable_prefix_caching: true
 requirements:
   - vllm==0.5.4
 resources:
-  accelerator: A100:4
+  accelerator: A100
   use_gpu: true
 runtime:
   predict_concurrency: 128

diff --git a/vllm/model/model.py b/vllm/model/model.py
@@ -122,6 +122,8 @@ def load(self):
                 logger.info(result.stdout)
             except subprocess.CalledProcessError as e:
                 logger.error(f"Command failed with code {e.returncode}: {e.stderr}")
+            # testing: deliberately raise to simulate a failure during load()
+            raise RuntimeError("Oops!")
 
     async def predict(self, model_input):
         if "messages" not in model_input and "prompt" not in model_input: