From 96a982ad8fc232479384476b1596a880697cc1d0 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Wed, 25 Oct 2023 10:18:58 +0200
Subject: [PATCH] fix: better warmup error

---
 server/text_generation_server/models/flash_causal_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 1fe40c0c520..f1a4854f9ea 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -670,7 +670,7 @@ def warmup(self, batch: FlashCausalLMBatch):
                 self.device,
             )
             _, batch = self.generate_token(batch)
-        except Exception as e:
+        except torch.cuda.OutOfMemoryError as e:
             raise RuntimeError(
                 f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                 f"You need to decrease `--max-batch-prefill-tokens`"
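
Note: with the narrowed `except`, only a genuine CUDA out-of-memory failure during
warmup is rewritten into the `--max-batch-prefill-tokens` hint; any other exception now
propagates with its original traceback instead of being misreported as an OOM. A minimal
sketch of the resulting pattern, assuming a `generate_token` method and `batch.input_ids`
as in the patched file (the standalone method body and the closing `) from e`, which fall
outside the hunk's context lines, are illustrative assumptions):

    import torch

    def warmup(self, batch):
        try:
            # Run one generation step against the warmup batch to exercise
            # the allocated KV cache at the configured prefill size.
            _, batch = self.generate_token(batch)
        except torch.cuda.OutOfMemoryError as e:
            # Only a real CUDA OOM is translated into the actionable hint;
            # other failures (e.g. shape or dtype bugs) surface unchanged.
            raise RuntimeError(
                f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e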