From 616ca05ca0a499c85e737e9da39bf771f649a297 Mon Sep 17 00:00:00 2001 From: Lianhao Lu Date: Tue, 10 Dec 2024 07:05:08 +0000 Subject: [PATCH] Adapt to latest vllm changes - Remove --enforce-eager on hpu to improve performance - Refactor to the upstream docker entrypoint changes Fixes issue #631. Signed-off-by: Lianhao Lu --- helm-charts/common/agent/values.yaml | 2 +- helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml | 2 +- helm-charts/common/vllm/gaudi-values.yaml | 4 +--- helm-charts/common/vllm/values.yaml | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/helm-charts/common/agent/values.yaml b/helm-charts/common/agent/values.yaml index 4e602d96..0ebfd4c3 100644 --- a/helm-charts/common/agent/values.yaml +++ b/helm-charts/common/agent/values.yaml @@ -14,7 +14,7 @@ tgi: vllm: enabled: false LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" - extraCmdArgs: ["/bin/bash", "-c", "python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model mistralai/Mistral-7B-Instruct-v0.3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral"] + extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"] replicaCount: 1 llm_endpoint_url: "" diff --git a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml index 2438eaed..0f1170f3 100644 --- a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml +++ b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml @@ -13,7 +13,7 @@ vllm: tag: "latest" LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 OMPI_MCA_btl_vader_single_copy_mechanism: none - extraCmdArgs: 
["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] + extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] resources: limits: habana.ai/gaudi: 1 diff --git a/helm-charts/common/vllm/gaudi-values.yaml b/helm-charts/common/vllm/gaudi-values.yaml index 65e62204..5f35fe07 100644 --- a/helm-charts/common/vllm/gaudi-values.yaml +++ b/helm-charts/common/vllm/gaudi-values.yaml @@ -11,9 +11,7 @@ image: # VLLM_CPU_KVCACHE_SPACE: "40" OMPI_MCA_btl_vader_single_copy_mechanism: none -extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] -# Workaround for current HPU image with start command /bin/bash -# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"] +extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] resources: limits: habana.ai/gaudi: 1 diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml index c8958e3e..b960a5ac 100644 --- a/helm-charts/common/vllm/values.yaml +++ b/helm-charts/common/vllm/values.yaml @@ -62,7 +62,7 @@ resources: {} # cpu: 100m # memory: 128Mi -extraCmdArgs: ["--enforce-eager", "--dtype", "auto"] +extraCmdArgs: [] livenessProbe: httpGet: