From 89a496eaf4d3f80ce316c0e399c96c880b25eec4 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Tue, 10 Dec 2024 20:40:38 +0200
Subject: [PATCH] Clean up ChatQnA vLLM Gaudi parameters

Signed-off-by: Eero Tamminen
---
Reviewer notes (this section is skipped by `git am`; it is not part of the
commit message):

* Line breaks and YAML indentation in this copy were reconstructed from a
  whitespace-collapsed source. Hunk line counts agree with the `@@` headers
  and the diffstat, but indentation is inferred from YAML nesting; verify
  against the target tree before applying.
* chatqna/gaudi-vllm-values.yaml: removes HABANA_LOGS, NUMBA_CACHE_DIR,
  HF_HOME and the upper-case tuning keys; tensor-parallel size, block size,
  max-num-seqs and max-seq-len-to-capture are now passed via `extraCmdArgs`.
  GPU_MEMORY_UTILIZATION and DTYPE get no replacement in this file.
* chatqna/values.yaml: removes the default `vllm.extraCmdArgs` list
  (including `--enforce-eager`, `--dtype` and `--gpu-memory-utilization`)
  and the two TODO comments.
* common/vllm/templates/configmap.yaml: emits PT_HPU_ENABLE_LAZY_COLLECTIVES
  when the value is set, mirroring the neighbouring entries.
* common/vllm/values.yaml: declares empty defaults for
  OMPI_MCA_btl_vader_single_copy_mechanism and PT_HPU_ENABLE_LAZY_COLLECTIVES.
* NOTE(review): `--max-seq_len-to-capture` mixes `_` and `-`; it is carried
  over unchanged from the removed defaults. Confirm the deployed vLLM version
  accepts this spelling.
* NOTE(review): the From / Signed-off-by lines carry no e-mail address in
  this copy; presumably stripped upstream of this file. Confirm before
  using `git am`.

 helm-charts/chatqna/gaudi-vllm-values.yaml | 18 +++++++-----------
 helm-charts/chatqna/values.yaml            | 14 --------------
 .../common/vllm/templates/configmap.yaml   |  3 +++
 helm-charts/common/vllm/values.yaml        |  5 +++++
 4 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/helm-charts/chatqna/gaudi-vllm-values.yaml b/helm-charts/chatqna/gaudi-vllm-values.yaml
index 3eb76980..3b187333 100644
--- a/helm-charts/chatqna/gaudi-vllm-values.yaml
+++ b/helm-charts/chatqna/gaudi-vllm-values.yaml
@@ -30,19 +30,15 @@ vllm:
     periodSeconds: 5
     timeoutSeconds: 1
 
-  # TODO: these are taken from GenAIExamples HPU manifest as-is
-  # vLLM chart needs to adopt / apply relevant ones
-  HABANA_LOGS: "/tmp/habana_logs"
-  NUMBA_CACHE_DIR: "/tmp"
   PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  HF_HOME: "/tmp/.cache/huggingface"
-  GPU_MEMORY_UTILIZATION: "0.5"
-  DTYPE: "auto"
-  TENSOR_PARALLEL_SIZE: "1"
-  BLOCK_SIZE: "128"
-  MAX_NUM_SEQS: "256"
-  MAX_SEQ_LEN_TO_CAPTURE: "2048"
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
 
 
 # Reranking: second largest bottleneck when reranking is in use
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
index dcb0504f..c939f9b9 100644
--- a/helm-charts/chatqna/values.yaml
+++ b/helm-charts/chatqna/values.yaml
@@ -71,21 +71,7 @@ tgi:
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
 vllm:
   enabled: false
-  # TODO: manifest in GenAIExamples uses "meta-llama/Meta-Llama-3-8B-Instruct" instead?
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  # TODO: these are non-redundant/non-broken options used by Agent component,
-  # but I think their values should be handled inside vLLM component, with
-  # deployment applying numbers set in configMap, based on values YAML file
-  # variables.
-  extraCmdArgs: [
-    "--enforce-eager",
-    "--tensor-parallel-size", "1",
-    "--dtype", "auto",
-    "--block-size", "128",
-    "--max-num-seqs", "256",
-    "--max-seq_len-to-capture", "2048",
-    "--gpu-memory-utilization", "0.5"
-  ]
 
 # disable guardrails-usvc by default
 # See guardrails-values.yaml for guardrail related options
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml
index 14a8ba24..5fbbf6b7 100644
--- a/helm-charts/common/vllm/templates/configmap.yaml
+++ b/helm-charts/common/vllm/templates/configmap.yaml
@@ -25,6 +25,9 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES }}
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: {{ .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES | quote }}
+  {{- end }}
   {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
   OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
   {{- end }}
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index 8db5f89a..6e9e6c9c 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -103,6 +103,11 @@ tolerations: []
 affinity: {}
 
 LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+# Environment variables for vLLM (set in configmap):
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#environment-variables
+OMPI_MCA_btl_vader_single_copy_mechanism: ""
+PT_HPU_ENABLE_LAZY_COLLECTIVES: ""
 VLLM_CPU_KVCACHE_SPACE: ""
 
 global: