Skip to content

Commit

Permalink
Adapt ChatQnA vLLM Gaudi parameters to latest changes
Browse files (browse the repository at this point in the history)
Signed-off-by: Eero Tamminen <[email protected]>
  • Loading branch information
eero-t committed Dec 11, 2024
1 parent f38b1b0 commit 908a4ee
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 25 deletions.
21 changes: 10 additions & 11 deletions helm-charts/chatqna/gaudi-vllm-values.yaml
Original file line number Diff line number Diff line change
Expand Up
@@ -30,19 +30,18 @@ vllm:
periodSeconds: 5
timeoutSeconds: 1

# TODO: these are taken from GenAIExamples HPU manifest as-is
# vLLM chart needs to adopt / apply relevant ones
HABANA_LOGS: "/tmp/habana_logs"
NUMBA_CACHE_DIR: "/tmp"
# TODO: GenAIExamples HPU manifest mentions additional env vars:
# https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml#L194
# should they be specified here and/or in vLLM chart values?
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
HF_HOME: "/tmp/.cache/huggingface"
GPU_MEMORY_UTILIZATION: "0.5"
DTYPE: "auto"
TENSOR_PARALLEL_SIZE: "1"
BLOCK_SIZE: "128"
MAX_NUM_SEQS: "256"
MAX_SEQ_LEN_TO_CAPTURE: "2048"

extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]


# Reranking: second largest bottleneck when reranking is in use
Expand Down
14 changes: 0 additions & 14 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up
@@ -71,21 +71,7 @@ tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
vllm:
enabled: false
# TODO: manifest in GenAIExamples uses "meta-llama/Meta-Llama-3-8B-Instruct" instead?
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
# TODO: these are non-redundant/non-broken options used by Agent component,
# but I think their values should be handled inside vLLM component, with
# deployment applying numbers set in configMap, based on values YAML file
# variables.
extraCmdArgs: [
"--enforce-eager",
"--tensor-parallel-size", "1",
"--dtype", "auto",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048",
"--gpu-memory-utilization", "0.5"
]

# disable guardrails-usvc by default
# See guardrails-values.yaml for guardrail related options
Expand Down

0 comments on commit 908a4ee

Please sign in to comment.