From ddd372d3e40d6945abb1f89c5316f97252f1a5a4 Mon Sep 17 00:00:00 2001
From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com>
Date: Tue, 10 Dec 2024 13:19:56 +0800
Subject: [PATCH] Remove enforce-eager to enable HPU graphs for better vLLM perf (#954)

* remove enforce-eager to enable HPU graphs

Signed-off-by: Wang, Kai Lawrence

* Increase the llm max timeout in ci for fully warmup

Signed-off-by: Wang, Kai Lawrence

---------

Signed-off-by: Wang, Kai Lawrence
---
 .../faq-generation/vllm/langchain/docker_compose_llm.yaml     | 2 +-
 .../llms/summarization/vllm/langchain/docker_compose_llm.yaml | 2 +-
 .../vllm/langchain/dependency/launch_vllm_service.sh          | 2 +-
 .../text-generation/vllm/langchain/docker_compose_llm.yaml    | 2 +-
 .../vllm/llama_index/dependency/launch_vllm_service.sh        | 2 +-
 .../text-generation/vllm/llama_index/docker_compose_llm.yaml  | 2 +-
 .../test_llms_text-generation_vllm_langchain_on_intel_hpu.sh  | 4 ++--
 .../test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh | 4 ++--
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml
index 8ed64dd97..8b26dd751 100644
--- a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml
+++ b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-faqgen-vllm:latest
     container_name: llm-faqgen-server
diff --git a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml
index b93fd8030..720e7056d 100644
--- a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml
+++ b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-docsum-vllm:latest
     container_name: llm-docsum-vllm-server
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
index 6f6a7d211..83ecd6753 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
index e817c9f35..077ceee8b 100644
--- a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
index d3363aa40..c8b2790b4 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
index eeed7d19a..6bfc0d500 100644
--- a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm-llamaindex:latest
     container_name: llm-vllm-gaudi-server
diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
index b1fd41e9a..6b8e468f8 100644
--- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -48,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-gaudi:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
 
     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -62,7 +62,7 @@ function start_service() {
 
     # check whether vllm ray is fully ready
     n=0
-    until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
+    until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
         if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
index 3019d6c08..91a30ed85 100644
--- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
@@ -48,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-gaudi:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
 
     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -62,7 +62,7 @@ function start_service() {
 
     # check whether vllm ray is fully ready
     n=0
-    until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
+    until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
         if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then