From cfcac3f0ec50d9db3852cf5790237617b67f61ec Mon Sep 17 00:00:00 2001 From: XinyaoWa Date: Wed, 14 Aug 2024 18:03:25 +0800 Subject: [PATCH] Fix vLLM and vLLM-on-Ray UT bug (#580) Signed-off-by: Xinyao Wang Co-authored-by: chen, suyue --- ChatQnA/docker/gaudi/README.md | 12 ++++++------ ChatQnA/docker/gaudi/compose.yaml | 2 +- ChatQnA/docker/gaudi/compose_vllm.yaml | 6 +++--- ChatQnA/docker/gaudi/compose_vllm_ray.yaml | 6 +++--- ChatQnA/tests/test_chatqna_on_gaudi.sh | 4 ++-- ...on_gaudi.sh => test_chatqna_vllm_on_gaudi.sh} | 14 ++++++++------ ...m_on_xeon.sh => test_chatqna_vllm_on_xeon.sh} | 16 ++++++++-------- ...audi.sh => test_chatqna_vllm_ray_on_gaudi.sh} | 11 ++++++----- 8 files changed, 37 insertions(+), 34 deletions(-) rename ChatQnA/tests/{_test_chatqna_vllm_on_gaudi.sh => test_chatqna_vllm_on_gaudi.sh} (96%) rename ChatQnA/tests/{_test_chatqna_vllm_on_xeon.sh => test_chatqna_vllm_on_xeon.sh} (96%) rename ChatQnA/tests/{_test_chatqna_vllm_ray_on_gaudi.sh => test_chatqna_vllm_ray_on_gaudi.sh} (97%) diff --git a/ChatQnA/docker/gaudi/README.md b/ChatQnA/docker/gaudi/README.md index 91df664fd..75bd412d5 100644 --- a/ChatQnA/docker/gaudi/README.md +++ b/ChatQnA/docker/gaudi/README.md @@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090" export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" -export TGI_LLM_ENDPOINT="http://${host_ip}:8008" -export vLLM_LLM_ENDPOINT="http://${host_ip}:8008" -export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008" +export TGI_LLM_ENDPOINT="http://${host_ip}:8005" +export vLLM_LLM_ENDPOINT="http://${host_ip}:8007" +export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006" export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" @@ -296,7 +296,7 @@ curl http://${host_ip}:8000/v1/reranking \ ```bash #TGI Service -curl http://${host_ip}:8008/generate \ +curl http://${host_ip}:8005/generate \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ -H 'Content-Type: application/json' @@ -304,7 +304,7 @@ curl http://${host_ip}:8008/generate \ ```bash #vLLM Service -curl http://${host_ip}:8008/v1/completions \ +curl http://${host_ip}:8007/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "${LLM_MODEL_ID}", @@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \ ```bash #vLLM-on-Ray Service -curl http://${host_ip}:8008/v1/chat/completions \ +curl http://${host_ip}:8006/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' ``` diff --git a/ChatQnA/docker/gaudi/compose.yaml b/ChatQnA/docker/gaudi/compose.yaml index 362ce1824..82b8d0b77 100644 --- a/ChatQnA/docker/gaudi/compose.yaml +++ b/ChatQnA/docker/gaudi/compose.yaml @@ -114,7 +114,7 @@ services: image: ghcr.io/huggingface/tgi-gaudi:2.0.1 container_name: tgi-gaudi-server ports: - - "8008:80" + - "8005:80" volumes: - "./data:/data" environment: diff --git a/ChatQnA/docker/gaudi/compose_vllm.yaml b/ChatQnA/docker/gaudi/compose_vllm.yaml index 9082bff46..ec82c9f22 100644 --- a/ChatQnA/docker/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker/gaudi/compose_vllm.yaml @@ -112,7 +112,7 @@ services: image: opea/llm-vllm-hpu:latest container_name: vllm-gaudi-server ports: - - "8008:80" + - "8007:80" volumes: - "./data:/data" environment: @@ -122,12 +122,12 @@ services: HF_TOKEN: 
${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL: ${LLM_MODEL_ID} + LLM_MODEL_ID: ${LLM_MODEL_ID} runtime: habana cap_add: - SYS_NICE ipc: host - command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80" + command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048" llm: image: opea/llm-vllm:latest container_name: llm-vllm-gaudi-server diff --git a/ChatQnA/docker/gaudi/compose_vllm_ray.yaml b/ChatQnA/docker/gaudi/compose_vllm_ray.yaml index d7fa0ca6e..5d98a1600 100644 --- a/ChatQnA/docker/gaudi/compose_vllm_ray.yaml +++ b/ChatQnA/docker/gaudi/compose_vllm_ray.yaml @@ -112,7 +112,7 @@ services: image: opea/llm-vllm-ray-hpu:latest container_name: vllm-ray-gaudi-server ports: - - "8008:8000" + - "8006:8000" volumes: - "./data:/data" environment: @@ -122,12 +122,12 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL: ${LLM_MODEL_ID} + LLM_MODEL_ID: ${LLM_MODEL_ID} runtime: habana cap_add: - SYS_NICE ipc: host - command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True" + command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True" llm: image: opea/llm-vllm-ray:latest container_name: llm-vllm-ray-gaudi-server diff --git a/ChatQnA/tests/test_chatqna_on_gaudi.sh b/ChatQnA/tests/test_chatqna_on_gaudi.sh index 3a5e7537b..7c25cb003 100644 --- a/ChatQnA/tests/test_chatqna_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_on_gaudi.sh @@ -50,7 +50,7 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:8008" + export TGI_LLM_ENDPOINT="http://${ip_address}:8005" export REDIS_URL="redis://${ip_address}:6379" export REDIS_HOST=${ip_address} export INDEX_NAME="rag-redis" @@ -215,7 +215,7 @@ function validate_microservices() { # tgi for llm service validate_service \ - "${ip_address}:8008/generate" \ + "${ip_address}:8005/generate" \ "generated_text" \ "tgi-llm" \ "tgi-gaudi-server" \ diff --git a/ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh b/ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh similarity index 96% rename from ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh rename to ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh index 716a20678..99e63e06b 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh @@ -50,7 +50,8 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export vLLM_LLM_ENDPOINT="http://${ip_address}:8008" + export vLLM_LLM_ENDPOINT="http://${ip_address}:8007" + export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} 
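With this patch the three LLM backends no longer share host port 8008: TGI is published on 8005, vLLM on 8007, and vLLM-on-Ray on 8006, as reflected in the README and compose hunks above. As a quick illustration (not part of the patch itself), a smoke test against the remapped endpoints might look like the sketch below; it assumes `host_ip` and `LLM_MODEL_ID` are exported as in the README section of this change.

```bash
# Illustrative smoke test for the remapped host ports (not part of this patch).
# Assumes host_ip and LLM_MODEL_ID are exported as in the README above.

# TGI service (host port 8005)
curl http://${host_ip}:8005/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
  -H 'Content-Type: application/json'

# vLLM service (host port 8007)
curl http://${host_ip}:8007/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}'

# vLLM-on-Ray service (host port 8006)
curl http://${host_ip}:8006/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```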
@@ -79,12 +80,13 @@ function start_services() { # Start Docker Containers docker compose -f compose_vllm.yaml up -d n=0 - until [[ "$n" -ge 180 ]]; do + until [[ "$n" -ge 25 ]]; do + echo "n=$n" docker logs vllm-gaudi-server > vllm_service_start.log - if grep -q Connected vllm_service_start.log; then + if grep -q "Warmup finished" vllm_service_start.log; then break fi - sleep 1s + sleep 20s n=$((n+1)) done } @@ -165,7 +167,7 @@ function validate_microservices() { # vllm for llm service validate_services \ - "${ip_address}:8008/v1/completions" \ + "${ip_address}:8007/v1/completions" \ "text" \ "vllm-llm" \ "vllm-gaudi-server" \ @@ -185,7 +187,7 @@ function validate_megaservice() { # Curl the Mega Service validate_services \ "${ip_address}:8888/v1/chatqna" \ - "billion" \ + "data:" \ "mega-chatqna" \ "chatqna-gaudi-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' diff --git a/ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh b/ChatQnA/tests/test_chatqna_vllm_on_xeon.sh similarity index 96% rename from ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh rename to ChatQnA/tests/test_chatqna_vllm_on_xeon.sh index e92954313..a424187cd 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_chatqna_vllm_on_xeon.sh @@ -26,16 +26,15 @@ function build_docker_images() { cd $WORKPATH/docker/ui docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile . -# cd $WORKPATH -# git clone https://github.com/vllm-project/vllm.git -# cd vllm -# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu . + # cd $WORKPATH + # git clone https://github.com/vllm-project/vllm.git + # cd vllm + # docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu . docker images } function start_services() { - # build vllm for each test instead of pull from local registry cd $WORKPATH git clone https://github.com/vllm-project/vllm.git cd vllm @@ -73,18 +72,19 @@ function start_services() { sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml + sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml fi fi # Start Docker Containers docker compose -f compose_vllm.yaml up -d n=0 - until [[ "$n" -ge 100 ]]; do + until [[ "$n" -ge 10 ]]; do docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then break fi - sleep 1s + sleep 10s n=$((n+1)) done } @@ -185,7 +185,7 @@ function validate_megaservice() { # Curl the Mega Service validate_services \ "${ip_address}:8888/v1/chatqna" \ - "billion" \ + "data" \ "mega-chatqna" \ "chatqna-xeon-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' diff --git a/ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh similarity index 97% rename from ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh rename to ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh index ab41c3bb8..9099598bb 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh @@ -50,7 +50,7 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export 
vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008" + export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006" export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" @@ -80,12 +80,13 @@ function start_services() { # Start Docker Containers docker compose -f compose_vllm_ray.yaml up -d n=0 - until [[ "$n" -ge 400 ]]; do + until [[ "$n" -ge 25 ]]; do + echo "n=$n" docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log - if grep -q Connected vllm_ray_service_start.log; then + if grep -q "Warmup finished" vllm_ray_service_start.log; then break fi - sleep 1s + sleep 20s n=$((n+1)) done } @@ -166,7 +167,7 @@ function validate_microservices() { # vllm-on-ray for llm service validate_services \ - "${ip_address}:8008/v1/chat/completions" \ + "${ip_address}:8006/v1/chat/completions" \ "content" \ "vllm-ray-llm" \ "vllm-ray-gaudi-server" \
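The updated Gaudi tests also replace the short 1-second polling loops with a longer-interval wait for HPU warmup: they grep the container logs for "Warmup finished" every 20 seconds, up to 25 attempts. As a standalone illustration of that pattern (the helper name is hypothetical; the container names, marker string, and retry budget are taken from the hunks above):

```bash
#!/bin/bash
# Hypothetical helper illustrating the readiness-wait pattern used by the
# updated tests: poll a container's logs until a marker line appears,
# sleeping 20s between checks, for at most 25 attempts (~8 minutes total).
wait_for_log_marker() {
    local container=$1      # e.g. vllm-gaudi-server or vllm-ray-gaudi-server
    local marker=$2         # e.g. "Warmup finished"
    local n=0
    until [[ "$n" -ge 25 ]]; do
        echo "n=$n"
        docker logs "$container" > service_start.log 2>&1
        if grep -q "$marker" service_start.log; then
            return 0
        fi
        sleep 20s
        n=$((n+1))
    done
    return 1
}

# Usage (illustrative):
# wait_for_log_marker vllm-gaudi-server "Warmup finished"
```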