From cfcac3f0ec50d9db3852cf5790237617b67f61ec Mon Sep 17 00:00:00 2001 From: XinyaoWa Date: Wed, 14 Aug 2024 18:03:25 +0800 Subject: [PATCH] Fix vLLM and vLLM-on-Ray UT bug (#580) Signed-off-by: Xinyao Wang Co-authored-by: chen, suyue --- ChatQnA/docker/gaudi/README.md | 12 ++++++------ ChatQnA/docker/gaudi/compose.yaml | 2 +- ChatQnA/docker/gaudi/compose_vllm.yaml | 6 +++--- ChatQnA/docker/gaudi/compose_vllm_ray.yaml | 6 +++--- ChatQnA/tests/test_chatqna_on_gaudi.sh | 4 ++-- ...on_gaudi.sh => test_chatqna_vllm_on_gaudi.sh} | 14 ++++++++------ ...m_on_xeon.sh => test_chatqna_vllm_on_xeon.sh} | 16 ++++++++-------- ...audi.sh => test_chatqna_vllm_ray_on_gaudi.sh} | 11 ++++++----- 8 files changed, 37 insertions(+), 34 deletions(-) rename ChatQnA/tests/{_test_chatqna_vllm_on_gaudi.sh => test_chatqna_vllm_on_gaudi.sh} (96%) rename ChatQnA/tests/{_test_chatqna_vllm_on_xeon.sh => test_chatqna_vllm_on_xeon.sh} (96%) rename ChatQnA/tests/{_test_chatqna_vllm_ray_on_gaudi.sh => test_chatqna_vllm_ray_on_gaudi.sh} (97%) diff --git a/ChatQnA/docker/gaudi/README.md b/ChatQnA/docker/gaudi/README.md index 91df664fd..75bd412d5 100644 --- a/ChatQnA/docker/gaudi/README.md +++ b/ChatQnA/docker/gaudi/README.md @@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090" export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" -export TGI_LLM_ENDPOINT="http://${host_ip}:8008" -export vLLM_LLM_ENDPOINT="http://${host_ip}:8008" -export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008" +export TGI_LLM_ENDPOINT="http://${host_ip}:8005" +export vLLM_LLM_ENDPOINT="http://${host_ip}:8007" +export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006" export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" @@ -296,7 +296,7 @@ curl http://${host_ip}:8000/v1/reranking \ ```bash #TGI Service -curl http://${host_ip}:8008/generate \ +curl http://${host_ip}:8005/generate \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ -H 'Content-Type: application/json' @@ -304,7 +304,7 @@ curl http://${host_ip}:8008/generate \ ```bash #vLLM Service -curl http://${host_ip}:8008/v1/completions \ +curl http://${host_ip}:8007/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "${LLM_MODEL_ID}", @@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \ ```bash #vLLM-on-Ray Service -curl http://${host_ip}:8008/v1/chat/completions \ +curl http://${host_ip}:8006/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' ``` diff --git a/ChatQnA/docker/gaudi/compose.yaml b/ChatQnA/docker/gaudi/compose.yaml index 362ce1824..82b8d0b77 100644 --- a/ChatQnA/docker/gaudi/compose.yaml +++ b/ChatQnA/docker/gaudi/compose.yaml @@ -114,7 +114,7 @@ services: image: ghcr.io/huggingface/tgi-gaudi:2.0.1 container_name: tgi-gaudi-server ports: - - "8008:80" + - "8005:80" volumes: - "./data:/data" environment: diff --git a/ChatQnA/docker/gaudi/compose_vllm.yaml b/ChatQnA/docker/gaudi/compose_vllm.yaml index 9082bff46..ec82c9f22 100644 --- a/ChatQnA/docker/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker/gaudi/compose_vllm.yaml @@ -112,7 +112,7 @@ services: image: opea/llm-vllm-hpu:latest container_name: vllm-gaudi-server ports: - - "8008:80" + - "8007:80" volumes: - "./data:/data" environment: @@ -122,12 +122,12 @@ services: HF_TOKEN: 
${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL: ${LLM_MODEL_ID} + LLM_MODEL_ID: ${LLM_MODEL_ID} runtime: habana cap_add: - SYS_NICE ipc: host - command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80" + command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048" llm: image: opea/llm-vllm:latest container_name: llm-vllm-gaudi-server diff --git a/ChatQnA/docker/gaudi/compose_vllm_ray.yaml b/ChatQnA/docker/gaudi/compose_vllm_ray.yaml index d7fa0ca6e..5d98a1600 100644 --- a/ChatQnA/docker/gaudi/compose_vllm_ray.yaml +++ b/ChatQnA/docker/gaudi/compose_vllm_ray.yaml @@ -112,7 +112,7 @@ services: image: opea/llm-vllm-ray-hpu:latest container_name: vllm-ray-gaudi-server ports: - - "8008:8000" + - "8006:8000" volumes: - "./data:/data" environment: @@ -122,12 +122,12 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL: ${LLM_MODEL_ID} + LLM_MODEL_ID: ${LLM_MODEL_ID} runtime: habana cap_add: - SYS_NICE ipc: host - command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True" + command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True" llm: image: opea/llm-vllm-ray:latest container_name: llm-vllm-ray-gaudi-server diff --git a/ChatQnA/tests/test_chatqna_on_gaudi.sh b/ChatQnA/tests/test_chatqna_on_gaudi.sh index 3a5e7537b..7c25cb003 100644 --- a/ChatQnA/tests/test_chatqna_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_on_gaudi.sh @@ -50,7 +50,7 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:8008" + export TGI_LLM_ENDPOINT="http://${ip_address}:8005" export REDIS_URL="redis://${ip_address}:6379" export REDIS_HOST=${ip_address} export INDEX_NAME="rag-redis" @@ -215,7 +215,7 @@ function validate_microservices() { # tgi for llm service validate_service \ - "${ip_address}:8008/generate" \ + "${ip_address}:8005/generate" \ "generated_text" \ "tgi-llm" \ "tgi-gaudi-server" \ diff --git a/ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh b/ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh similarity index 96% rename from ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh rename to ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh index 716a20678..99e63e06b 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh @@ -50,7 +50,8 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export vLLM_LLM_ENDPOINT="http://${ip_address}:8008" + export vLLM_LLM_ENDPOINT="http://${ip_address}:8007" + export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} 
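With this patch the three LLM backends no longer share host port 8008: TGI is published on 8005, vLLM on 8007, and vLLM-on-Ray on 8006, as reflected in the README and compose hunks above. As a quick illustration (not part of the patch itself), a smoke test against the remapped endpoints might look like the sketch below; it assumes `host_ip` and `LLM_MODEL_ID` are exported as in the README section of this change.

```bash
# Illustrative smoke test for the remapped host ports (not part of this patch).
# Assumes host_ip and LLM_MODEL_ID are exported as in the README above.

# TGI service (host port 8005)
curl http://${host_ip}:8005/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
  -H 'Content-Type: application/json'

# vLLM service (host port 8007)
curl http://${host_ip}:8007/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}'

# vLLM-on-Ray service (host port 8006)
curl http://${host_ip}:8006/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```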
@@ -79,12 +80,13 @@ function start_services() { # Start Docker Containers docker compose -f compose_vllm.yaml up -d n=0 - until [[ "$n" -ge 180 ]]; do + until [[ "$n" -ge 25 ]]; do + echo "n=$n" docker logs vllm-gaudi-server > vllm_service_start.log - if grep -q Connected vllm_service_start.log; then + if grep -q "Warmup finished" vllm_service_start.log; then break fi - sleep 1s + sleep 20s n=$((n+1)) done } @@ -165,7 +167,7 @@ function validate_microservices() { # vllm for llm service validate_services \ - "${ip_address}:8008/v1/completions" \ + "${ip_address}:8007/v1/completions" \ "text" \ "vllm-llm" \ "vllm-gaudi-server" \ @@ -185,7 +187,7 @@ function validate_megaservice() { # Curl the Mega Service validate_services \ "${ip_address}:8888/v1/chatqna" \ - "billion" \ + "data:" \ "mega-chatqna" \ "chatqna-gaudi-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' diff --git a/ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh b/ChatQnA/tests/test_chatqna_vllm_on_xeon.sh similarity index 96% rename from ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh rename to ChatQnA/tests/test_chatqna_vllm_on_xeon.sh index e92954313..a424187cd 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_chatqna_vllm_on_xeon.sh @@ -26,16 +26,15 @@ function build_docker_images() { cd $WORKPATH/docker/ui docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile . -# cd $WORKPATH -# git clone https://github.com/vllm-project/vllm.git -# cd vllm -# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu . + # cd $WORKPATH + # git clone https://github.com/vllm-project/vllm.git + # cd vllm + # docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu . docker images } function start_services() { - # build vllm for each test instead of pull from local registry cd $WORKPATH git clone https://github.com/vllm-project/vllm.git cd vllm @@ -73,18 +72,19 @@ function start_services() { sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml + sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml fi fi # Start Docker Containers docker compose -f compose_vllm.yaml up -d n=0 - until [[ "$n" -ge 100 ]]; do + until [[ "$n" -ge 10 ]]; do docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then break fi - sleep 1s + sleep 10s n=$((n+1)) done } @@ -185,7 +185,7 @@ function validate_megaservice() { # Curl the Mega Service validate_services \ "${ip_address}:8888/v1/chatqna" \ - "billion" \ + "data" \ "mega-chatqna" \ "chatqna-xeon-backend-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' diff --git a/ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh similarity index 97% rename from ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh rename to ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh index ab41c3bb8..9099598bb 100644 --- a/ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh +++ b/ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh @@ -50,7 +50,7 @@ function start_services() { export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export 
vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008" + export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006" export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" @@ -80,12 +80,13 @@ function start_services() { # Start Docker Containers docker compose -f compose_vllm_ray.yaml up -d n=0 - until [[ "$n" -ge 400 ]]; do + until [[ "$n" -ge 25 ]]; do + echo "n=$n" docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log - if grep -q Connected vllm_ray_service_start.log; then + if grep -q "Warmup finished" vllm_ray_service_start.log; then break fi - sleep 1s + sleep 20s n=$((n+1)) done } @@ -166,7 +167,7 @@ function validate_microservices() { # vllm-on-ray for llm service validate_services \ - "${ip_address}:8008/v1/chat/completions" \ + "${ip_address}:8006/v1/chat/completions" \ "content" \ "vllm-ray-llm" \ "vllm-ray-gaudi-server" \
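The updated Gaudi tests also replace the short 1-second polling loops with a longer-interval wait for HPU warmup: they grep the container logs for "Warmup finished" every 20 seconds, up to 25 attempts. As a standalone illustration of that pattern (the helper name is hypothetical; the container names, marker string, and retry budget are taken from the hunks above):

```bash
#!/bin/bash
# Hypothetical helper illustrating the readiness-wait pattern used by the
# updated tests: poll a container's logs until a marker line appears,
# sleeping 20s between checks, for at most 25 attempts (~8 minutes total).
wait_for_log_marker() {
    local container=$1      # e.g. vllm-gaudi-server or vllm-ray-gaudi-server
    local marker=$2         # e.g. "Warmup finished"
    local n=0
    until [[ "$n" -ge 25 ]]; do
        echo "n=$n"
        docker logs "$container" > service_start.log 2>&1
        if grep -q "$marker" service_start.log; then
            return 0
        fi
        sleep 20s
        n=$((n+1))
    done
    return 1
}

# Usage (illustrative):
# wait_for_log_marker vllm-gaudi-server "Warmup finished"
```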