Skip to content

Commit

Permalink
Fix vLLM and vLLM-on-Ray UT bug (opea-project#580)
Browse files (browse the repository at this point in the history)
Signed-off-by: Xinyao Wang <[email protected]>
Co-authored-by: chen, suyue <[email protected]>
Signed-off-by: dmsuehir <[email protected]>
  • Loading branch information
2 people authored and dmsuehir committed Aug 16, 2024
1 parent 5b02384 commit 1ecee49
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 34 deletions.
12 changes: 6 additions & 6 deletions ChatQnA/docker/gaudi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090"
export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export vLLM_LLM_ENDPOINT="http://${host_ip}:8008"
export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008"
export TGI_LLM_ENDPOINT="http://${host_ip}:8005"
export vLLM_LLM_ENDPOINT="http://${host_ip}:8007"
export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006"
export LLM_SERVICE_PORT=9000
export REDIS_URL="redis://${host_ip}:6379"
export INDEX_NAME="rag-redis"
Expand Down Expand Up @@ -296,15 +296,15 @@ curl http://${host_ip}:8000/v1/reranking \

```bash
#TGI Service
curl http://${host_ip}:8008/generate \
curl http://${host_ip}:8005/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
-H 'Content-Type: application/json'
```

```bash
#vLLM Service
curl http://${host_ip}:8008/v1/completions \
curl http://${host_ip}:8007/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "${LLM_MODEL_ID}",
Expand All @@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \

```bash
#vLLM-on-Ray Service
curl http://${host_ip}:8008/v1/chat/completions \
curl http://${host_ip}:8006/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```
Expand Down
2 changes: 1 addition & 1 deletion ChatQnA/docker/gaudi/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ services:
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
container_name: tgi-gaudi-server
ports:
- "8008:80"
- "8005:80"
volumes:
- "./data:/data"
environment:
Expand Down
6 changes: 3 additions & 3 deletions ChatQnA/docker/gaudi/compose_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ services:
image: opea/llm-vllm-hpu:latest
container_name: vllm-gaudi-server
ports:
- "8008:80"
- "8007:80"
volumes:
- "./data:/data"
environment:
Expand All @@ -122,12 +122,12 @@ services:
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL: ${LLM_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
llm:
image: opea/llm-vllm:latest
container_name: llm-vllm-gaudi-server
Expand Down
6 changes: 3 additions & 3 deletions ChatQnA/docker/gaudi/compose_vllm_ray.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ services:
image: opea/llm-vllm-ray-hpu:latest
container_name: vllm-ray-gaudi-server
ports:
- "8008:8000"
- "8006:8000"
volumes:
- "./data:/data"
environment:
Expand All @@ -122,12 +122,12 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL: ${LLM_MODEL_ID}
LLM_MODEL_ID: ${LLM_MODEL_ID}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True"
command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True"
llm:
image: opea/llm-vllm-ray:latest
container_name: llm-vllm-ray-gaudi-server
Expand Down
4 changes: 2 additions & 2 deletions ChatQnA/tests/test_chatqna_on_gaudi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ function start_services() {
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
export TGI_LLM_ENDPOINT="http://${ip_address}:8005"
export REDIS_URL="redis://${ip_address}:6379"
export REDIS_HOST=${ip_address}
export INDEX_NAME="rag-redis"
Expand Down Expand Up @@ -215,7 +215,7 @@ function validate_microservices() {

# tgi for llm service
validate_service \
"${ip_address}:8008/generate" \
"${ip_address}:8005/generate" \
"generated_text" \
"tgi-llm" \
"tgi-gaudi-server" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ function start_services() {
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
export vLLM_LLM_ENDPOINT="http://${ip_address}:8008"
export vLLM_LLM_ENDPOINT="http://${ip_address}:8007"
export LLM_SERVICE_PORT=9000
export REDIS_URL="redis://${ip_address}:6379"
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
Expand Down Expand Up @@ -79,12 +80,13 @@ function start_services() {
# Start Docker Containers
docker compose -f compose_vllm.yaml up -d
n=0
until [[ "$n" -ge 180 ]]; do
until [[ "$n" -ge 25 ]]; do
echo "n=$n"
docker logs vllm-gaudi-server > vllm_service_start.log
if grep -q Connected vllm_service_start.log; then
if grep -q "Warmup finished" vllm_service_start.log; then
break
fi
sleep 1s
sleep 20s
n=$((n+1))
done
}
Expand Down Expand Up @@ -165,7 +167,7 @@ function validate_microservices() {

# vllm for llm service
validate_services \
"${ip_address}:8008/v1/completions" \
"${ip_address}:8007/v1/completions" \
"text" \
"vllm-llm" \
"vllm-gaudi-server" \
Expand All @@ -185,7 +187,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:8888/v1/chatqna" \
"billion" \
"data:" \
"mega-chatqna" \
"chatqna-gaudi-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,15 @@ function build_docker_images() {
cd $WORKPATH/docker/ui
docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile .

# cd $WORKPATH
# git clone https://github.com/vllm-project/vllm.git
# cd vllm
# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
# cd $WORKPATH
# git clone https://github.com/vllm-project/vllm.git
# cd vllm
# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .

docker images
}

function start_services() {
# build vllm for each test instead of pull from local registry
cd $WORKPATH
git clone https://github.com/vllm-project/vllm.git
cd vllm
Expand Down Expand Up @@ -73,18 +72,19 @@ function start_services() {
sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml
sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml
sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml
sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml
fi
fi

# Start Docker Containers
docker compose -f compose_vllm.yaml up -d
n=0
until [[ "$n" -ge 100 ]]; do
until [[ "$n" -ge 10 ]]; do
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log
if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 1s
sleep 10s
n=$((n+1))
done
}
Expand Down Expand Up @@ -185,7 +185,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:8888/v1/chatqna" \
"billion" \
"data" \
"mega-chatqna" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ function start_services() {
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008"
export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006"
export LLM_SERVICE_PORT=9000
export REDIS_URL="redis://${ip_address}:6379"
export INDEX_NAME="rag-redis"
Expand Down Expand Up @@ -80,12 +80,13 @@ function start_services() {
# Start Docker Containers
docker compose -f compose_vllm_ray.yaml up -d
n=0
until [[ "$n" -ge 400 ]]; do
until [[ "$n" -ge 25 ]]; do
echo "n=$n"
docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log
if grep -q Connected vllm_ray_service_start.log; then
if grep -q "Warmup finished" vllm_ray_service_start.log; then
break
fi
sleep 1s
sleep 20s
n=$((n+1))
done
}
Expand Down Expand Up @@ -166,7 +167,7 @@ function validate_microservices() {

# vllm-on-ray for llm service
validate_services \
"${ip_address}:8008/v1/chat/completions" \
"${ip_address}:8006/v1/chat/completions" \
"content" \
"vllm-ray-llm" \
"vllm-ray-gaudi-server" \
Expand Down

0 comments on commit 1ecee49

Please sign in to comment.