From 3d134d260b8968eb9ca18162b2f0d86aa15a85b3 Mon Sep 17 00:00:00 2001
From: Tianyi Liu
Date: Fri, 7 Jun 2024 10:52:01 +0800
Subject: [PATCH] Fix the vLLM docker compose issues (#134)

* refine the vLLM docker compose

Signed-off-by: tianyil1

* update the vllm openai api call

Signed-off-by: tianyil1

* refine the default network configuration in the docker-compose

Signed-off-by: tianyil1

* refine the network config of docker compose and launch service

Signed-off-by: tianyil1

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: tianyil1
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/llms/text-generation/vllm/docker_compose_llm.yaml | 4 +++-
 comps/llms/text-generation/vllm/launch_vllm_service.sh  | 2 +-
 comps/llms/text-generation/vllm/llm.py                  | 5 +++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/comps/llms/text-generation/vllm/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
index a5c22dfe8..71f8b7a40 100644
--- a/comps/llms/text-generation/vllm/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
@@ -14,9 +14,10 @@ services:
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $LLM_MODEL_ID --port 80
+    command: /bin/sh -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model $LLM_MODEL_ID --port 80"
   llm:
     image: opea/gen-ai-comps:llm-vllm-server
     container_name: llm-vllm-server
@@ -26,6 +27,7 @@
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       vLLM_LLM_ENDPOINT: ${vLLM_LLM_ENDPOINT}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 49df281ae..c6fc04210 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -22,4 +22,4 @@ fi
 volume=$PWD/data
 
 # Build the Docker run command based on the number of cards
-docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --port $port_number"
+docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number"
diff --git a/comps/llms/text-generation/vllm/llm.py b/comps/llms/text-generation/vllm/llm.py
index cc3406654..250df963f 100644
--- a/comps/llms/text-generation/vllm/llm.py
+++ b/comps/llms/text-generation/vllm/llm.py
@@ -5,6 +5,7 @@
 
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import VLLMOpenAI
+from langsmith import traceable
 
 from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, opea_telemetry, register_microservice
 
@@ -28,12 +29,12 @@ def post_process_text(text: str):
     host="0.0.0.0",
     port=9000,
 )
-@opea_telemetry
+@traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")
     llm = VLLMOpenAI(
         openai_api_key="EMPTY",
-        endpoint_url=llm_endpoint + "/v1",
+        openai_api_base=llm_endpoint + "/v1",
         max_tokens=input.max_new_tokens,
         model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
         top_p=input.top_p,
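
For reference, the llm.py change above points LangChain's VLLMOpenAI client at the vLLM OpenAI-compatible route via openai_api_base instead of the unsupported endpoint_url keyword. A minimal standalone sketch of that call path is below; the endpoint URL, model name, and sampling values are assumptions mirroring the defaults visible in this patch, not values taken from elsewhere in the repo.

    # Sketch: query the vLLM OpenAI-compatible server started by the compose file.
    # Assumes vLLM_LLM_ENDPOINT points at the vllm-service container (default below).
    import os

    from langchain_community.llms import VLLMOpenAI

    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")

    llm = VLLMOpenAI(
        openai_api_key="EMPTY",               # vLLM does not validate the API key
        openai_api_base=llm_endpoint + "/v1", # OpenAI-compatible route served by vllm.entrypoints.openai.api_server
        model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
        max_tokens=128,                       # illustrative sampling parameters
        top_p=0.95,
    )

    print(llm.invoke("What is deep learning?"))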