From 3d134d260b8968eb9ca18162b2f0d86aa15a85b3 Mon Sep 17 00:00:00 2001
From: Tianyi Liu
Date: Fri, 7 Jun 2024 10:52:01 +0800
Subject: [PATCH] Fix the vLLM docker compose issues (#134)

* refine the vLLM docker compose

Signed-off-by: tianyil1

* update the vllm openai api call

Signed-off-by: tianyil1

* refine the default network configuration in the docker-compose

Signed-off-by: tianyil1

* refine the network config of docker compose and launch service

Signed-off-by: tianyil1

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: tianyil1
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/llms/text-generation/vllm/docker_compose_llm.yaml | 4 +++-
 comps/llms/text-generation/vllm/launch_vllm_service.sh  | 2 +-
 comps/llms/text-generation/vllm/llm.py                  | 5 +++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/comps/llms/text-generation/vllm/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
index a5c22dfe8..71f8b7a40 100644
--- a/comps/llms/text-generation/vllm/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
@@ -14,9 +14,10 @@ services:
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $LLM_MODEL_ID --port 80
+    command: /bin/sh -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model $LLM_MODEL_ID --port 80"
   llm:
     image: opea/gen-ai-comps:llm-vllm-server
     container_name: llm-vllm-server
@@ -26,6 +27,7 @@
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       vLLM_LLM_ENDPOINT: ${vLLM_LLM_ENDPOINT}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 49df281ae..c6fc04210 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -22,4 +22,4 @@ fi
 volume=$PWD/data
 
 # Build the Docker run command based on the number of cards
-docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --port $port_number"
+docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number"
diff --git a/comps/llms/text-generation/vllm/llm.py b/comps/llms/text-generation/vllm/llm.py
index cc3406654..250df963f 100644
--- a/comps/llms/text-generation/vllm/llm.py
+++ b/comps/llms/text-generation/vllm/llm.py
@@ -5,6 +5,7 @@
 
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import VLLMOpenAI
+from langsmith import traceable
 
 from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, opea_telemetry, register_microservice
 
@@ -28,12 +29,12 @@ def post_process_text(text: str):
     host="0.0.0.0",
     port=9000,
 )
-@opea_telemetry
+@traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")
     llm = VLLMOpenAI(
         openai_api_key="EMPTY",
-        endpoint_url=llm_endpoint + "/v1",
+        openai_api_base=llm_endpoint + "/v1",
         max_tokens=input.max_new_tokens,
         model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
         top_p=input.top_p,
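
For reference, the llm.py change above points LangChain's VLLMOpenAI client at the vLLM OpenAI-compatible route via openai_api_base instead of the unsupported endpoint_url keyword. A minimal standalone sketch of that call path is below; the endpoint URL, model name, and sampling values are assumptions mirroring the defaults visible in this patch, not values taken from elsewhere in the repo.

    # Sketch: query the vLLM OpenAI-compatible server started by the compose file.
    # Assumes vLLM_LLM_ENDPOINT points at the vllm-service container (default below).
    import os

    from langchain_community.llms import VLLMOpenAI

    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")

    llm = VLLMOpenAI(
        openai_api_key="EMPTY",               # vLLM does not validate the API key
        openai_api_base=llm_endpoint + "/v1", # OpenAI-compatible route served by vllm.entrypoints.openai.api_server
        model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
        max_tokens=128,                       # illustrative sampling parameters
        top_p=0.95,
    )

    print(llm.invoke("What is deep learning?"))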