diff --git a/comps/llms/README.md b/comps/llms/README.md index 1892844b6..15c7c366c 100644 --- a/comps/llms/README.md +++ b/comps/llms/README.md @@ -130,7 +130,6 @@ export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export vLLM_LLM_ENDPOINT="http://${your_ip}:8008" export LLM_MODEL_ID=${your_hf_llm_model} export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY=${your_langchain_api_key} export LANGCHAIN_PROJECT="opea/llms" ``` @@ -141,8 +140,8 @@ export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export RAY_Serve_ENDPOINT="http://${your_ip}:8008" export LLM_MODEL=${your_hf_llm_model} export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY=${your_langchain_api_key} export LANGCHAIN_PROJECT="opea/llms" +export CHAT_PROCESSOR="ChatModelLlama" ``` ## 2.2 Build Docker Image @@ -156,16 +155,32 @@ docker build -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build ### 2.2.2 vLLM +Build vllm docker. + ```bash -cd ../../ -docker build -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/Dockerfile . +bash build_docker_vllm.sh +``` + +Build microservice docker. + +```bash +cd ../../../../ +docker build -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/docker/Dockerfile.microservice . ``` ### 2.2.3 Ray Serve +Build Ray Serve docker. + ```bash -cd ../../ -docker built -t opeas/llm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ray_serve/Dockerfile . +bash build_docker_rayserve.sh +``` + +Build microservice docker. + +```bash +cd ../../../../ +docker build -t opea/llm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ray_serve/docker/Dockerfile.microservice . ``` To start a docker container, you have two options: @@ -185,12 +200,28 @@ docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$htt ### 2.3.2 vLLM +Start vllm endpoint. + +```bash +bash launch_vllm_service.sh +``` + +Start vllm microservice. + ```bash -docker run -d --name="llm-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL_ID=$LLM_MODEL_ID opea/llm-vllm:latest +docker run --name="llm-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=${no_proxy} -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL_ID=$LLM_MODEL_ID opea/llm-vllm:latest ``` ### 2.3.3 Ray Serve +Start Ray Serve endpoint. + +```bash +bash launch_ray_service.sh +``` + +Start Ray Serve microservice. + ```bash docker run -d --name="llm-ray-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e RAY_Serve_ENDPOINT=$RAY_Serve_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL=$LLM_MODEL opea/llm-ray:latest ``` @@ -250,11 +281,11 @@ curl http://${your_ip}:9000/v1/chat/completions \ ## 4. 
Validated Model -| Model | TGI-Gaudi | vLLM-CPU | Ray | -| ------------------------- | --------- | -------- | --- | -| Intel/neural-chat-7b-v3-3 | ✓ | ✓ | ✓ | -| Llama-2-7b-chat-hf | ✓ | ✓ | ✓ | -| Llama-2-70b-chat-hf | ✓ | - | x | -| Meta-Llama-3-8B-Instruct | ✓ | ✓ | ✓ | -| Meta-Llama-3-70B-Instruct | ✓ | - | x | -| Phi-3 | x | Limit 4K | ✓ | +| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | Ray | +| ------------------------- | --------- | -------- | ---------- | --- | +| Intel/neural-chat-7b-v3-3 | ✓ | ✓ | ✓ | ✓ | +| Llama-2-7b-chat-hf | ✓ | ✓ | ✓ | ✓ | +| Llama-2-70b-chat-hf | ✓ | - | ✓ | x | +| Meta-Llama-3-8B-Instruct | ✓ | ✓ | ✓ | ✓ | +| Meta-Llama-3-70B-Instruct | ✓ | - | ✓ | x | +| Phi-3 | x | Limit 4K | Limit 4K | ✓ | diff --git a/comps/llms/text-generation/ray_serve/README.md b/comps/llms/text-generation/ray_serve/README.md index 71384aab6..6549ad960 100644 --- a/comps/llms/text-generation/ray_serve/README.md +++ b/comps/llms/text-generation/ray_serve/README.md @@ -21,7 +21,7 @@ export HF_TOKEN= And then you can make requests with the OpenAI-compatible APIs like below to check the service status: ```bash -curl http://127.0.0.1:8080/v1/chat/completions \ +curl http://127.0.0.1:8008/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": , @@ -45,6 +45,6 @@ The ./serving/ray/launch_ray_service.sh script accepts five parameters: You have the flexibility to customize five parameters according to your specific needs. Additionally, you can set the Ray Gaudi endpoint by exporting the environment variable `RAY_Serve_ENDPOINT`: ```bash -export RAY_Serve_ENDPOINT="http://xxx.xxx.xxx.xxx:8080" +export RAY_Serve_ENDPOINT="http://xxx.xxx.xxx.xxx:8008" export LLM_MODEL= # example: export LLM_MODEL="meta-llama/Llama-2-7b-chat-hf" ``` diff --git a/comps/llms/text-generation/ray_serve/build_docker.sh b/comps/llms/text-generation/ray_serve/build_docker_rayserve.sh similarity index 88% rename from comps/llms/text-generation/ray_serve/build_docker.sh rename to comps/llms/text-generation/ray_serve/build_docker_rayserve.sh index a307f45b6..39a8d30c0 100755 --- a/comps/llms/text-generation/ray_serve/build_docker.sh +++ b/comps/llms/text-generation/ray_serve/build_docker_rayserve.sh @@ -7,7 +7,7 @@ cd docker docker build \ - -f Dockerfile ../../ \ + -f Dockerfile.rayserve ../../ \ -t ray_serve:habana \ --network=host \ --build-arg http_proxy=${http_proxy} \ diff --git a/comps/llms/text-generation/ray_serve/Dockerfile b/comps/llms/text-generation/ray_serve/docker/Dockerfile.microservice similarity index 100% rename from comps/llms/text-generation/ray_serve/Dockerfile rename to comps/llms/text-generation/ray_serve/docker/Dockerfile.microservice diff --git a/comps/llms/text-generation/ray_serve/docker/Dockerfile b/comps/llms/text-generation/ray_serve/docker/Dockerfile.rayserve similarity index 100% rename from comps/llms/text-generation/ray_serve/docker/Dockerfile rename to comps/llms/text-generation/ray_serve/docker/Dockerfile.rayserve diff --git a/comps/llms/text-generation/ray_serve/docker_compose_llm.yaml b/comps/llms/text-generation/ray_serve/docker_compose_llm.yaml index 797f4da80..570c61bcf 100644 --- a/comps/llms/text-generation/ray_serve/docker_compose_llm.yaml +++ b/comps/llms/text-generation/ray_serve/docker_compose_llm.yaml @@ -16,28 +16,26 @@ version: "3.8" services: ray_service: - image: rayllm:habana + image: ray_serve:habana container_name: ray-service ports: - "8008:80" runtime: habana + ipc: host + volumes: + - "./data:/data" environment: - - 
OMPI_MCA_btl_vader_single_copy_mechanism=none
-      - HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-      - TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE}
-      - LLM_MODEL=${LLM_MODEL}
-      - CHAT_PROCESSOR=${CHAT_PROCESSOR}
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      TRUST_REMOTE_CODE: "True"
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      LLM_MODEL: ${LLM_MODEL}
+      CHAT_PROCESSOR: ${CHAT_PROCESSOR}
+      HABANA_VISIBLE_DEVICES: all
     cap_add:
-      - SYS_NICE
-    command: >
-      /bin/bash -c "ray start --head &&
-      python api_server_openai.py --port_number 80
-      --model_id_or_path ${LLM_MODEL}
-      --chat_processor ${CHAT_PROCESSOR}
-      --num_cpus_per_worker 8
-      --num_hpus_per_worker 1"
+      - sys_nice
+    command: /bin/bash -c "ray start --head && python api_server_openai.py --port_number 80 --model_id_or_path ${LLM_MODEL} --chat_processor ${CHAT_PROCESSOR} --num_cpus_per_worker 8 --num_hpus_per_worker 1"
   llm:
-    image: opea/gen-ai-comps:llm-ray-server
+    image: opea/llm-ray:latest
     container_name: llm-ray-server
     ports:
       - "9000:9000"
@@ -48,7 +46,6 @@ services:
       RAY_Serve_ENDPOINT: ${RAY_Serve_ENDPOINT}
       LLM_MODEL: ${LLM_MODEL}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
     restart: unless-stopped
 
 networks:
diff --git a/comps/llms/text-generation/ray_serve/launch_ray_service.sh b/comps/llms/text-generation/ray_serve/launch_ray_service.sh
index 5a2c4bb45..a7e825880 100755
--- a/comps/llms/text-generation/ray_serve/launch_ray_service.sh
+++ b/comps/llms/text-generation/ray_serve/launch_ray_service.sh
@@ -5,8 +5,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Set default values
-default_port=8080
-default_model="meta-llama/Llama-2-7b-chat-hf"
+default_port=8008
+default_model=${LLM_MODEL}
 default_chat_processor="ChatModelLlama"
 default_num_cpus_per_worker=8
 default_num_hpus_per_worker=1
@@ -31,4 +31,4 @@ if [ "$#" -lt 0 ] || [ "$#" -gt 5 ]; then
 fi
 
 # Build the Docker run command based on the number of cards
-docker run -it --runtime=habana --name="ChatQnA_server" -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -p $port_number:$port_number -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=$TRUST_REMOTE_CODE ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number $port_number --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
+docker run -it --runtime=habana --name="ray-service" -v $PWD/data:/data -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -p $port_number:80 -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=True ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number 80 --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md
index 338631552..6f202c268 100644
--- a/comps/llms/text-generation/vllm/README.md
+++ b/comps/llms/text-generation/vllm/README.md
@@ -25,7 +25,7 @@ export HF_TOKEN=
 And then you can make requests like below to check the service status:
 
 ```bash
-curl http://127.0.0.1:8080/v1/completions \
+curl http://127.0.0.1:8008/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
   "model": ,
@@ -46,6 +46,6 @@ The `./serving/vllm/launch_vllm_service.sh` script accepts three parameters:
 You have the flexibility to customize two parameters according to your specific needs. Additionally, you can set the vLLM endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`:
 
 ```bash
-export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8080"
+export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8008"
 export LLM_MODEL= # example: export LLM_MODEL="Intel/neural-chat-7b-v3-3"
 ```
diff --git a/comps/llms/text-generation/vllm/build_docker.sh b/comps/llms/text-generation/vllm/build_docker_vllm.sh
similarity index 100%
rename from comps/llms/text-generation/vllm/build_docker.sh
rename to comps/llms/text-generation/vllm/build_docker_vllm.sh
diff --git a/comps/llms/text-generation/vllm/Dockerfile b/comps/llms/text-generation/vllm/docker/Dockerfile.microservice
similarity index 100%
rename from comps/llms/text-generation/vllm/Dockerfile
rename to comps/llms/text-generation/vllm/docker/Dockerfile.microservice
diff --git a/comps/llms/text-generation/vllm/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
index 71f8b7a40..99d10aa33 100644
--- a/comps/llms/text-generation/vllm/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
@@ -19,7 +19,7 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
     command: /bin/sh -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model $LLM_MODEL_ID --port 80"
   llm:
-    image: opea/gen-ai-comps:llm-vllm-server
+    image: opea/llm-vllm:latest
     container_name: llm-vllm-server
     ports:
       - "9000:9000"
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 7e32c8775..3e9dea219 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -1,25 +1,26 @@
 #!/bin/bash
-
-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 # Set default values
-default_port=8080
+default_port=8008
 default_hw_mode="cpu"
-default_model="Intel/neural-chat-7b-v3-3"
+default_model=${LLM_MODEL_ID}
+default_parallel_number=1
 
 # Assign arguments to variables
 port_number=${1:-$default_port}
 model_name=${2:-$default_model}
 hw_mode=${3:-$default_hw_mode}
+parallel_number=${4:-$default_parallel_number}
 
 # Check if all required arguments are provided
-if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then
-  echo "Usage: $0 [port_number] [model_name] [hw_mode]"
-  echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8080."
-  echo "model_name: The model name utilized for LLM, with the default set to 'Intel/neural-chat-7b-v3-3'."
+if [ "$#" -lt 0 ] || [ "$#" -gt 4 ]; then
+  echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number]"
+  echo "port_number: The port number assigned to the vLLM endpoint, with the default being 8008."
+  echo "model_name: The model name utilized for LLM, with the default read from the LLM_MODEL_ID environment variable."
   echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'"
+  echo "parallel_number: The number of HPUs used for tensor parallelism when hw_mode is 'hpu', with the default set to 1."
   exit 1
 fi
 
@@ -28,7 +29,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -it --runtime=habana --rm --name="ChatQnA_server" -p $port_number:$port_number -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number"
+    docker run -it --runtime=habana --rm --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80"
 else
-    docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --host 0.0.0.0 --port $port_number"
+    docker run -it --rm --name="vllm-service" -p $port_number:80 -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80"
 fi
diff --git a/comps/llms/text-generation/vllm/llm.py b/comps/llms/text-generation/vllm/llm.py
index 250df963f..fab24a2ee 100644
--- a/comps/llms/text-generation/vllm/llm.py
+++ b/comps/llms/text-generation/vllm/llm.py
@@ -31,12 +31,13 @@ def post_process_text(text: str):
 )
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
-    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")
+    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8008")
+    model_name = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
     llm = VLLMOpenAI(
         openai_api_key="EMPTY",
         openai_api_base=llm_endpoint + "/v1",
         max_tokens=input.max_new_tokens,
-        model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
+        model_name=model_name,
         top_p=input.top_p,
         temperature=input.temperature,
         presence_penalty=input.repetition_penalty,
diff --git a/comps/llms/text-generation/vllm/requirements.txt b/comps/llms/text-generation/vllm/requirements.txt
index 7d72d98b5..d5959e9ff 100644
--- a/comps/llms/text-generation/vllm/requirements.txt
+++ b/comps/llms/text-generation/vllm/requirements.txt
@@ -6,6 +6,7 @@ langserve
 opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
+setuptools==69.5.1
 shortuuid
 transformers
 vllm
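
With the endpoints moved to port 8008 and the wrapper microservices kept on port 9000, the whole path can be smoke-tested end to end. The snippet below is a minimal sketch, not part of the patch itself: it assumes `your_ip` and `LLM_MODEL_ID` are exported in the shell, that the serving endpoint exposes the OpenAI-compatible completions API (as the vLLM README above shows), and that the microservice accepts the `query`/`max_new_tokens` fields of the `LLMParamsDoc` schema consumed in `llm.py`.

```bash
# 1) Check the vLLM endpoint through its OpenAI-compatible completions API.
#    LLM_MODEL_ID is spliced into the JSON body via shell quoting.
curl http://${your_ip}:8008/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${LLM_MODEL_ID}"'",
    "prompt": "What is deep learning?",
    "max_tokens": 32,
    "temperature": 0
  }'

# 2) Check the llm microservice on port 9000
#    (request fields assumed from the LLMParamsDoc schema used by llm.py).
curl http://${your_ip}:9000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"query": "What is deep learning?", "max_new_tokens": 32}'
```

For the Ray Serve path, the same first check applies against `/v1/chat/completions` on port 8008, as shown in the ray_serve README hunk above.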