From 4638c1d45d1da1185529b1712108ffb38ca00093 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 1 Oct 2024 10:44:14 -0700 Subject: [PATCH] Enable vllm for Agent (#752) * Update Agent vllm client codes and test Signed-off-by: Chendi.Xue * Update README with vllm related update Signed-off-by: Chendi.Xue * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unnecessary env Signed-off-by: Chendi.Xue * support plan_execute with vllm Signed-off-by: Chendi.Xue * Update README and test llama vllm support Signed-off-by: Chendi.Xue * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update vllm_port to 8086 avoid conflict Signed-off-by: Chendi.Xue --------- Signed-off-by: Chendi.Xue Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- comps/agent/langchain/README.md | 62 +++++--- .../langchain/src/strategy/base_agent.py | 13 ++ .../src/strategy/planexec/planner.py | 63 ++++++-- .../langchain/src/strategy/react/planner.py | 7 +- comps/agent/langchain/src/utils.py | 24 ++- tests/agent/Dockerfile.hpu | 18 +++ tests/agent/planexec_vllm.yaml | 30 ++++ tests/agent/react_vllm.yaml | 30 ++++ .../test_agent_langchain_on_intel_hpu.sh | 150 +++++++++++++++++- 9 files changed, 342 insertions(+), 55 deletions(-) create mode 100644 tests/agent/Dockerfile.hpu create mode 100644 tests/agent/planexec_vllm.yaml create mode 100644 tests/agent/react_vllm.yaml diff --git a/comps/agent/langchain/README.md b/comps/agent/langchain/README.md index a063b7241..2e65f8589 100644 --- a/comps/agent/langchain/README.md +++ b/comps/agent/langchain/README.md @@ -20,14 +20,14 @@ Agents use LLM for reasoning and planning. We support 2 options of LLM engine: 1. Open-source LLMs served with TGI-gaudi. To use open-source llms, follow the instructions in [Section 2](#222-start-microservices) below. 
Note: we recommend using state-of-the-art LLMs, such as llama3.1-70B-instruct, to get higher success rate. 2. OpenAI LLMs via API calls. To use OpenAI llms, specify `llm_engine=openai` and `export OPENAI_API_KEY=` -| Agent type | `strategy` arg | Validated LLMs | Notes | -| ---------------- | ----------------- | ---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| ReAct | `react_langchain` | GPT-4o-mini, [llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Only allows tools with one input variable | -| ReAct | `react_langgraph` | GPT-4o-mini | Currently does not work for open-source LLMs served with TGI-Gaudi | -| ReAct | `react_llama` | [llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Recommended for open-source LLMs served with TGI-Gaudi | -| RAG agent | `rag_agent` | GPT-4o-mini | Currently does not work for open-source LLMs served with TGI-Gaudi | -| RAG agent | `rag_agent_llama` | [llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Recommended for open-source LLMs served with TGI-Gaudi, only allows 1 tool with input variable to be "query" | -| Plan and execute | `plan_execute` | GPT-4o-mini | | +| Agent type | `strategy` arg | Validated LLMs | Notes | +| ---------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ReAct | `react_langchain` | GPT-4o-mini, 
[llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Only allows tools with one input variable | +| ReAct | `react_langgraph` | GPT-4o-mini, [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)-on-vllm | Currently does not work for open-source LLMs served with TGI-Gaudi. [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)-on-vllm does not work yet because the required support has not been synced from upstream vllm to the gaudi repo. | +| ReAct | `react_llama` | [llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Recommended for open-source LLMs served with TGI-Gaudi | +| RAG agent | `rag_agent` | GPT-4o-mini | Currently does not work for open-source LLMs served with TGI-Gaudi | +| RAG agent | `rag_agent_llama` | [llama3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) | Recommended for open-source LLMs served with TGI-Gaudi, only allows 1 tool with input variable to be "query" | +| Plan and execute | `plan_execute` | GPT-4o-mini, [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)-on-vllm, [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)-on-vllm | | ### 1.3 Tools @@ -44,47 +44,57 @@ Currently we have implemented OpenAI chat completion compatible API for agents. ## 🚀2. Start Agent Microservice -### 2.1 Option 1: with Python - -#### 2.1.1 Install Requirements +#### 2.1 Build Microservices ```bash -cd comps/agent/langchain/ -pip install -r requirements.txt +cd GenAIComps/ # back to GenAIComps/ folder +docker build -t opea/agent-langchain:latest -f comps/agent/langchain/Dockerfile . 
``` -#### 2.1.2 Start Microservice with Python Script +#### 2.2.1 Start Agent microservices with TGI ```bash -cd comps/agent/langchain/ -python agent.py -``` +export ip_address=$(hostname -I | awk '{print $1}') +export model=mistralai/Mistral-7B-Instruct-v0.3 +export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} + +# TGI serving +docker run -d --runtime=habana --name "comps-tgi-gaudi-service" -p 8080:80 -v ./data:/data -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:latest --model-id $model --max-input-tokens 4096 --max-total-tokens 8092 -### 2.2 Option 2. Start Microservice with Docker +# check status +docker logs comps-tgi-gaudi-service -#### 2.2.1 Build Microservices +# Agent +docker run -d --runtime=runc --name="comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9090:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e ip_address=${ip_address} -e strategy=react_langchain -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/agent-langchain:latest -```bash -cd GenAIComps/ # back to GenAIComps/ folder -docker build -t opea/agent-langchain:latest -f comps/agent/langchain/Dockerfile . 
+# check status +docker logs comps-langchain-agent-endpoint ``` -#### 2.2.2 Start microservices +#### 2.2.2 Start Agent microservices with vllm ```bash export ip_address=$(hostname -I | awk '{print $1}') export model=mistralai/Mistral-7B-Instruct-v0.3 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export vllm_volume=${YOUR_LOCAL_DIR_FOR_MODELS} + +# build vLLM image +git clone https://github.com/HabanaAI/vllm-fork.git +cd ./vllm-fork; git checkout habana_main; git tag v0.6.2.post1; +cp ${your_path}/GenAIComps/tests/agent/Dockerfile.hpu ./ +docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -# TGI serving +# vLLM serving -docker run -d --runtime=habana --name "comps-tgi-gaudi-service" -p 8080:80 -v ./data:/data -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:latest --model-id $model --max-input-tokens 4096 --max-total-tokens 8092 +docker run -d --runtime=habana --rm --name "comps-vllm-gaudi-service" -p 8080:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm:hpu --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral # check status -docker logs comps-tgi-gaudi-service +docker logs comps-vllm-gaudi-service # Agent -docker run -d --runtime=runc --name="comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9090:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e ip_address=${ip_address} -e strategy=react_langchain 
-e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/agent-langchain:latest +docker run -d --runtime=runc --name="comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9090:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e ip_address=${ip_address} -e strategy=react_langgraph -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=vllm -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/agent-langchain:latest # check status docker logs comps-langchain-agent-endpoint diff --git a/comps/agent/langchain/src/strategy/base_agent.py b/comps/agent/langchain/src/strategy/base_agent.py index ca0e12a96..3faf8f543 100644 --- a/comps/agent/langchain/src/strategy/base_agent.py +++ b/comps/agent/langchain/src/strategy/base_agent.py @@ -14,8 +14,21 @@ def __init__(self, args) -> None: self.app = None self.memory = None self.id = f"assistant_{self.__class__.__name__}_{uuid4()}" + self.args = args print(self.tools_descriptions) + @property + def is_vllm(self): + return self.args.llm_engine == "vllm" + + @property + def is_tgi(self): + return self.args.llm_engine == "tgi" + + @property + def is_openai(self): + return self.args.llm_engine == "openai" + def compile(self): pass diff --git a/comps/agent/langchain/src/strategy/planexec/planner.py b/comps/agent/langchain/src/strategy/planexec/planner.py index 4d872e9e7..f7630ca6f 100644 --- a/comps/agent/langchain/src/strategy/planexec/planner.py +++ b/comps/agent/langchain/src/strategy/planexec/planner.py @@ -6,10 +6,8 @@ from typing import Annotated, Any, List, Literal, Sequence, Tuple, TypedDict, Union from langchain.agents import AgentExecutor, create_react_agent -from langchain.output_parsers import PydanticOutputParser from 
langchain_core.exceptions import OutputParserException from langchain_core.messages import BaseMessage -from langchain_core.output_parsers import JsonOutputParser from langchain_core.output_parsers.openai_tools import PydanticToolsParser from langchain_core.outputs import Generation from langchain_core.prompts import PromptTemplate @@ -21,7 +19,7 @@ from pydantic import BaseModel, Field from ...global_var import threads_global_kv -from ...utils import has_multi_tool_inputs, tool_renderer +from ...utils import has_multi_tool_inputs, tool_renderer, wrap_chat from ..base_agent import BaseAgent from .prompt import ( answer_check_prompt, @@ -63,11 +61,17 @@ class PlanStepChecker: str: A decision for whether we should use this plan or not """ - def __init__(self, llm_endpoint, model_id=None): + def __init__(self, llm_endpoint, model_id=None, is_vllm=False): class grade(BaseModel): binary_score: str = Field(description="executable score 'yes' or 'no'") - llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id).bind_tools([grade]) + if is_vllm: + llm = wrap_chat(llm_endpoint, model_id).bind_tools( + [grade], tool_choice={"function": {"name": grade.__name__}} + ) + else: + llm = wrap_chat(llm_endpoint, model_id).bind_tools([grade]) + output_parser = PydanticToolsParser(tools=[grade], first_tool_only=True) self.chain = plan_check_prompt | llm | output_parser @@ -84,9 +88,13 @@ def __call__(self, state): # Define workflow Node class Planner: - def __init__(self, llm_endpoint, model_id=None, plan_checker=None): - # self.llm = planner_prompt | llm_endpoint | PydanticOutputParser(pydantic_object=Plan) - llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id).bind_tools([Plan]) + def __init__(self, llm_endpoint, model_id=None, plan_checker=None, is_vllm=False): + if is_vllm: + llm = wrap_chat(llm_endpoint, model_id).bind_tools( + [Plan], tool_choice={"function": {"name": Plan.__name__}} + ) + else: + llm = wrap_chat(llm_endpoint, model_id).bind_tools([Plan]) output_parser = 
PydanticToolsParser(tools=[Plan], first_tool_only=True) self.llm = planner_prompt | llm | output_parser self.plan_checker = plan_checker @@ -152,8 +160,13 @@ def __call__(self, state): class AnswerMaker: - def __init__(self, llm_endpoint, model_id=None): - llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id).bind_tools([Response]) + def __init__(self, llm_endpoint, model_id=None, is_vllm=False): + if is_vllm: + llm = wrap_chat(llm_endpoint, model_id).bind_tools( + [Response], tool_choice={"function": {"name": Response.__name__}} + ) + else: + llm = wrap_chat(llm_endpoint, model_id).bind_tools([Response]) output_parser = PydanticToolsParser(tools=[Response], first_tool_only=True) self.llm = answer_make_prompt | llm | output_parser @@ -180,11 +193,16 @@ class FinalAnswerChecker: str: A decision for whether we should use this plan or not """ - def __init__(self, llm_endpoint, model_id=None): + def __init__(self, llm_endpoint, model_id=None, is_vllm=False): class grade(BaseModel): binary_score: str = Field(description="executable score 'yes' or 'no'") - llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id).bind_tools([grade]) + if is_vllm: + llm = wrap_chat(llm_endpoint, model_id).bind_tools( + [grade], tool_choice={"function": {"name": grade.__name__}} + ) + else: + llm = wrap_chat(llm_endpoint, model_id).bind_tools([grade]) output_parser = PydanticToolsParser(tools=[grade], first_tool_only=True) self.chain = answer_check_prompt | llm | output_parser @@ -201,7 +219,7 @@ def __call__(self, state): class Replanner: def __init__(self, llm_endpoint, model_id=None, answer_checker=None): - llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id).bind_tools([Plan]) + llm = wrap_chat(llm_endpoint, model_id).bind_tools([Plan]) output_parser = PydanticToolsParser(tools=[Plan], first_tool_only=True) self.llm = replanner_prompt | llm | output_parser self.answer_checker = answer_checker @@ -227,11 +245,11 @@ def __init__(self, args, with_memory=False): 
super().__init__(args) # Define Node - plan_checker = PlanStepChecker(self.llm_endpoint, args.model) + plan_checker = PlanStepChecker(self.llm_endpoint, args.model, is_vllm=self.is_vllm) - plan_step = Planner(self.llm_endpoint, args.model, plan_checker) + plan_step = Planner(self.llm_endpoint, args.model, plan_checker, is_vllm=self.is_vllm) execute_step = Executor(self.llm_endpoint, args.model, self.tools_descriptions) - make_answer = AnswerMaker(self.llm_endpoint, args.model) + make_answer = AnswerMaker(self.llm_endpoint, args.model, is_vllm=self.is_vllm) # Define Graph workflow = StateGraph(PlanExecute) @@ -274,3 +292,16 @@ async def stream_generator(self, query, config, thread_id=None): yield f"data: {repr(event)}\n\n" yield "data: [DONE]\n\n" + + async def non_streaming_run(self, query, config): + initial_state = self.prepare_initial_state(query) + try: + async for s in self.app.astream(initial_state, config=config, stream_mode="values"): + for k, v in s.items(): + print(f"{k}: {v}\n") + + last_message = s["output"] + print("******Response: ", last_message) + return last_message + except Exception as e: + return str(e) diff --git a/comps/agent/langchain/src/strategy/react/planner.py b/comps/agent/langchain/src/strategy/react/planner.py index ab8274774..f5bb4d053 100644 --- a/comps/agent/langchain/src/strategy/react/planner.py +++ b/comps/agent/langchain/src/strategy/react/planner.py @@ -12,7 +12,7 @@ from langgraph.prebuilt import create_react_agent from ...global_var import threads_global_kv -from ...utils import has_multi_tool_inputs, tool_renderer +from ...utils import has_multi_tool_inputs, tool_renderer, wrap_chat from ..base_agent import BaseAgent from .prompt import REACT_SYS_MESSAGE, hwchase17_react_prompt @@ -85,10 +85,7 @@ class ReActAgentwithLanggraph(BaseAgent): def __init__(self, args, with_memory=False): super().__init__(args) - if isinstance(self.llm_endpoint, HuggingFaceEndpoint): - self.llm = ChatHuggingFace(llm=self.llm_endpoint, 
model_id=args.model) - elif isinstance(self.llm_endpoint, ChatOpenAI): - self.llm = self.llm_endpoint + self.llm = wrap_chat(self.llm_endpoint, args.model) tools = self.tools_descriptions diff --git a/comps/agent/langchain/src/utils.py b/comps/agent/langchain/src/utils.py index f7c35638a..4445c3866 100644 --- a/comps/agent/langchain/src/utils.py +++ b/comps/agent/langchain/src/utils.py @@ -6,6 +6,16 @@ from .config import env_config +def wrap_chat(llm_endpoint, model_id): + from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint + + if isinstance(llm_endpoint, HuggingFaceEndpoint): + llm = ChatHuggingFace(llm=llm_endpoint, model_id=model_id) + else: + llm = llm_endpoint + return llm + + def format_date(date): # input m/dd/yyyy hr:min # output yyyy-mm-dd @@ -46,15 +56,15 @@ def setup_hf_tgi_client(args): def setup_vllm_client(args): - from langchain_community.llms.vllm import VLLMOpenAI + from langchain_openai import ChatOpenAI openai_endpoint = f"{args.llm_endpoint_url}/v1" - llm = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=openai_endpoint, - model_name=args.model, - streaming=args.streaming, - ) + params = { + "temperature": args.temperature, + "max_tokens": args.max_new_tokens, + "streaming": args.streaming, + } + llm = ChatOpenAI(openai_api_key="EMPTY", openai_api_base=openai_endpoint, model_name=args.model, **params) return llm diff --git a/tests/agent/Dockerfile.hpu b/tests/agent/Dockerfile.hpu new file mode 100644 index 000000000..e92208771 --- /dev/null +++ b/tests/agent/Dockerfile.hpu @@ -0,0 +1,18 @@ +FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s 
/workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/tests/agent/planexec_vllm.yaml b/tests/agent/planexec_vllm.yaml new file mode 100644 index 000000000..6ab9d16f4 --- /dev/null +++ b/tests/agent/planexec_vllm.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + agent: + image: ${agent_image} + container_name: test-comps-agent-endpoint + volumes: + - ${TOOLSET_PATH}:/home/user/tools/ + ports: + - "9095:9095" + ipc: host + environment: + ip_address: ${ip_address} + strategy: plan_execute + recursion_limit: ${recursion_limit} + llm_engine: vllm + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + llm_endpoint_url: ${LLM_ENDPOINT_URL} + model: ${LLM_MODEL_ID} + temperature: ${temperature} + max_new_tokens: ${max_new_tokens} + top_k: 10 + streaming: false + tools: /home/user/tools/custom_tools.yaml + require_human_feedback: false + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + port: 9095 diff --git a/tests/agent/react_vllm.yaml b/tests/agent/react_vllm.yaml new file mode 100644 index 000000000..63b292ae3 --- /dev/null +++ b/tests/agent/react_vllm.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + agent: + image: ${agent_image} + container_name: test-comps-agent-endpoint + volumes: + - ${TOOLSET_PATH}:/home/user/tools/ + ports: + - "9095:9095" + ipc: host + environment: + ip_address: ${ip_address} + strategy: react_langgraph + recursion_limit: ${recursion_limit} + llm_engine: vllm + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + llm_endpoint_url: ${LLM_ENDPOINT_URL} + model: ${LLM_MODEL_ID} + temperature: ${temperature} + max_new_tokens: ${max_new_tokens} + top_k: 10 + streaming: false + tools: /home/user/tools/custom_tools.yaml + require_human_feedback: false + no_proxy: 
${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + port: 9095 diff --git a/tests/agent/test_agent_langchain_on_intel_hpu.sh b/tests/agent/test_agent_langchain_on_intel_hpu.sh index a8b791474..a04b63ca6 100644 --- a/tests/agent/test_agent_langchain_on_intel_hpu.sh +++ b/tests/agent/test_agent_langchain_on_intel_hpu.sh @@ -9,6 +9,8 @@ LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') tgi_port=8085 tgi_volume=$WORKPATH/data +vllm_port=8086 +vllm_volume=$WORKPATH/data export agent_image="opea/agent-langchain:comps" export agent_container_name="test-comps-agent-endpoint" @@ -38,6 +40,25 @@ function build_docker_images() { fi } +function build_vllm_docker_images() { + echo "Building the vllm docker images" + cd $WORKPATH + echo $WORKPATH + if [ ! -d "./vllm" ]; then + git clone https://github.com/HabanaAI/vllm-fork.git + cd ./vllm-fork; git checkout habana_main; git tag v0.6.2.post1; cd .. + cp $WORKPATH/tests/agent/Dockerfile.hpu ./vllm-fork + fi + cd ./vllm-fork + docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + if [ $? 
-ne 0 ]; then + echo "opea/vllm:hpu failed" + exit 1 + else + echo "opea/vllm:hpu successful" + fi +} + function start_tgi_service() { echo "token is ${HF_TOKEN}" @@ -59,6 +80,58 @@ function start_tgi_service() { echo "Service started successfully" } +function start_vllm_service() { + # redis endpoint + echo "token is ${HF_TOKEN}" + + #single card + echo "start vllm gaudi service" + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm:hpu --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 + sleep 5s + echo "Waiting vllm gaudi ready" + n=0 + until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do + docker logs test-comps-vllm-gaudi-service &> ${LOG_PATH}/vllm-gaudi-service.log + n=$((n+1)) + if grep -q "Uvicorn running on" ${LOG_PATH}/vllm-gaudi-service.log; then + break + fi + if grep -q "No such container" ${LOG_PATH}/vllm-gaudi-service.log; then + echo "container test-comps-vllm-gaudi-service not found" + exit 1 + fi + sleep 5s + done + sleep 5s + echo "Service started successfully" +} + +function start_vllm_auto_tool_choice_service() { + # redis endpoint + echo "token is ${HF_TOKEN}" + + #single card + echo "start vllm gaudi service" + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm:hpu --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 
--enable-auto-tool-choice --tool-call-parser ${model_parser} + sleep 5s + echo "Waiting vllm gaudi ready" + n=0 + until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do + docker logs test-comps-vllm-gaudi-service &> ${LOG_PATH}/vllm-gaudi-service.log + n=$((n+1)) + if grep -q "Uvicorn running on" ${LOG_PATH}/vllm-gaudi-service.log; then + break + fi + if grep -q "No such container" ${LOG_PATH}/vllm-gaudi-service.log; then + echo "container test-comps-vllm-gaudi-service not found" + exit 1 + fi + sleep 5s + done + sleep 5s + echo "Service started successfully" +} + function start_react_langchain_agent_service() { echo "Starting react_langchain agent microservice" docker compose -f $WORKPATH/tests/agent/react_langchain.yaml up -d @@ -76,6 +149,22 @@ function start_react_langgraph_agent_service() { echo "Service started successfully" } +function start_react_langgraph_agent_service_vllm() { + echo "Starting react_langgraph agent microservice" + docker compose -f $WORKPATH/tests/agent/react_vllm.yaml up -d + sleep 5s + docker logs test-comps-agent-endpoint + echo "Service started successfully" +} + +function start_planexec_agent_service_vllm() { + echo "Starting planexec agent microservice" + docker compose -f $WORKPATH/tests/agent/planexec_vllm.yaml up -d + sleep 5s + docker logs test-comps-agent-endpoint + echo "Service started successfully" +} + function start_react_langgraph_agent_service_openai() { echo "Starting react_langgraph agent microservice" docker run -d --runtime=runc --name="test-comps-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9095:9095 --ipc=host -e model=gpt-4o-mini-2024-07-18 -e strategy=react_langgraph -e llm_engine=openai -e OPENAI_API_KEY=${OPENAI_API_KEY} -e recursion_limit=10 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/agent-langchain:comps @@ -150,6 +239,13 @@ function stop_tgi_docker() { echo "Docker containers stopped successfully" 
} +function stop_vllm_docker() { + cid=$(docker ps -aq --filter "name=test-comps-vllm-gaudi-service") + echo "Stopping the docker containers "${cid} + if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi + echo "Docker containers stopped successfully" +} + function stop_agent_docker() { cid=$(docker ps -aq --filter "name=test-comps-agent-endpoint") echo "Stopping the docker containers "${cid} @@ -159,14 +255,16 @@ function stop_agent_docker() { function stop_docker() { stop_tgi_docker + stop_vllm_docker stop_agent_docker } function main() { - # stop_agent_docker + stop_agent_docker stop_docker build_docker_images + # ==================== TGI tests ==================== start_tgi_service # test rag agent @@ -192,6 +290,56 @@ function main() { stop_agent_docker echo "=============================================" + stop_tgi_docker + + # ==================== VLLM tests ==================== + build_vllm_docker_images + + export model=mistralai/Mistral-7B-Instruct-v0.3 + export LLM_MODEL_ID=${model} + export model_parser=mistral + export LLM_ENDPOINT_URL="http://${ip_address}:${vllm_port}" + + # test react with vllm + start_vllm_auto_tool_choice_service + start_react_langgraph_agent_service_vllm + echo "===========Testing ReAct VLLM =============" + validate_microservice + stop_agent_docker + stop_vllm_docker + echo "=============================================" + + # test plan execute with vllm + start_vllm_service + start_planexec_agent_service_vllm + echo "===========Testing Plan Execute VLLM =============" + validate_microservice + stop_agent_docker + stop_vllm_docker + echo "=============================================" + + export model=meta-llama/Llama-3.1-8B-Instruct + export LLM_MODEL_ID=${model} + export model_parser=llama3_json + + # test react with vllm - llama3 support has not been synced to vllm-gaudi yet + # start_vllm_auto_tool_choice_service + # start_react_langgraph_agent_service_vllm + # echo "===========Testing ReAct VLLM =============" + # 
validate_microservice + # stop_agent_docker + # stop_vllm_docker + # echo "=============================================" + + # test plan execute with vllm + start_vllm_service + start_planexec_agent_service_vllm + echo "===========Testing Plan Execute VLLM =============" + validate_microservice + stop_agent_docker + stop_vllm_docker + echo "=============================================" + stop_docker echo y | docker system prune 2>&1 > /dev/null }