diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml index 12db04eb74..0e693e75ea 100644 --- a/.github/workflows/_comps-workflow.yml +++ b/.github/workflows/_comps-workflow.yml @@ -65,7 +65,6 @@ jobs: fi if [[ $(grep -c "vllm-gaudi:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/HabanaAI/vllm-fork.git vllm-fork - cd vllm-fork && git checkout 3c39626 && cd ../ fi - name: Get build list id: get-build-list diff --git a/.github/workflows/check-online-doc-build.yml b/.github/workflows/check-online-doc-build.yml index 4b3ebe066b..bf3e0867a2 100644 --- a/.github/workflows/check-online-doc-build.yml +++ b/.github/workflows/check-online-doc-build.yml @@ -13,7 +13,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml index 864d74bd80..059a32980b 100644 --- a/.github/workflows/docker/compose/llms-compose.yaml +++ b/.github/workflows/docker/compose/llms-compose.yaml @@ -11,9 +11,9 @@ services: build: dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest} - llm-docsum-tgi: + llm-docsum: build: - dockerfile: comps/llms/summarization/tgi/langchain/Dockerfile + dockerfile: comps/llms/src/doc-summarization/Dockerfile image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest} llm-faqgen: build: @@ -46,11 +46,3 @@ services: build: dockerfile: comps/llms/utils/lm-eval/Dockerfile image: ${REGISTRY:-opea}/llm-eval:${TAG:-latest} - llm-textgen-predictionguard: - build: - dockerfile: comps/llms/text-generation/predictionguard/Dockerfile - image: ${REGISTRY:-opea}/llm-textgen-predictionguard:${TAG:-latest} - llm-docsum-vllm: - build: - dockerfile: comps/llms/summarization/vllm/langchain/Dockerfile - image: ${REGISTRY:-opea}/llm-docsum-vllm:${TAG:-latest} diff --git a/comps/agent/deployment/kubernetes/README.md b/comps/agent/deployment/kubernetes/README.md index e69de29bb2..158ee40818 100644 --- a/comps/agent/deployment/kubernetes/README.md +++ b/comps/agent/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy Agent microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
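The agent chart below is normally deployed with the Gaudi values file added in this PR. As an optional pre-check (a sketch only; it assumes `kubectl` already points at the target cluster and that the Habana device plugin is installed), you can confirm that worker nodes actually advertise Gaudi devices:

```
kubectl describe nodes | grep -i "habana.ai/gaudi"   # Gaudi capacity/allocatable should be listed on Gaudi nodes
```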
+ +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install agent oci://ghcr.io/opea-project/charts/agent --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/agent/deployment/kubernetes/gaudi-values.yaml b/comps/agent/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..91ef5d1026 --- /dev/null +++ b/comps/agent/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + +tgi: + enabled: true + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.0.6" + resources: + limits: + habana.ai/gaudi: 4 + MAX_INPUT_LENGTH: "4096" + MAX_TOTAL_TOKENS: "8192" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + extraCmdArgs: ["--sharded","true","--num-shard","4"] + livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 diff --git a/comps/agent/src/agent.py b/comps/agent/src/agent.py index fc47c3132e..fb779f612c 100644 --- a/comps/agent/src/agent.py +++ b/comps/agent/src/agent.py @@ -5,7 +5,7 @@ import pathlib import sys from datetime import datetime -from typing import Union +from typing import List, Optional, Union from fastapi.responses import StreamingResponse @@ -40,7 +40,10 @@ agent_inst = instantiate_agent(args, args.strategy, with_memory=args.with_memory) -class AgentCompletionRequest(LLMParamsDoc): +class AgentCompletionRequest(ChatCompletionRequest): + # rewrite, specify tools in this turn of conversation + tool_choice: Optional[List[str]] = None + # for short/long term in-memory thread_id: str = "0" user_id: str = "0" @@ -52,42 +55,40 @@ class AgentCompletionRequest(LLMParamsDoc): host="0.0.0.0", port=args.port, ) -async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCompletionRequest]): +async def llm_generate(input: AgentCompletionRequest): if logflag: logger.info(input) - input.stream = args.stream - config = {"recursion_limit": args.recursion_limit} + # don't use global stream setting + # input.stream = args.stream + config = {"recursion_limit": args.recursion_limit, "tool_choice": input.tool_choice} if args.with_memory: - if isinstance(input, AgentCompletionRequest): - config["configurable"] = {"thread_id": input.thread_id} - else: - config["configurable"] = {"thread_id": "0"} + config["configurable"] = {"thread_id": input.thread_id} if logflag: logger.info(type(agent_inst)) - if isinstance(input, LLMParamsDoc): - # use query as input - input_query = input.query + # openai compatible input + if isinstance(input.messages, str): + messages = input.messages else: - # openai compatible input - if isinstance(input.messages, str): - input_query = input.messages - else: - input_query = input.messages[-1]["content"] + # TODO: need handle multi-turn messages + messages = input.messages[-1]["content"] # 2. 
prepare the input for the agent if input.stream: logger.info("-----------STREAMING-------------") - return StreamingResponse(agent_inst.stream_generator(input_query, config), media_type="text/event-stream") + return StreamingResponse( + agent_inst.stream_generator(messages, config), + media_type="text/event-stream", + ) else: logger.info("-----------NOT STREAMING-------------") - response = await agent_inst.non_streaming_run(input_query, config) + response = await agent_inst.non_streaming_run(messages, config) logger.info("-----------Response-------------") - return GeneratedDoc(text=response, prompt=input_query) + return GeneratedDoc(text=response, prompt=messages) @register_microservice( diff --git a/comps/agent/src/integrations/strategy/react/planner.py b/comps/agent/src/integrations/strategy/react/planner.py index 773cc199ce..03914037eb 100644 --- a/comps/agent/src/integrations/strategy/react/planner.py +++ b/comps/agent/src/integrations/strategy/react/planner.py @@ -11,7 +11,7 @@ from langgraph.prebuilt import create_react_agent from ...global_var import threads_global_kv -from ...utils import has_multi_tool_inputs, tool_renderer +from ...utils import filter_tools, has_multi_tool_inputs, tool_renderer from ..base_agent import BaseAgent from .prompt import REACT_SYS_MESSAGE, hwchase17_react_prompt @@ -136,7 +136,8 @@ async def non_streaming_run(self, query, config): # does not rely on langchain bind_tools API # since tgi and vllm still do not have very good support for tool calling like OpenAI -from typing import Annotated, Sequence, TypedDict +import json +from typing import Annotated, List, Optional, Sequence, TypedDict from langchain_core.messages import AIMessage, BaseMessage from langchain_core.prompts import PromptTemplate @@ -154,6 +155,7 @@ class AgentState(TypedDict): """The state of the agent.""" messages: Annotated[Sequence[BaseMessage], add_messages] + tool_choice: Optional[List[str]] = None is_last_step: IsLastStep @@ -191,7 +193,11 @@ def __call__(self, state): history = assemble_history(messages) print("@@@ History: ", history) - tools_descriptions = tool_renderer(self.tools) + tools_used = self.tools + if state["tool_choice"] is not None: + tools_used = filter_tools(self.tools, state["tool_choice"]) + + tools_descriptions = tool_renderer(tools_used) print("@@@ Tools description: ", tools_descriptions) # invoke chain @@ -279,21 +285,45 @@ def prepare_initial_state(self, query): async def stream_generator(self, query, config): initial_state = self.prepare_initial_state(query) + if "tool_choice" in config: + initial_state["tool_choice"] = config.pop("tool_choice") + try: - async for event in self.app.astream(initial_state, config=config): - for node_name, node_state in event.items(): - yield f"--- CALL {node_name} ---\n" - for k, v in node_state.items(): - if v is not None: - yield f"{k}: {v}\n" + async for event in self.app.astream(initial_state, config=config, stream_mode=["updates"]): + event_type = event[0] + data = event[1] + if event_type == "updates": + for node_name, node_state in data.items(): + print(f"--- CALL {node_name} node ---\n") + for k, v in node_state.items(): + if v is not None: + print(f"------- {k}, {v} -------\n\n") + if node_name == "agent": + if v[0].content == "": + tool_names = [] + for tool_call in v[0].tool_calls: + tool_names.append(tool_call["name"]) + result = {"tool": tool_names} + else: + result = {"content": [v[0].content.replace("\n\n", "\n")]} + # ui needs this format + yield f"data: {json.dumps(result)}\n\n" + elif node_name == 
"tools": + full_content = v[0].content + tool_name = v[0].name + result = {"tool": tool_name, "content": [full_content]} + yield f"data: {json.dumps(result)}\n\n" + if not full_content: + continue - yield f"data: {repr(event)}\n\n" yield "data: [DONE]\n\n" except Exception as e: yield str(e) async def non_streaming_run(self, query, config): initial_state = self.prepare_initial_state(query) + if "tool_choice" in config: + initial_state["tool_choice"] = config.pop("tool_choice") try: async for s in self.app.astream(initial_state, config=config, stream_mode="values"): message = s["messages"][-1] diff --git a/comps/agent/src/integrations/utils.py b/comps/agent/src/integrations/utils.py index 6440671421..ce41bf47ac 100644 --- a/comps/agent/src/integrations/utils.py +++ b/comps/agent/src/integrations/utils.py @@ -86,6 +86,14 @@ def tool_renderer(tools): return "\n".join(tool_strings) +def filter_tools(tools, tools_choices): + tool_used = [] + for tool in tools: + if tool.name in tools_choices: + tool_used.append(tool) + return tool_used + + def has_multi_tool_inputs(tools): ret = False for tool in tools: diff --git a/comps/agent/src/tools/custom_tools.py b/comps/agent/src/tools/custom_tools.py index d87a99374c..2b11d91047 100644 --- a/comps/agent/src/tools/custom_tools.py +++ b/comps/agent/src/tools/custom_tools.py @@ -4,9 +4,17 @@ # tool for unit test def search_web(query: str) -> str: - """Search the web for a given query.""" + """Search the web knowledge for a given query.""" ret_text = """ The Linux Foundation AI & Data announced the Open Platform for Enterprise AI (OPEA) as its latest Sandbox Project. OPEA aims to accelerate secure, cost-effective generative AI (GenAI) deployments for businesses by driving interoperability across a diverse and heterogeneous ecosystem, starting with retrieval-augmented generation (RAG). """ return ret_text + + +def search_weather(query: str) -> str: + """Search the weather for a given query.""" + ret_text = """ + It's clear. + """ + return ret_text diff --git a/comps/asr/deployment/kubernetes/README.md b/comps/asr/deployment/kubernetes/README.md index e69de29bb2..54f5676832 100644 --- a/comps/asr/deployment/kubernetes/README.md +++ b/comps/asr/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy ASR microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install asr oci://ghcr.io/opea-project/charts/asr --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/asr/deployment/kubernetes/cpu-values.yaml b/comps/asr/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..221ea994d5 --- /dev/null +++ b/comps/asr/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +whisper: + enabled: true diff --git a/comps/chathistory/deployment/kubernetes/README.md b/comps/chathistory/deployment/kubernetes/README.md index e69de29bb2..cb105bb7db 100644 --- a/comps/chathistory/deployment/kubernetes/README.md +++ b/comps/chathistory/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy chathistory microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install chathistory-usvc oci://ghcr.io/opea-project/charts/chathistory-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/chathistory/deployment/kubernetes/cpu-values.yaml b/comps/chathistory/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..7850c0ee9d --- /dev/null +++ b/comps/chathistory/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +mongodb: + enabled: true diff --git a/comps/dataprep/deployment/kubernetes/README.md b/comps/dataprep/deployment/kubernetes/README.md new file mode 100644 index 0000000000..fc9d9ab0bf --- /dev/null +++ b/comps/dataprep/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy dataprep microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
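Because the data-prep chart below can be backed by either Redis or Milvus, it may help to inspect the chart's default values before picking a values file. This is an optional step and assumes the same OCI chart reference used in the install commands below:

```
helm show values oci://ghcr.io/opea-project/charts/data-prep
```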
+ +## Deploy on Kubernetes with redis VectorDB + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install data-prep oci://ghcr.io/opea-project/charts/data-prep --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f redis-values.yaml +``` + +## Deploy on Kubernetes with milvus VectorDB + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install data-prep oci://ghcr.io/opea-project/charts/data-prep --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f milvus-values.yaml +``` diff --git a/comps/dataprep/deployment/kubernetes/milvus-values.yaml b/comps/dataprep/deployment/kubernetes/milvus-values.yaml new file mode 100644 index 0000000000..e2bc6c243f --- /dev/null +++ b/comps/dataprep/deployment/kubernetes/milvus-values.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +milvus: + enabled: true + cluster: + enabled: false + etcd: + replicaCount: 1 + pulsar: + enabled: false + minio: + mode: standalone +redis-vector-db: + enabled: false +tei: + enabled: true + +image: + repository: opea/dataprep-milvus + +port: 6010 +# text embedding inference service URL, e.g. http://<service-name>:<port> +#TEI_EMBEDDING_ENDPOINT: "http://embedding-tei:80" +# milvus DB configurations +#MILVUS_HOST: "milvustest" +MILVUS_PORT: "19530" +COLLECTION_NAME: "rag_milvus" +MOSEC_EMBEDDING_ENDPOINT: "" +MOSEC_EMBEDDING_MODEL: "" diff --git a/comps/dataprep/deployment/kubernetes/redis-values.yaml b/comps/dataprep/deployment/kubernetes/redis-values.yaml new file mode 100644 index 0000000000..54853db043 --- /dev/null +++ b/comps/dataprep/deployment/kubernetes/redis-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tei: + enabled: true +redis-vector-db: + enabled: true +milvus: + enabled: false diff --git a/comps/embeddings/deployment/kubernetes/README.md b/comps/embeddings/deployment/kubernetes/README.md index e69de29bb2..567987a983 100644 --- a/comps/embeddings/deployment/kubernetes/README.md +++ b/comps/embeddings/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy Embedding microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install embedding-usvc oci://ghcr.io/opea-project/charts/embedding-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/llms/summarization/vllm/langchain/__init__.py b/comps/embeddings/deployment/kubernetes/cpu-values.yaml similarity index 77% rename from comps/llms/summarization/vllm/langchain/__init__.py rename to comps/embeddings/deployment/kubernetes/cpu-values.yaml index 916f3a44b2..e2d62ff26f 100644 --- a/comps/llms/summarization/vllm/langchain/__init__.py +++ b/comps/embeddings/deployment/kubernetes/cpu-values.yaml @@ -1,2 +1,5 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + +tei: + enabled: true diff --git a/comps/guardrails/deployment/kubernetes/README.md b/comps/guardrails/deployment/kubernetes/README.md index e69de29bb2..b309900a07 100644 --- a/comps/guardrails/deployment/kubernetes/README.md +++ b/comps/guardrails/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy guardrails microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install guardrails oci://ghcr.io/opea-project/charts/guardrails --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/guardrails/deployment/kubernetes/cpu-values.yaml b/comps/guardrails/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..346a39496e --- /dev/null +++ b/comps/guardrails/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi-guardrails: + enabled: true diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml new file mode 100644 index 0000000000..9a14e5e5c9 --- /dev/null +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi.yaml @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: tgi-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + 
LLM_MODEL_ID: ${LLM_MODEL_ID} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml new file mode 100644 index 0000000000..c8562a2c56 --- /dev/null +++ b/comps/llms/deployment/docker_compose/doc-summarization_tgi_on_intel_hpu.yaml @@ -0,0 +1,63 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + container_name: tgi_gaudi_server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + runtime: habana + cap_add: + - SYS_NICE + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml new file mode 100644 index 0000000000..d14da4d527 --- /dev/null +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm.yaml @@ -0,0 +1,55 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + vllm-service: + image: opea/vllm:latest + container_name: vllm-server + ports: + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + - "./data:/data" + shm_size: 128g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 + llm: + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + vllm-service: + condition: service_healthy + ports: + - ${DOCSUM_PORT:-9000}:9000 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + 
https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml similarity index 50% rename from comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml rename to comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml index 26847387cc..1a00b0d052 100644 --- a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/deployment/docker_compose/doc-summarization_vllm_on_intel_hpu.yaml @@ -8,37 +8,52 @@ services: image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - - "8008:80" + - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: - "./data:/data" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_TOKEN: ${HF_TOKEN} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} + VLLM_TORCH_PROFILER_DIR: "/mnt" + host_ip: ${host_ip} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false} runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 + healthcheck: + test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_INPUT_TOKENS} llm: - image: opea/llm-docsum-vllm:latest - container_name: llm-docsum-vllm-server + image: opea/llm-docsum:latest + container_name: llm-docsum-server + depends_on: + vllm-service: + condition: service_healthy ports: - - "9000:9000" + - ${DOCSUM_PORT:-9000}:9000 ipc: host environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - vLLM_ENDPOINT: ${vLLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} - LLM_MODEL_ID: ${LLM_MODEL_ID} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME} + LOGFLAG: ${LOGFLAG:-False} restart: unless-stopped networks: diff --git a/comps/llms/deployment/kubernetes/README.md b/comps/llms/deployment/kubernetes/README.md index e69de29bb2..3c2ee474ba 100644 --- a/comps/llms/deployment/kubernetes/README.md +++ b/comps/llms/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy LLM microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
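If you prefer to review the rendered manifests before installing, you can template the chart locally first. This is an optional sketch; it assumes the OCI chart reference used below and that `cpu-values.yaml` is in the current directory:

```
helm template llm oci://ghcr.io/opea-project/charts/llm-uservice -f cpu-values.yaml
```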
+
+## Deploy on Kubernetes
+
+```
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install llm oci://ghcr.io/opea-project/charts/llm-uservice --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
+```
diff --git a/comps/llms/deployment/kubernetes/cpu-values.yaml b/comps/llms/deployment/kubernetes/cpu-values.yaml
new file mode 100644
index 0000000000..a879a49505
--- /dev/null
+++ b/comps/llms/deployment/kubernetes/cpu-values.yaml
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+  enabled: true
+resources:
+  requests:
+    cpu: 100m
+    memory: 128Mi
diff --git a/comps/llms/summarization/tgi/langchain/Dockerfile b/comps/llms/src/doc-summarization/Dockerfile
similarity index 80%
rename from comps/llms/summarization/tgi/langchain/Dockerfile
rename to comps/llms/src/doc-summarization/Dockerfile
index 3a73120547..a7c07df449 100644
--- a/comps/llms/summarization/tgi/langchain/Dockerfile
+++ b/comps/llms/src/doc-summarization/Dockerfile
@@ -19,10 +19,10 @@ COPY comps /home/user/comps
 RUN pip install --no-cache-dir --upgrade pip setuptools && \
     if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
-    pip install --no-cache-dir -r /home/user/comps/llms/summarization/tgi/langchain/requirements.txt
+    pip install --no-cache-dir -r /home/user/comps/llms/src/doc-summarization/requirements.txt
 ENV PYTHONPATH=$PYTHONPATH:/home/user
-WORKDIR /home/user/comps/llms/summarization/tgi/langchain
+WORKDIR /home/user/comps/llms/src/doc-summarization
 ENTRYPOINT ["bash", "entrypoint.sh"]
diff --git a/comps/llms/summarization/tgi/langchain/README.md b/comps/llms/src/doc-summarization/README.md
similarity index 67%
rename from comps/llms/summarization/tgi/langchain/README.md
rename to comps/llms/src/doc-summarization/README.md
index 5442fbf4ff..f32a0d7d1d 100644
--- a/comps/llms/summarization/tgi/langchain/README.md
+++ b/comps/llms/src/doc-summarization/README.md
@@ -1,65 +1,43 @@
-# Document Summary TGI Microservice
+# Document Summary LLM Microservice
-This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
-[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
+This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference on Intel Xeon and Gaudi2 processors. You can use either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm) as the LLM serving backend.
-## 🚀1. Start Microservice with Python 🐍 (Option 1)
+## 🚀1. Start Microservice with Docker 🐳
-To start the LLM microservice, you need to install python packages first.
+### 1.1 Setup Environment Variables
-### 1.1 Install Requirements
+To start the DocSum services, you need to set up the following environment variables first.
 ```bash
-pip install -r requirements.txt
-```
-
-### 1.2 Start LLM Service
-
-```bash
-export HF_TOKEN=${your_hf_api_token}
-docker run -p 8008:80 -v ./data:/data --name llm-docsum-tgi --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id ${your_hf_llm_model}
-```
-
-### 1.3 Verify the TGI Service
-
-```bash
-curl http://${your_ip}:8008/v1/chat/completions \
-  -X POST \
-  -d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-  -H 'Content-Type: application/json'
+export host_ip=${your_host_ip}
+export LLM_ENDPOINT_PORT=8008
+export DOCSUM_PORT=9000
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID=${your_hf_llm_model}
+export MAX_INPUT_TOKENS=2048
+export MAX_TOTAL_TOKENS=4096
+export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "OPEADocSum_vLLM"
 ```
-### 1.4 Start LLM Service with Python Script
+Please make sure MAX_TOTAL_TOKENS is larger than (MAX_INPUT_TOKENS + max_new_tokens + 50); 50 tokens are reserved for the prompt.
-```bash
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-python llm.py
-```
+### 1.2 Build Docker Image
-## 🚀2. Start Microservice with Docker 🐳 (Option 2)
+Step 1: Prepare the backend LLM docker image.
-If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.
+If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM docker image first.
-### 2.1 Setup Environment Variables
+No extra build step is needed for TGI.
-In order to start TGI and LLM services, you need to setup the following environment variables first.
+Step 2: Build the DocSum docker image.
 ```bash
-export HF_TOKEN=${your_hf_api_token}
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-export LLM_MODEL_ID=${your_hf_llm_model}
-export MAX_INPUT_TOKENS=2048
-export MAX_TOTAL_TOKENS=4096
+cd ../../../../
+docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
 ```
-Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_new_tokens + 50), 50 is reserved prompt length.
-
-### 2.2 Build Docker Image
-
-```bash
-cd ../../../../../
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
-```
+### 1.3 Run Docker
 To start a docker container, you have two options:
@@ -68,16 +46,45 @@ To start a docker container, you have two options:
 You can choose one as needed.
-### 2.3 Run Docker with CLI (Option A)
+### 1.3.1 Run Docker with CLI (Option A)
+
+Step 1: Start the backend LLM service
+Please refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guide to start a backend LLM service.
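Before starting the DocSum microservice, you can optionally confirm that the backend endpoint is reachable. The check below reuses the `/health` route that the compose files in this PR use for their container health checks, with the variables from section 1.1:

```bash
curl http://${host_ip}:${LLM_ENDPOINT_PORT}/health
```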
+ +Step 2: Start the DocSum microservices ```bash -docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest +docker run -d \ + --name="llm-docsum-server" \ + -p 9000:9000 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e LLM_MODEL_ID=$LLM_MODEL_ID \ + -e LLM_ENDPOINT=$LLM_ENDPOINT \ + -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ + -e DocSum_COMPONENT_NAME=$DocSum_COMPONENT_NAME \ + -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \ + -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \ + opea/llm-docsum:latest ``` -### 2.4 Run Docker with Docker Compose (Option B) +### 1.3.2 Run Docker with Docker Compose (Option B) ```bash -docker compose -f docker_compose_llm.yaml up -d +cd ../../deployment/docker_compose/ + +# Backend is TGI on xeon +docker compose -f doc-summarization_tgi.yaml up -d + +# Backend is TGI on gaudi +# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d + +# Backend is vLLM on xeon +# docker compose -f doc-summarization_vllm.yaml up -d + +# Backend is vLLM on gaudi +# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d ``` ## 🚀3. Consume LLM Service @@ -106,19 +113,19 @@ If you want to deal with long context, can select suitable summary type, details ```bash # Enable stream to receive a stream response. By default, this is set to True. -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \ -H 'Content-Type: application/json' # Disable stream to receive a non-stream response. -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \ -H 'Content-Type: application/json' # Use Chinese mode -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \ -H 'Content-Type: application/json' @@ -139,7 +146,7 @@ In this mode LLM generate summary based on complete input text. In this case ple Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \ -H 'Content-Type: application/json' @@ -152,7 +159,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \ -H 'Content-Type: application/json' @@ -165,7 +172,7 @@ Refin mode will split the inputs into multiple chunks, generate summary for the In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`. ```bash -curl http://${your_ip}:9000/v1/chat/docsum \ +curl http://${your_ip}:9000/v1/docsum \ -X POST \ -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \ -H 'Content-Type: application/json' diff --git a/comps/llms/summarization/vllm/langchain/entrypoint.sh b/comps/llms/src/doc-summarization/entrypoint.sh similarity index 81% rename from comps/llms/summarization/vllm/langchain/entrypoint.sh rename to comps/llms/src/doc-summarization/entrypoint.sh index d60eddd36b..64c8df3b4d 100644 --- a/comps/llms/summarization/vllm/langchain/entrypoint.sh +++ b/comps/llms/src/doc-summarization/entrypoint.sh @@ -5,4 +5,4 @@ pip --no-cache-dir install -r requirements-runtime.txt -python llm.py +python opea_docsum_microservice.py diff --git a/comps/llms/summarization/tgi/langchain/__init__.py b/comps/llms/src/doc-summarization/integrations/__init__.py similarity index 100% rename from comps/llms/summarization/tgi/langchain/__init__.py rename to comps/llms/src/doc-summarization/integrations/__init__.py diff --git a/comps/llms/src/doc-summarization/integrations/common.py b/comps/llms/src/doc-summarization/integrations/common.py new file mode 100644 index 0000000000..3fb0dde092 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/common.py @@ -0,0 +1,204 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests +from fastapi.responses import StreamingResponse +from langchain.chains.summarize import load_summarize_chain +from langchain.docstore.document import Document +from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter +from langchain_core.prompts import PromptTemplate +from transformers import AutoTokenizer + +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType +from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs + +from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh + +logger = CustomLogger("llm_docsum") +logflag = os.getenv("LOGFLAG", False) + +# Environment 
variables +MODEL_NAME = os.getenv("LLM_MODEL_ID") +MODEL_CONFIGS = os.getenv("MODEL_CONFIGS") +TOKEN_URL = os.getenv("TOKEN_URL") +CLIENTID = os.getenv("CLIENTID") +CLIENT_SECRET = os.getenv("CLIENT_SECRET") +MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048)) +MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096)) + +if os.getenv("LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT") +elif os.getenv("TGI_LLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT") +elif os.getenv("vLLM_ENDPOINT") is not None: + DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT") +else: + DEFAULT_ENDPOINT = "http://localhost:8080" + + +def get_llm_endpoint(): + if not MODEL_CONFIGS: + return DEFAULT_ENDPOINT + else: + # Validate and Load the models config if MODEL_CONFIGS is not null + configs_map = {} + try: + configs_map = load_model_configs(MODEL_CONFIGS) + except ConfigError as e: + logger.error(f"Failed to load model configurations: {e}") + raise ConfigError(f"Failed to load model configurations: {e}") + try: + return configs_map.get(MODEL_NAME).get("endpoint") + except ConfigError as e: + logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}") + raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs") + + +class OPEADocSum(OpeaComponent): + """A specialized OPEA DocSum component derived from OpeaComponent. + + Attributes: + client (TGI/vLLM): An instance of the TGI/vLLM client for text generation. + """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.LLM.name.lower(), description, config) + self.access_token = ( + get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None + ) + self.llm_endpoint = get_llm_endpoint() + self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + health_status = self.check_health() + if not health_status: + logger.error("OPEADocSum health check failed.") + + async def generate(self, input: DocSumLLMParams, client): + """Invokes the TGI/vLLM LLM service to generate summarization for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). + client: TGI/vLLM based client + """ + ### check summary type + summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] + if input.summary_type not in summary_types: + raise NotImplementedError(f"Please specify the summary_type in {summary_types}") + if input.summary_type == "auto": ### Check input token length in auto mode + token_len = len(self.tokenizer.encode(input.query)) + if token_len > MAX_INPUT_TOKENS + 50: + input.summary_type = "refine" + if logflag: + logger.info( + f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." + ) + else: + input.summary_type = "stuff" + if logflag: + logger.info( + f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." 
+ ) + + ### Check input language + if input.language in ["en", "auto"]: + templ = templ_en + templ_refine = templ_refine_en + elif input.language in ["zh"]: + templ = templ_zh + templ_refine = templ_refine_zh + else: + raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') + + ## Prompt + PROMPT = PromptTemplate.from_template(templ) + if input.summary_type == "refine": + PROMPT_REFINE = PromptTemplate.from_template(templ_refine) + if logflag: + logger.info("After prompting:") + logger.info(PROMPT) + if input.summary_type == "refine": + logger.info(PROMPT_REFINE) + + ## Split text + if input.summary_type == "stuff": + text_splitter = CharacterTextSplitter() + else: + if input.summary_type == "refine": + if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt length + raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") + max_input_tokens = min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS) + else: + if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt + raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") + max_input_tokens = min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS) + chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens + chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) + text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer=self.tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + if logflag: + logger.info(f"set chunk size to: {chunk_size}") + logger.info(f"set chunk overlap to: {chunk_overlap}") + + texts = text_splitter.split_text(input.query) + docs = [Document(page_content=t) for t in texts] + if logflag: + logger.info(f"Split input query into {len(docs)} chunks") + logger.info(f"The character length of the first chunk is {len(texts[0])}") + + ## LLM chain + summary_type = input.summary_type + if summary_type == "stuff": + llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) + elif summary_type == "truncate": + docs = [docs[0]] + llm_chain = load_summarize_chain(llm=client, prompt=PROMPT) + elif summary_type == "map_reduce": + llm_chain = load_summarize_chain( + llm=client, + map_prompt=PROMPT, + combine_prompt=PROMPT, + chain_type="map_reduce", + return_intermediate_steps=True, + ) + elif summary_type == "refine": + llm_chain = load_summarize_chain( + llm=client, + question_prompt=PROMPT, + refine_prompt=PROMPT_REFINE, + chain_type="refine", + return_intermediate_steps=True, + ) + else: + raise NotImplementedError(f"Please specify the summary_type in {summary_types}") + + if input.stream: + + async def stream_generator(): + from langserve.serialization import WellKnownLCSerializer + + _serializer = WellKnownLCSerializer() + async for chunk in llm_chain.astream_log(docs): + data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") + if logflag: + logger.info(data) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + response = await llm_chain.ainvoke(docs) + + if input.summary_type in ["map_reduce", "refine"]: + intermediate_steps = response["intermediate_steps"] + if logflag: + logger.info("intermediate_steps:") + logger.info(intermediate_steps) + + output_text = response["output_text"] + if logflag: + logger.info("\n\noutput_text:") + 
logger.info(output_text) + + return GeneratedDoc(text=output_text, prompt=input.query) diff --git a/comps/llms/src/doc-summarization/integrations/template.py b/comps/llms/src/doc-summarization/integrations/template.py new file mode 100644 index 0000000000..20ef59454c --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/template.py @@ -0,0 +1,58 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +templ_en = """Write a concise summary of the following: + + +"{text}" + + +CONCISE SUMMARY:""" + +templ_zh = """请简要概括以下内容: + + +"{text}" + + +概况:""" + + +templ_refine_en = """Your job is to produce a final summary. +We have provided an existing summary up to a certain point, then we will provide more context. +You need to refine the existing summary (only if needed) with new context and generate a final summary. + + +Existing Summary: +"{existing_answer}" + + + +New Context: +"{text}" + + + +Final Summary: + +""" + +templ_refine_zh = """\ +你的任务是生成一个最终摘要。 +我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 +你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 + + +初始摘要: +"{existing_answer}" + + + +新的文本: +"{text}" + + + +最终摘要: + +""" diff --git a/comps/llms/src/doc-summarization/integrations/tgi.py b/comps/llms/src/doc-summarization/integrations/tgi.py new file mode 100644 index 0000000000..a15c52e7d4 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/tgi.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests +from langchain_community.llms import HuggingFaceEndpoint + +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .common import * + +logger = CustomLogger("llm_docsum_tgi") +logflag = os.getenv("LOGFLAG", False) + + +@OpeaComponentRegistry.register("OPEADocSum_TGI") +class OPEADocSum_TGI(OPEADocSum): + """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API. + + Attributes: + client (TGI): An instance of the TGI client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the TGI LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + # response = requests.get(f"{self.llm_endpoint}/health") + + # Will remove after TGI gaudi fix health bug + url = f"{self.llm_endpoint}/generate" + data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}} + headers = {"Content-Type": "application/json"} + response = requests.post(url=url, json=data, headers=headers) + + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the TGI LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). 
+ """ + server_kwargs = {} + if self.access_token: + server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"} + + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False + self.client = HuggingFaceEndpoint( + endpoint_url=self.llm_endpoint, + max_new_tokens=input.max_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.stream, + server_kwargs=server_kwargs, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/doc-summarization/integrations/vllm.py b/comps/llms/src/doc-summarization/integrations/vllm.py new file mode 100644 index 0000000000..6651fbd203 --- /dev/null +++ b/comps/llms/src/doc-summarization/integrations/vllm.py @@ -0,0 +1,69 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests +from langchain_community.llms import VLLMOpenAI + +from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .common import * + +logger = CustomLogger("llm_docsum_vllm") +logflag = os.getenv("LOGFLAG", False) + + +@OpeaComponentRegistry.register("OPEADocSum_vLLM") +class OPEADocSum_vLLM(OPEADocSum): + """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API. + + Attributes: + client (vLLM): An instance of the vLLM client for text generation. + """ + + def check_health(self) -> bool: + """Checks the health of the vLLM LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + response = requests.get(f"{self.llm_endpoint}/health") + if response.status_code == 200: + return True + else: + return False + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: DocSumLLMParams): + """Invokes the vLLM LLM service to generate summarization output for the provided input. + + Args: + input (DocSumLLMParams): The input text(s). 
+ """ + headers = {} + if self.access_token: + headers = {"Authorization": f"Bearer {self.access_token}"} + + if input.stream and input.summary_type == "map_reduce": + logger.info("Map Reduce mode don't support stream=True, set to stream=False") + input.stream = False + self.client = VLLMOpenAI( + openai_api_key="EMPTY", + openai_api_base=self.llm_endpoint + "/v1", + model_name=MODEL_NAME, + default_headers=headers, + max_tokens=input.max_tokens, + top_p=input.top_p, + streaming=input.stream, + temperature=input.temperature, + presence_penalty=input.repetition_penalty, + ) + result = await self.generate(input, self.client) + + return result diff --git a/comps/llms/src/doc-summarization/opea_docsum_microservice.py b/comps/llms/src/doc-summarization/opea_docsum_microservice.py new file mode 100644 index 0000000000..18d9e409f4 --- /dev/null +++ b/comps/llms/src/doc-summarization/opea_docsum_microservice.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time + +from integrations.tgi import OPEADocSum_TGI +from integrations.vllm import OPEADocSum_vLLM + +from comps import ( + CustomLogger, + DocSumLLMParams, + OpeaComponentLoader, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("llm_docsum") +logflag = os.getenv("LOGFLAG", False) + +llm_component_name = os.getenv("DocSum_COMPONENT_NAME", "OPEADocSum_TGI") +# Initialize OpeaComponentLoader +loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}") + + +@register_microservice( + name="opea_service@llm_docsum", + service_type=ServiceType.LLM, + endpoint="/v1/docsum", + host="0.0.0.0", + port=9000, +) +@register_statistics(names=["opea_service@llm_docsum"]) +async def llm_generate(input: DocSumLLMParams): + start = time.time() + + # Log the input if logging is enabled + if logflag: + logger.info(input) + + try: + # Use the controller to invoke the active component + response = await loader.invoke(input) + # Record statistics + statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None) + return response + + except Exception as e: + logger.error(f"Error during DocSum invocation: {e}") + raise + + +if __name__ == "__main__": + logger.info("OPEA DocSum Microservice is starting...") + opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/tgi/langchain/requirements-runtime.txt b/comps/llms/src/doc-summarization/requirements-runtime.txt similarity index 100% rename from comps/llms/summarization/tgi/langchain/requirements-runtime.txt rename to comps/llms/src/doc-summarization/requirements-runtime.txt diff --git a/comps/llms/summarization/tgi/langchain/requirements.txt b/comps/llms/src/doc-summarization/requirements.txt similarity index 100% rename from comps/llms/summarization/tgi/langchain/requirements.txt rename to comps/llms/src/doc-summarization/requirements.txt diff --git a/comps/llms/text-generation/predictionguard/README.md b/comps/llms/src/text-generation/README_predictionguard.md similarity index 86% rename from comps/llms/text-generation/predictionguard/README.md rename to comps/llms/src/text-generation/README_predictionguard.md index 643434f2ee..32b46fe250 100644 --- a/comps/llms/text-generation/predictionguard/README.md +++ b/comps/llms/src/text-generation/README_predictionguard.md @@ -4,13 +4,6 @@ ## Get Started -### Build Docker Image - -```bash -cd ../../.. 
-docker build -t opea/llm-textgen-predictionguard:latest -f comps/llms/text-generation/predictionguard/Dockerfile . -``` - ### Run the Predictionguard Microservice ```bash @@ -28,7 +21,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "Hermes-2-Pro-Llama-3-8B", - "query": "Tell me a joke.", + "messages": "Tell me a joke.", "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, @@ -44,7 +37,7 @@ curl -N -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "Hermes-2-Pro-Llama-3-8B", - "query": "Tell me a joke.", + "messages": "Tell me a joke.", "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, diff --git a/comps/llms/src/text-generation/integrations/predictionguard.py b/comps/llms/src/text-generation/integrations/predictionguard.py new file mode 100644 index 0000000000..3dd8c9816f --- /dev/null +++ b/comps/llms/src/text-generation/integrations/predictionguard.py @@ -0,0 +1,101 @@ +# Copyright (C) 2024 Prediction Guard, Inc. +# SPDX-License-Identified: Apache-2.0 + +import os +import time + +from fastapi import HTTPException +from fastapi.responses import StreamingResponse +from predictionguard import PredictionGuard + +from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("opea_textgen_predictionguard") +logflag = os.getenv("LOGFLAG", False) + + +@OpeaComponentRegistry.register("OPEATextGen_Predictionguard") +class OPEATextGen_Predictionguard(OpeaComponent): + """A specialized OPEA TextGen component derived from OpeaComponent for interacting with Predictionguard services. + + Attributes: + client (Predictionguard): An instance of the Predictionguard client for text generation. + """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.LLM.name.lower(), description, config) + self.client = PredictionGuard() + health_status = self.check_health() + if not health_status: + logger.error("OPEATextGen_Predictionguard health check failed.") + else: + logger.info("OPEATextGen_Predictionguard health check success.") + + def check_health(self) -> bool: + """Checks the health of the Predictionguard LLM service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + + try: + response = self.client.models.list() + return response is not None + except Exception as e: + logger.error(e) + logger.error("Health check failed") + return False + + async def invoke(self, input: ChatCompletionRequest): + """Invokes the Predictionguard LLM service to generate output for the provided input. + + Args: + input (ChatCompletionRequest): The input text(s). + """ + if isinstance(input.messages, str): + messages = [ + { + "role": "system", + "content": "You are a helpful assistant. 
Your goal is to provide accurate, detailed, and safe responses to the user's queries.", + }, + {"role": "user", "content": input.messages}, + ] + else: + messages = input.messages + + if input.stream: + + async def stream_generator(): + chat_response = "" + for res in self.client.chat.completions.create( + model=input.model, + messages=messages, + max_tokens=input.max_tokens, + temperature=input.temperature, + top_p=input.top_p, + top_k=input.top_k, + stream=True, + ): + if "choices" in res["data"] and "delta" in res["data"]["choices"][0]: + delta_content = res["data"]["choices"][0]["delta"]["content"] + chat_response += delta_content + yield f"data: {delta_content}\n\n" + else: + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + try: + response = self.client.chat.completions.create( + model=input.model, + messages=messages, + max_tokens=input.max_tokens, + temperature=input.temperature, + top_p=input.top_p, + top_k=input.top_k, + ) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + return response diff --git a/comps/llms/src/text-generation/opea_llm_microservice.py b/comps/llms/src/text-generation/opea_llm_microservice.py index fb24911c41..ee52c60741 100644 --- a/comps/llms/src/text-generation/opea_llm_microservice.py +++ b/comps/llms/src/text-generation/opea_llm_microservice.py @@ -6,6 +6,7 @@ from typing import Union from integrations.opea import OPEALLM +from integrations.predictionguard import OPEATextGen_Predictionguard from comps import ( CustomLogger, @@ -25,6 +26,8 @@ llm_component_name = os.getenv("LLM_COMPONENT_NAME", "OPEA_LLM") +if logflag: + logger.info(f"Get llm_component_name {llm_component_name}") # Initialize OpeaComponentLoader loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM Component: {llm_component_name}") diff --git a/comps/llms/src/text-generation/requirements.txt b/comps/llms/src/text-generation/requirements.txt index 85b06a876d..5739dad9fa 100644 --- a/comps/llms/src/text-generation/requirements.txt +++ b/comps/llms/src/text-generation/requirements.txt @@ -8,6 +8,8 @@ openai==1.57.4 opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk +Pillow +predictionguard prometheus-fastapi-instrumentator shortuuid transformers diff --git a/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml b/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml deleted file mode 100644 index 93579a5712..0000000000 --- a/comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "3.8" - -services: - tgi_service: - image: ghcr.io/huggingface/text-generation-inference:2.1.0 - container_name: tgi-service - ports: - - "8008:80" - volumes: - - "./data:/data" - environment: - HF_TOKEN: ${HF_TOKEN} - shm_size: 1g - command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} - llm: - image: opea/llm-docsum-tgi:latest - container_name: llm-docsum-tgi-server - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} - MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS} - LLM_MODEL_ID: ${LLM_MODEL_ID} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git 
a/comps/llms/summarization/tgi/langchain/entrypoint.sh b/comps/llms/summarization/tgi/langchain/entrypoint.sh deleted file mode 100644 index d60eddd36b..0000000000 --- a/comps/llms/summarization/tgi/langchain/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -pip --no-cache-dir install -r requirements-runtime.txt - -python llm.py diff --git a/comps/llms/summarization/tgi/langchain/llm.py b/comps/llms/summarization/tgi/langchain/llm.py deleted file mode 100644 index addb090071..0000000000 --- a/comps/llms/summarization/tgi/langchain/llm.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain_community.llms import HuggingFaceEndpoint -from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter -from transformers import AutoTokenizer - -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_docsum") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") -MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048)) -MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096)) -LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3") - -templ_en = """Write a concise summary of the following: - - -"{text}" - - -CONCISE SUMMARY:""" - -templ_zh = """请简要概括以下内容: - - -"{text}" - - -概况:""" - - -templ_refine_en = """Your job is to produce a final summary. -We have provided an existing summary up to a certain point, then we will provide more context. -You need to refine the existing summary (only if needed) with new context and generate a final summary. - - -Existing Summary: -"{existing_answer}" - - - -New Context: -"{text}" - - - -Final Summary: - -""" - -templ_refine_zh = """\ -你的任务是生成一个最终摘要。 -我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 -你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 - - -初始摘要: -"{existing_answer}" - - - -新的文本: -"{text}" - - - -最终摘要: - -""" - - -@register_microservice( - name="opea_service@llm_docsum", - service_type=ServiceType.LLM, - endpoint="/v1/chat/docsum", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: DocSumLLMParams): - if logflag: - logger.info(input) - - ### check summary type - summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] - if input.summary_type not in summary_types: - raise NotImplementedError(f"Please specify the summary_type in {summary_types}") - if input.summary_type == "auto": ### Check input token length in auto mode - token_len = len(tokenizer.encode(input.query)) - if token_len > MAX_INPUT_TOKENS + 50: - input.summary_type = "refine" - if logflag: - logger.info( - f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." - ) - else: - input.summary_type = "stuff" - if logflag: - logger.info( - f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." 
- ) - - if input.language in ["en", "auto"]: - templ = templ_en - templ_refine = templ_refine_en - elif input.language in ["zh"]: - templ = templ_zh - templ_refine = templ_refine_zh - else: - raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') - - ## Prompt - PROMPT = PromptTemplate.from_template(templ) - if input.summary_type == "refine": - PROMPT_REFINE = PromptTemplate.from_template(templ_refine) - if logflag: - logger.info("After prompting:") - logger.info(PROMPT) - if input.summary_type == "refine": - logger.info(PROMPT_REFINE) - - ## Split text - if input.summary_type == "stuff": - text_splitter = CharacterTextSplitter() - else: - if input.summary_type == "refine": - if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: - raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS - ) # 128 is reserved token length for prompt - else: - if MAX_TOTAL_TOKENS <= input.max_tokens + 50: - raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS - ) # 50 is reserved token length for prompt - chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens - chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) - text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( - tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - if logflag: - logger.info(f"set chunk size to: {chunk_size}") - logger.info(f"set chunk overlap to: {chunk_overlap}") - - texts = text_splitter.split_text(input.query) - docs = [Document(page_content=t) for t in texts] - if logflag: - logger.info(f"Split input query into {len(docs)} chunks") - logger.info(f"The character length of the first chunk is {len(texts[0])}") - - ## Access auth - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - server_kwargs = {} - if access_token: - server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"} - - ## LLM - if input.stream and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support stream=True, set to stream=False") - input.stream = False - llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") - llm = HuggingFaceEndpoint( - endpoint_url=llm_endpoint, - max_new_tokens=input.max_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - repetition_penalty=input.repetition_penalty, - streaming=input.stream, - server_kwargs=server_kwargs, - ) - - ## LLM chain - summary_type = input.summary_type - if summary_type == "stuff": - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "truncate": - docs = [docs[0]] - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "map_reduce": - llm_chain = load_summarize_chain( - llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True - ) - elif summary_type == "refine": - llm_chain = load_summarize_chain( - llm=llm, - question_prompt=PROMPT, - refine_prompt=PROMPT_REFINE, - chain_type="refine", - return_intermediate_steps=True, - ) - else: - raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", 
"refine"') - - if input.stream: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - - if input.summary_type in ["map_reduce", "refine"]: - intermediate_steps = response["intermediate_steps"] - if logflag: - logger.info("intermediate_steps:") - logger.info(intermediate_steps) - - output_text = response["output_text"] - if logflag: - logger.info("\n\noutput_text:") - logger.info(output_text) - - return GeneratedDoc(text=output_text, prompt=input.query) - - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID) - opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/vllm/langchain/Dockerfile b/comps/llms/summarization/vllm/langchain/Dockerfile deleted file mode 100644 index 3a1cd5a8f7..0000000000 --- a/comps/llms/summarization/vllm/langchain/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -ARG ARCH="cpu" - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ - pip install --no-cache-dir -r /home/user/comps/llms/summarization/vllm/langchain/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/llms/summarization/vllm/langchain - -ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/summarization/vllm/langchain/README.md b/comps/llms/summarization/vllm/langchain/README.md deleted file mode 100644 index 061a526d68..0000000000 --- a/comps/llms/summarization/vllm/langchain/README.md +++ /dev/null @@ -1,171 +0,0 @@ -# Document Summary vLLM Microservice - -This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using vLLM. -[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). - -## 🚀1. Start Microservice with Python 🐍 (Option 1) - -To start the LLM microservice, you need to install python packages first. 
- -### 1.1 Install Requirements - -```bash -pip install -r requirements.txt -``` - -### 1.2 Start LLM Service - -```bash -export HF_TOKEN=${your_hf_api_token} -export LLM_MODEL_ID=${your_hf_llm_model} -docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} -``` - -### 1.3 Verify the vLLM Service - -```bash -curl http://${your_ip}:8008/v1/chat/completions \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning? "}]}' -``` - -### 1.4 Start LLM Service with Python Script - -```bash -export vLLM_ENDPOINT="http://${your_ip}:8008" -python llm.py -``` - -## 🚀2. Start Microservice with Docker 🐳 (Option 2) - -If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a vLLM/vLLM service with docker. - -To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi) - -### 2.1 Setup Environment Variables - -In order to start vLLM and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export vLLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL_ID=${your_hf_llm_model} -``` - -### 2.2 Build Docker Image - -```bash -cd ../../../../../ -docker build -t opea/llm-docsum-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/vllm/langchain/Dockerfile . -``` - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. - -### 2.3 Run Docker with CLI (Option A) - -```bash -docker run -d --name="llm-docsum-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-vllm:latest -``` - -### 2.4 Run Docker with Docker Compose (Option B) - -```bash -docker compose -f docker_compose_llm.yaml up -d -``` - -## 🚀3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Consume LLM Service - -In DocSum microservice, except for basic LLM parameters, we also support several optimization parameters setting. - -- "language": specify the language, can be "auto", "en", "zh", default is "auto" - -If you want to deal with long context, can select suitable summary type, details in section 3.2.2. - -- "summary_type": can be "auto", "stuff", "truncate", "map_reduce", "refine", default is "auto" -- "chunk_size": max token length for each chunk. Set to be different default value according to "summary_type". -- "chunk_overlap": overlap token length between each chunk, default is 0.1\*chunk_size - -#### 3.2.1 Basic usage - -```bash -# Enable stream to receive a stream response. By default, this is set to True. -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \ - -H 'Content-Type: application/json' - -# Disable stream to receive a non-stream response. -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \ - -H 'Content-Type: application/json' - -# Use Chinese mode -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \ - -H 'Content-Type: application/json' -``` - -#### 3.2.2 Long context summarization with "summary_type" - -**summary_type=auto** - -"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode. - -**summary_type=stuff** - -In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context. - -**summary_type=truncate** - -Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` - -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \ - -H 'Content-Type: application/json' -``` - -**summary_type=map_reduce** - -Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here. - -In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)` - -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \ - -H 'Content-Type: application/json' -``` - -**summary_type=refine** - -Refin mode will split the inputs into multiple chunks, generate summary for the first one, then combine with the second, loops over every remaining chunks to get the final summary. - -In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`. 
- -```bash -curl http://${your_ip}:9000/v1/chat/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \ - -H 'Content-Type: application/json' -``` diff --git a/comps/llms/summarization/vllm/langchain/llm.py b/comps/llms/summarization/vllm/langchain/llm.py deleted file mode 100644 index 5371e7e560..0000000000 --- a/comps/llms/summarization/vllm/langchain/llm.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -from pathlib import Path as p - -from fastapi.responses import StreamingResponse -from langchain.chains.summarize import load_summarize_chain -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain_community.llms import VLLMOpenAI -from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter -from transformers import AutoTokenizer - -from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice -from comps.cores.mega.utils import get_access_token - -logger = CustomLogger("llm_docsum") -logflag = os.getenv("LOGFLAG", False) - -# Environment variables -TOKEN_URL = os.getenv("TOKEN_URL") -CLIENTID = os.getenv("CLIENTID") -CLIENT_SECRET = os.getenv("CLIENT_SECRET") -MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS")) -MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS")) -LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", None) - -templ_en = """Write a concise summary of the following: - - -"{text}" - - -CONCISE SUMMARY:""" - -templ_zh = """请简要概括以下内容: - - -"{text}" - - -概况:""" - - -templ_refine_en = """Your job is to produce a final summary. -We have provided an existing summary up to a certain point, then we will provide more context. -You need to refine the existing summary (only if needed) with new context and generate a final summary. - - -Existing Summary: -"{existing_answer}" - - - -New Context: -"{text}" - - - -Final Summary: - -""" - -templ_refine_zh = """\ -你的任务是生成一个最终摘要。 -我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本 -你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。 - - -初始摘要: -"{existing_answer}" - - - -新的文本: -"{text}" - - - -最终摘要: - -""" - - -@register_microservice( - name="opea_service@llm_docsum", - service_type=ServiceType.LLM, - endpoint="/v1/chat/docsum", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: DocSumLLMParams): - if logflag: - logger.info(input) - - ### check summary type - summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"] - if input.summary_type not in summary_types: - raise NotImplementedError(f"Please specify the summary_type in {summary_types}") - if input.summary_type == "auto": ### Check input token length in auto mode - token_len = len(tokenizer.encode(input.query)) - if token_len > MAX_INPUT_TOKENS + 50: - input.summary_type = "refine" - if logflag: - logger.info( - f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode." - ) - else: - input.summary_type = "stuff" - if logflag: - logger.info( - f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode." 
- ) - - if input.language in ["en", "auto"]: - templ = templ_en - templ_refine = templ_refine_en - elif input.language in ["zh"]: - templ = templ_zh - templ_refine = templ_refine_zh - else: - raise NotImplementedError('Please specify the input language in "en", "zh", "auto"') - - ## Prompt - PROMPT = PromptTemplate.from_template(templ) - if input.summary_type == "refine": - PROMPT_REFINE = PromptTemplate.from_template(templ_refine) - if logflag: - logger.info("After prompting:") - logger.info(PROMPT) - if input.summary_type == "refine": - logger.info(PROMPT_REFINE) - - ## Split text - if input.summary_type == "stuff": - text_splitter = CharacterTextSplitter() - else: - if input.summary_type == "refine": - if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: - raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS - ) # 128 is reserved token length for prompt - else: - if MAX_TOTAL_TOKENS <= input.max_tokens + 50: - raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)") - max_input_tokens = min( - MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS - ) # 50 is reserved token length for prompt - chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens - chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size) - text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( - tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - if logflag: - logger.info(f"set chunk size to: {chunk_size}") - logger.info(f"set chunk overlap to: {chunk_overlap}") - - texts = text_splitter.split_text(input.query) - docs = [Document(page_content=t) for t in texts] - if logflag: - logger.info(f"Split input query into {len(docs)} chunks") - logger.info(f"The character length of the first chunk is {len(texts[0])}") - - ## Access auth - access_token = ( - get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None - ) - headers = {} - if access_token: - headers = {"Authorization": f"Bearer {access_token}"} - - ## LLM - if input.stream and input.summary_type == "map_reduce": - logger.info("Map Reduce mode don't support stream=True, set to stream=False") - input.stream = False - llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080") - model = input.model if input.model else os.getenv("LLM_MODEL_ID") - llm = VLLMOpenAI( - openai_api_key="EMPTY", - openai_api_base=llm_endpoint + "/v1", - model_name=model, - default_headers=headers, - max_tokens=input.max_tokens, - top_p=input.top_p, - streaming=input.stream, - temperature=input.temperature, - presence_penalty=input.repetition_penalty, - ) - - ## LLM chain - summary_type = input.summary_type - if summary_type == "stuff": - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "truncate": - docs = [docs[0]] - llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT) - elif summary_type == "map_reduce": - llm_chain = load_summarize_chain( - llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True - ) - elif summary_type == "refine": - llm_chain = load_summarize_chain( - llm=llm, - question_prompt=PROMPT, - refine_prompt=PROMPT_REFINE, - chain_type="refine", - return_intermediate_steps=True, - ) - else: - raise NotImplementedError('Please specify the summary_type in 
"stuff", "truncate", "map_reduce", "refine"') - - if input.stream: - - async def stream_generator(): - from langserve.serialization import WellKnownLCSerializer - - _serializer = WellKnownLCSerializer() - async for chunk in llm_chain.astream_log(docs): - data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8") - if logflag: - logger.info(data) - yield f"data: {data}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - response = await llm_chain.ainvoke(docs) - - if input.summary_type in ["map_reduce", "refine"]: - intermediate_steps = response["intermediate_steps"] - if logflag: - logger.info("intermediate_steps:") - logger.info(intermediate_steps) - - output_text = response["output_text"] - if logflag: - logger.info("\n\noutput_text:") - logger.info(output_text) - - return GeneratedDoc(text=output_text, prompt=input.query) - - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID) - opea_microservices["opea_service@llm_docsum"].start() diff --git a/comps/llms/summarization/vllm/langchain/requirements-runtime.txt b/comps/llms/summarization/vllm/langchain/requirements-runtime.txt deleted file mode 100644 index 225adde271..0000000000 --- a/comps/llms/summarization/vllm/langchain/requirements-runtime.txt +++ /dev/null @@ -1 +0,0 @@ -langserve diff --git a/comps/llms/summarization/vllm/langchain/requirements.txt b/comps/llms/summarization/vllm/langchain/requirements.txt deleted file mode 100644 index 1694618637..0000000000 --- a/comps/llms/summarization/vllm/langchain/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -docarray[full] -fastapi -httpx==0.27.2 -huggingface_hub -langchain #==0.1.12 -langchain-huggingface -langchain-openai -langchain_community -langchainhub -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn diff --git a/comps/llms/text-generation/predictionguard/Dockerfile b/comps/llms/text-generation/predictionguard/Dockerfile deleted file mode 100644 index 1c4077ac91..0000000000 --- a/comps/llms/text-generation/predictionguard/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2024 Prediction Guard, Inc. -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -COPY comps /home/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - pip install --no-cache-dir -r /home/comps/llms/text-generation/predictionguard/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home - -WORKDIR /home/comps/llms/text-generation/predictionguard - -ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/text-generation/predictionguard/__init__.py b/comps/llms/text-generation/predictionguard/__init__.py deleted file mode 100644 index a246c95e79..0000000000 --- a/comps/llms/text-generation/predictionguard/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Prediction Guard, Inc. 
-# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/text-generation/predictionguard/docker_compose_llm.yaml b/comps/llms/text-generation/predictionguard/docker_compose_llm.yaml deleted file mode 100644 index bde9fa10a9..0000000000 --- a/comps/llms/text-generation/predictionguard/docker_compose_llm.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2024 Prediction Guard, Inc -# SPDX-License-Identifier: Apache-2.0 - -services: - llm: - image: opea/llm-textgen-predictionguard:latest - container_name: llm-textgen-predictionguard - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - PREDICTIONGUARD_API_KEY: ${PREDICTIONGUARD_API_KEY} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/llms/text-generation/predictionguard/entrypoint.sh b/comps/llms/text-generation/predictionguard/entrypoint.sh deleted file mode 100644 index 8220ff6399..0000000000 --- a/comps/llms/text-generation/predictionguard/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Prediction Guard, Inc. -# SPDX-License-Identifier: Apache-2.0 - -#pip --no-cache-dir install -r requirements-runtime.txt - -python llm_predictionguard.py diff --git a/comps/llms/text-generation/predictionguard/llm_predictionguard.py b/comps/llms/text-generation/predictionguard/llm_predictionguard.py deleted file mode 100644 index 475b3f69be..0000000000 --- a/comps/llms/text-generation/predictionguard/llm_predictionguard.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (C) 2024 Prediction Guard, Inc. -# SPDX-License-Identified: Apache-2.0 - - -import time - -from fastapi import FastAPI, HTTPException -from fastapi.responses import StreamingResponse -from predictionguard import PredictionGuard - -from comps import ( - GeneratedDoc, - LLMParamsDoc, - ServiceType, - opea_microservices, - register_microservice, - register_statistics, - statistics_dict, -) - -client = PredictionGuard() -app = FastAPI() - - -@register_microservice( - name="opea_service@llm_predictionguard", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=9000, -) -@register_statistics(names=["opea_service@llm_predictionguard"]) -def llm_generate(input: LLMParamsDoc): - start = time.time() - - messages = [ - { - "role": "system", - "content": "You are a helpful assistant. 
Your goal is to provide accurate, detailed, and safe responses to the user's queries.", - }, - {"role": "user", "content": input.query}, - ] - - if input.stream: - - async def stream_generator(): - chat_response = "" - for res in client.chat.completions.create( - model=input.model, - messages=messages, - max_tokens=input.max_tokens, - temperature=input.temperature, - top_p=input.top_p, - top_k=input.top_k, - stream=True, - ): - if "choices" in res["data"] and "delta" in res["data"]["choices"][0]: - delta_content = res["data"]["choices"][0]["delta"]["content"] - chat_response += delta_content - yield f"data: {delta_content}\n\n" - else: - yield "data: [DONE]\n\n" - - statistics_dict["opea_service@llm_predictionguard"].append_latency(time.time() - start, None) - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - try: - response = client.chat.completions.create( - model=input.model, - messages=messages, - max_tokens=input.max_tokens, - temperature=input.temperature, - top_p=input.top_p, - top_k=input.top_k, - ) - response_text = response["choices"][0]["message"]["content"] - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - statistics_dict["opea_service@llm_predictionguard"].append_latency(time.time() - start, None) - return GeneratedDoc(text=response_text, prompt=input.query) - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_predictionguard"].start() diff --git a/comps/llms/text-generation/predictionguard/requirements.txt b/comps/llms/text-generation/predictionguard/requirements.txt deleted file mode 100644 index 6c9f8340fd..0000000000 --- a/comps/llms/text-generation/predictionguard/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -aiohttp -docarray -fastapi -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -Pillow -predictionguard -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn diff --git a/comps/lvms/deployment/kubernetes/README.md b/comps/lvms/deployment/kubernetes/README.md new file mode 100644 index 0000000000..f8c26af8d5 --- /dev/null +++ b/comps/lvms/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy LVM microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install lvm oci://ghcr.io/opea-project/charts/lvm-uservice --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/lvms/deployment/kubernetes/cpu-values.yaml b/comps/lvms/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..3de5b26fce --- /dev/null +++ b/comps/lvms/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: true diff --git a/comps/prompt_registry/deployment/kubernetes/README.md b/comps/prompt_registry/deployment/kubernetes/README.md index e69de29bb2..387197ea76 100644 --- a/comps/prompt_registry/deployment/kubernetes/README.md +++ b/comps/prompt_registry/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy prompt microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. 
Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install prompt-usvc oci://ghcr.io/opea-project/charts/prompt-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/prompt_registry/deployment/kubernetes/cpu-values.yaml b/comps/prompt_registry/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..7850c0ee9d --- /dev/null +++ b/comps/prompt_registry/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +mongodb: + enabled: true diff --git a/comps/rerankings/deployment/kubernetes/README.md b/comps/rerankings/deployment/kubernetes/README.md index e69de29bb2..23bf0ef425 100644 --- a/comps/rerankings/deployment/kubernetes/README.md +++ b/comps/rerankings/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy reranking microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install reranking-usvc oci://ghcr.io/opea-project/charts/reranking-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/rerankings/deployment/kubernetes/cpu-values.yaml b/comps/rerankings/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..f16bb56416 --- /dev/null +++ b/comps/rerankings/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +teirerank: + enabled: true diff --git a/comps/retrievers/deployment/kubernetes/README.md b/comps/retrievers/deployment/kubernetes/README.md new file mode 100644 index 0000000000..141d49f05a --- /dev/null +++ b/comps/retrievers/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy retriever microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
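
After installing the chart with one of the commands in the sections below, you can optionally sanity-check the retriever endpoint from your workstation. This is a minimal sketch, not part of the chart itself: the release name `retriever-usvc`, the service port `7000`, the `/v1/retrieval` route, and the 768-dimensional dummy embedding (matching `BAAI/bge-base-en-v1.5`) are assumptions that may need adjusting for your deployment.

```
# Forward the retriever service locally (assumes release name "retriever-usvc" exposing port 7000)
kubectl port-forward svc/retriever-usvc 7000:7000 &
# Build a dummy query; the embedding length must match the TEI embedding model in use (768 here)
embedding=$(python3 -c "import json; print(json.dumps([0.1]*768))")
curl http://localhost:7000/v1/retrieval \
  -X POST \
  -H 'Content-Type: application/json' \
  -d "{\"text\":\"What is OPEA?\",\"embedding\":${embedding}}"
```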
+ +## Deploy on Kubernetes with redis vector DB + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install retriever-usvc oci://ghcr.io/opea-project/charts/retriever-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f redis-values.yaml +``` + +## Deploy on Kubernetes with milvus vector DB + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install retriever-usvc oci://ghcr.io/opea-project/charts/retriever-usvc --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f milvus-values.yaml +``` diff --git a/comps/retrievers/deployment/kubernetes/milvus-values.yaml b/comps/retrievers/deployment/kubernetes/milvus-values.yaml new file mode 100644 index 0000000000..c186b4be2c --- /dev/null +++ b/comps/retrievers/deployment/kubernetes/milvus-values.yaml @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for retriever-usvc. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +milvus: + enabled: true + cluster: + enabled: false + etcd: + replicaCount: 1 + pulsar: + enabled: false + minio: + mode: standalone +redis-vector-db: + enabled: false +tei: + enabled: true + +image: + repository: opea/retriever-milvus +port: 7000 +# text embedding inference service URL, e.g. http://<service-name>:<port> +#TEI_EMBEDDING_ENDPOINT: "http://dataprep-tei:80" +# milvus DB configurations +#MILVUS_HOST: "dataprep-milvus" +MILVUS_PORT: "19530" +COLLECTION_NAME: "rag_milvus" +MOSEC_EMBEDDING_ENDPOINT: "" +MOSEC_EMBEDDING_MODEL: "" diff --git a/comps/retrievers/deployment/kubernetes/redis-values.yaml b/comps/retrievers/deployment/kubernetes/redis-values.yaml new file mode 100644 index 0000000000..cbc29c7eeb --- /dev/null +++ b/comps/retrievers/deployment/kubernetes/redis-values.yaml @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for retriever-usvc. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +tei: + enabled: true +redis-vector-db: + enabled: true +milvus: + enabled: false diff --git a/comps/third_parties/gpt-sovits/deployment/kubernetes/README.md b/comps/third_parties/gpt-sovits/deployment/kubernetes/README.md new file mode 100644 index 0000000000..3a9f77f86e --- /dev/null +++ b/comps/third_parties/gpt-sovits/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy gpt-sovits on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install gpt-sovits oci://ghcr.io/opea-project/charts/gpt-sovits --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/third_parties/gpt-sovits/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/gpt-sovits/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..087e8b3346 --- /dev/null +++ b/comps/third_parties/gpt-sovits/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/gpt-sovits diff --git a/comps/third_parties/mongodb/deployment/kubernetes/README.md b/comps/third_parties/mongodb/deployment/kubernetes/README.md new file mode 100644 index 0000000000..a9c5db7d1e --- /dev/null +++ b/comps/third_parties/mongodb/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy MongoDB on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install mongodb oci://ghcr.io/opea-project/charts/mongodb --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/third_parties/mongodb/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/mongodb/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..4d81053189 --- /dev/null +++ b/comps/third_parties/mongodb/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +image: + repository: mongo diff --git a/comps/third_parties/nginx/deployment/kubernetes/README.md b/comps/third_parties/nginx/deployment/kubernetes/README.md index e69de29bb2..a96d744db8 100644 --- a/comps/third_parties/nginx/deployment/kubernetes/README.md +++ b/comps/third_parties/nginx/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy nginx on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install nginx oci://ghcr.io/opea-project/charts/nginx --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/third_parties/nginx/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/nginx/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..98e8182d2c --- /dev/null +++ b/comps/third_parties/nginx/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/nginx diff --git a/comps/third_parties/redis/deployment/kubernetes/README.md b/comps/third_parties/redis/deployment/kubernetes/README.md new file mode 100644 index 0000000000..ab8cdc06c4 --- /dev/null +++ b/comps/third_parties/redis/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy RedisDB on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. 
Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install redis-vector-db oci://ghcr.io/opea-project/charts/redis-vector-db --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/third_parties/redis/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/redis/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..415b0aee8b --- /dev/null +++ b/comps/third_parties/redis/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: redis/redis-stack diff --git a/comps/third_parties/speecht5/deployment/kubernetes/README.md b/comps/third_parties/speecht5/deployment/kubernetes/README.md new file mode 100644 index 0000000000..e0f18a3f7d --- /dev/null +++ b/comps/third_parties/speecht5/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy speecht5 on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install speecht5 oci://ghcr.io/opea-project/charts/speecht5 --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install speecht5 oci://ghcr.io/opea-project/charts/speecht5 --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/speecht5/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/speecht5/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..56e0cd0cdc --- /dev/null +++ b/comps/third_parties/speecht5/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/speecht5 diff --git a/comps/third_parties/speecht5/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/speecht5/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..c7e5295bd9 --- /dev/null +++ b/comps/third_parties/speecht5/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/speecht5-gaudi +resources: + limits: + habana.ai/gaudi: 1 diff --git a/comps/third_parties/tei/deployment/kubernetes/README.md b/comps/third_parties/tei/deployment/kubernetes/README.md new file mode 100644 index 0000000000..1650330214 --- /dev/null +++ b/comps/third_parties/tei/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy TEI on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
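
After installing with either values file in the sections below, you can optionally verify that TEI is serving embeddings. A minimal sketch, assuming the release name `tei`, an in-cluster service port of `80`, and a free local port `2081`; adjust these for your setup.

```
# Forward the TEI service locally (assumes release name "tei" exposing port 80)
kubectl port-forward svc/tei 2081:80 &
# Request an embedding via the standard TEI /embed route
curl http://localhost:2081/embed \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs":"What is Deep Learning?"}'
```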
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install tei oci://ghcr.io/opea-project/charts/tei --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install tei oci://ghcr.io/opea-project/charts/tei --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/tei/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/tei/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..5eaa0d2744 --- /dev/null +++ b/comps/third_parties/tei/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: ghcr.io/huggingface/text-embeddings-inference diff --git a/comps/third_parties/tei/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/tei/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..aa8c36da48 --- /dev/null +++ b/comps/third_parties/tei/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +accelDevice: "gaudi" + +OMPI_MCA_btl_vader_single_copy_mechanism: "none" +MAX_WARMUP_SEQUENCE_LENGTH: "512" +image: + repository: ghcr.io/huggingface/tei-gaudi + tag: 1.5.0 + +securityContext: + readOnlyRootFilesystem: false + +resources: + limits: + habana.ai/gaudi: 1 + +livenessProbe: + timeoutSeconds: 1 +readinessProbe: + timeoutSeconds: 1 diff --git a/comps/third_parties/teirerank/deployment/kubernetes/README.md b/comps/third_parties/teirerank/deployment/kubernetes/README.md new file mode 100644 index 0000000000..b67de89cb0 --- /dev/null +++ b/comps/third_parties/teirerank/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy TEIRERANK on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
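
After installing with either values file in the sections below, you can optionally confirm that the reranker responds. A minimal sketch, assuming the release name `teirerank`, an in-cluster service port of `80`, and a free local port `2082`; adjust as needed.

```
# Forward the reranking service locally (assumes release name "teirerank" exposing port 80)
kubectl port-forward svc/teirerank 2082:80 &
# Score two candidate passages against a query via the TEI /rerank route
curl http://localhost:2082/rerank \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"query":"What is Deep Learning?","texts":["Deep learning is a subset of machine learning.","Paris is the capital of France."]}'
```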
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install teirerank oci://ghcr.io/opea-project/charts/teirerank --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install teirerank oci://ghcr.io/opea-project/charts/teirerank --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/teirerank/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/teirerank/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..5eaa0d2744 --- /dev/null +++ b/comps/third_parties/teirerank/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: ghcr.io/huggingface/text-embeddings-inference diff --git a/comps/third_parties/teirerank/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/teirerank/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..aa8c36da48 --- /dev/null +++ b/comps/third_parties/teirerank/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +accelDevice: "gaudi" + +OMPI_MCA_btl_vader_single_copy_mechanism: "none" +MAX_WARMUP_SEQUENCE_LENGTH: "512" +image: + repository: ghcr.io/huggingface/tei-gaudi + tag: 1.5.0 + +securityContext: + readOnlyRootFilesystem: false + +resources: + limits: + habana.ai/gaudi: 1 + +livenessProbe: + timeoutSeconds: 1 +readinessProbe: + timeoutSeconds: 1 diff --git a/comps/third_parties/tgi/deployment/kubernetes/README.md b/comps/third_parties/tgi/deployment/kubernetes/README.md index e69de29bb2..ff37f88ecf 100644 --- a/comps/third_parties/tgi/deployment/kubernetes/README.md +++ b/comps/third_parties/tgi/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy TGI on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
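
Once the chart is up with either values file in the sections below, a small generation request confirms the server is healthy. A minimal sketch, assuming the release name `tgi`, an in-cluster service port of `80`, and a free local port `2080`.

```
# Forward the TGI service locally (assumes release name "tgi" exposing port 80)
kubectl port-forward svc/tgi 2080:80 &
# Issue a short generation request against the TGI /generate route
curl http://localhost:2080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}'
```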
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install tgi oci://ghcr.io/opea-project/charts/tgi --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install tgi oci://ghcr.io/opea-project/charts/tgi --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/tgi/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/tgi/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..38297ab3d3 --- /dev/null +++ b/comps/third_parties/tgi/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Resource requirements for Intel/neural-chat-7b-v3-3 @ 32-bit: +resources: + limits: + cpu: 8 + memory: 70Gi + requests: + cpu: 6 + memory: 65Gi + +livenessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + failureThreshold: 24 + timeoutSeconds: 4 +readinessProbe: + initialDelaySeconds: 16 + periodSeconds: 8 + timeoutSeconds: 4 +startupProbe: + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 180 + timeoutSeconds: 2 diff --git a/comps/third_parties/tgi/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/tgi/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..8e04769aec --- /dev/null +++ b/comps/third_parties/tgi/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +accelDevice: "gaudi" + +image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.0.6" + +MAX_INPUT_LENGTH: "1024" +MAX_TOTAL_TOKENS: "2048" +CUDA_GRAPHS: "" +OMPI_MCA_btl_vader_single_copy_mechanism: "none" +ENABLE_HPU_GRAPH: "true" +LIMIT_HPU_GRAPH: "true" +USE_FLASH_ATTENTION: "true" +FLASH_ATTENTION_RECOMPUTE: "true" + +resources: + limits: + habana.ai/gaudi: 1 + requests: + cpu: 1 + memory: 16Gi + +livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 +readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 +startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 diff --git a/comps/third_parties/vllm/deployment/kubernetes/README.md b/comps/third_parties/vllm/deployment/kubernetes/README.md index e69de29bb2..18b17d9096 100644 --- a/comps/third_parties/vllm/deployment/kubernetes/README.md +++ b/comps/third_parties/vllm/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy vllm on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
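
After deploying with either values file in the sections below, the OpenAI-compatible API can be exercised directly. A minimal sketch, assuming the release name `myvllm`, an in-cluster service port of `80`, a free local port `2083`, and that `Intel/neural-chat-7b-v3-3` is the served model; substitute the `LLM_MODEL_ID` configured for your deployment.

```
# Forward the vLLM service locally (assumes release name "myvllm" exposing port 80)
kubectl port-forward svc/myvllm 2083:80 &
# Send an OpenAI-compatible chat completion request; "model" must match the served LLM_MODEL_ID
curl http://localhost:2083/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"model":"Intel/neural-chat-7b-v3-3","messages":[{"role":"user","content":"What is Deep Learning?"}],"max_tokens":32}'
```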
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install myvllm oci://ghcr.io/opea-project/charts/vllm --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install myvllm oci://ghcr.io/opea-project/charts/vllm --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/vllm/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/vllm/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..c2e01e4be7 --- /dev/null +++ b/comps/third_parties/vllm/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/vllm diff --git a/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..e9ddbed829 --- /dev/null +++ b/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +accelDevice: "gaudi" + +image: + repository: opea/vllm-gaudi + +# VLLM_CPU_KVCACHE_SPACE: "40" +OMPI_MCA_btl_vader_single_copy_mechanism: none +extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] +resources: + limits: + habana.ai/gaudi: 1 diff --git a/comps/third_parties/vllm/src/build_docker_vllm.sh b/comps/third_parties/vllm/src/build_docker_vllm.sh index bcbf20c4a3..20d4f8df52 100644 --- a/comps/third_parties/vllm/src/build_docker_vllm.sh +++ b/comps/third_parties/vllm/src/build_docker_vllm.sh @@ -37,7 +37,6 @@ fi if [ "$hw_mode" = "hpu" ]; then git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork/ - git checkout 3c39626 docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd .. rm -rf vllm-fork diff --git a/comps/third_parties/whisper/deployment/kubernetes/README.md b/comps/third_parties/whisper/deployment/kubernetes/README.md new file mode 100644 index 0000000000..3754916482 --- /dev/null +++ b/comps/third_parties/whisper/deployment/kubernetes/README.md @@ -0,0 +1,18 @@ +# Deploy whisper on kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install whisper oci://ghcr.io/opea-project/charts/whisper --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install whisper oci://ghcr.io/opea-project/charts/whisper --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/comps/third_parties/whisper/deployment/kubernetes/cpu-values.yaml b/comps/third_parties/whisper/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..f32f55f00f --- /dev/null +++ b/comps/third_parties/whisper/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/whisper diff --git a/comps/third_parties/whisper/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/whisper/deployment/kubernetes/gaudi-values.yaml new file mode 100644 index 0000000000..3ba40c4b8d --- /dev/null +++ b/comps/third_parties/whisper/deployment/kubernetes/gaudi-values.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +image: + repository: opea/whisper-gaudi + +resources: + limits: + habana.ai/gaudi: 1 diff --git a/comps/tts/deployment/kubernetes/README.md b/comps/tts/deployment/kubernetes/README.md new file mode 100644 index 0000000000..af1bcb05a3 --- /dev/null +++ b/comps/tts/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy tts microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install tts oci://ghcr.io/opea-project/charts/tts --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/tts/deployment/kubernetes/cpu-values.yaml b/comps/tts/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..c735ab48ab --- /dev/null +++ b/comps/tts/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +speecht5: + enabled: true diff --git a/comps/web_retrievers/deployment/kubernetes/README.md b/comps/web_retrievers/deployment/kubernetes/README.md new file mode 100644 index 0000000000..c361509fe8 --- /dev/null +++ b/comps/web_retrievers/deployment/kubernetes/README.md @@ -0,0 +1,11 @@ +# Deploy web-retriever microservice on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deployment options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
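For any of these charts (tgi, vllm, whisper, tts, web-retriever), a minimal post-install readiness check might look like the sketch below. The release name `whisper`, the `app.kubernetes.io/instance` label, and the default namespace are placeholders; substitute the release you actually installed.

```
# Confirm the Helm release exists and its workload came up (release name is a placeholder)
helm status whisper
kubectl wait --for=condition=available deployment -l app.kubernetes.io/instance=whisper --timeout=600s
kubectl get pods -l app.kubernetes.io/instance=whisper
```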
+ +## Deploy on Kubernetes + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install web-retriever oci://ghcr.io/opea-project/charts/web-retriever --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` diff --git a/comps/web_retrievers/deployment/kubernetes/cpu-values.yaml b/comps/web_retrievers/deployment/kubernetes/cpu-values.yaml new file mode 100644 index 0000000000..e2d62ff26f --- /dev/null +++ b/comps/web_retrievers/deployment/kubernetes/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tei: + enabled: true diff --git a/tests/agent/test.py b/tests/agent/test.py index e345e89420..2da0f6c8e6 100644 --- a/tests/agent/test.py +++ b/tests/agent/test.py @@ -10,7 +10,7 @@ def generate_answer_agent_api(url, prompt): proxies = {"http": ""} payload = { - "query": prompt, + "messages": prompt, } response = requests.post(url, json=payload, proxies=proxies) answer = response.json()["text"] @@ -21,7 +21,7 @@ def process_request(url, query, is_stream=False): proxies = {"http": ""} payload = { - "query": query, + "messages": query, } try: diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh new file mode 100644 index 0000000000..bbf9cd989a --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5072 + export DOCSUM_PORT=5073 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "vllm" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" + + echo "Validate tgi..." + validate_services \ + "${LLM_ENDPOINT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi-server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + echo "Validate stream=True..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh new file mode 100644 index 0000000000..ebfd5d8f2a --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_langchain_tgi_on_intel_hpu.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5071 + export DOCSUM_PORT=5072 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "vllm" + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" + + echo "Validate tgi..." + validate_services \ + "${LLM_ENDPOINT}/generate" \ + "generated_text" \ + "tgi" \ + "tgi_gaudi_server" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + + echo "Validate stream=True..." 
+ validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_tgi_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh new file mode 100644 index 0000000000..0e97d8e13b --- /dev/null +++ b/tests/llms/test_llms_doc-summarization_langchain_vllm_on_intel_hpu.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +host_ip=$(hostname -I | awk '{print $1}') +LOG_PATH="$WORKPATH/tests" + +function build_docker_images() { + cd $WORKPATH + git clone https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork/ + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . + if [ $? -ne 0 ]; then + echo "opea/vllm-gaudi built fail" + exit 1 + else + echo "opea/vllm-gaudi built successful" + fi + + cd $WORKPATH + docker build --no-cache -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-docsum built fail" + exit 1 + else + echo "opea/llm-docsum built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} + export LLM_ENDPOINT_PORT=5076 + export DOCSUM_PORT=5077 + export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export MAX_INPUT_TOKENS=2048 + export MAX_TOTAL_TOKENS=4096 + export DocSum_COMPONENT_NAME="OPEADocSum_vLLM" # or "vllm" + export VLLM_SKIP_WARMUP=true + export LOGFLAG=True + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 30s +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + echo $CONTENT + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_microservices() { + URL="http://${host_ip}:$DOCSUM_PORT/v1/docsum" + + echo "Validate vllm..." 
+ validate_services \ + "${LLM_ENDPOINT}/v1/completions" \ + "text" \ + "vllm" \ + "vllm-gaudi-server" \ + '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' + + echo "Validate stream=True..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' + + echo "Validate stream=False..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' + + echo "Validate Chinese mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' + + echo "Validate truncate mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' + + echo "Validate map_reduce mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' + + echo "Validate refine mode..." + validate_services \ + "$URL" \ + 'text' \ + "llm_summarization" \ + "llm-docsum-server" \ + '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f doc-summarization_vllm_on_intel_hpu.yaml down +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservices + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh index 37d3be22dc..57d4f4207d 100644 --- a/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_langchain_vllm_on_intel_hpu.sh @@ -12,7 +12,6 @@ function build_docker_images() { cd $WORKPATH git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ - git checkout 3c39626 docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . if [ $? -ne 0 ]; then echo "opea/vllm-gaudi built fail" diff --git a/tests/llms/test_llms_summarization_tgi_langchain.sh b/tests/llms/test_llms_summarization_tgi_langchain.sh deleted file mode 100644 index ee12777657..0000000000 --- a/tests/llms/test_llms_summarization_tgi_langchain.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe - -WORKPATH=$(dirname "$PWD") -ip_address=$(hostname -I | awk '{print $1}') -LOG_PATH="$WORKPATH/tests" - -function build_docker_images() { - cd $WORKPATH - docker build --no-cache -t opea/llm-sum-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/llm-textgen built fail" - exit 1 - else - echo "opea/llm-textgen built successful" - fi -} - -function start_service() { - tgi_endpoint_port=5075 - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export MAX_INPUT_TOKENS=2048 - export MAX_TOTAL_TOKENS=4096 - # Remember to set HF_TOKEN before invoking this test! 
- export HF_TOKEN=${HF_TOKEN} - docker run -d --name="test-comps-llm-sum-tgi-endpoint" -p $tgi_endpoint_port:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS} - export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint_port}" - - sum_port=5076 - docker run -d --name="test-comps-llm-sum-tgi-server" -p ${sum_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e LLM_MODEL_ID=$LLM_MODEL_ID -e MAX_INPUT_TOKENS=$MAX_INPUT_TOKENS -e MAX_TOTAL_TOKENS=$MAX_TOTAL_TOKENS -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN -e LOGFLAG=True opea/llm-sum-tgi:comps - - # check whether tgi is fully ready - n=0 - until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do - docker logs test-comps-llm-sum-tgi-endpoint > ${LOG_PATH}/test-comps-llm-sum-tgi-endpoint.log - n=$((n+1)) - if grep -q Connected ${LOG_PATH}/test-comps-llm-sum-tgi-endpoint.log; then - break - fi - sleep 5s - done - sleep 5s -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - echo $CONTENT - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - sum_port=5076 - URL="http://${ip_address}:$sum_port/v1/chat/docsum" - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "truncate"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "map_reduce"}' - - validate_services \ - "$URL" \ - 'text' \ - "llm_summarization" \ - "test-comps-llm-sum-tgi-server" \ - '{"query": "What is Deep Learning?", "summary_type": "refine"}' -} - -function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-llm-sum-tgi*") - if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservices - - stop_docker - echo y | docker system prune - -} - -main diff --git a/tests/llms/test_llms_text-generation_opea_vllm_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_opea_vllm_on_intel_hpu.sh index eb5911bb69..05c644ef06 100644 --- a/tests/llms/test_llms_text-generation_opea_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_opea_vllm_on_intel_hpu.sh @@ -12,7 +12,6 @@ function build_docker_images() { cd $WORKPATH git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ - git checkout 3c39626 docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . if [ $? -ne 0 ]; then echo "opea/vllm-gaudi built fail" diff --git a/tests/llms/test_llms_text-generation_predictionguard.sh b/tests/llms/test_llms_text-generation_predictionguard.sh index 6cb3507283..107bdd797b 100644 --- a/tests/llms/test_llms_text-generation_predictionguard.sh +++ b/tests/llms/test_llms_text-generation_predictionguard.sh @@ -13,22 +13,24 @@ fi function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/llm-pg:comps -f comps/llms/text-generation/predictionguard/Dockerfile . + docker build --no-cache -t opea/llm-textgen:comps -f comps/llms/src/text-generation/Dockerfile . if [ $? -ne 0 ]; then - echo "opea/llm-pg built failed" + echo "opea/llm-textgen built failed" exit 1 else - echo "opea/llm-pg built successfully" + echo "opea/llm-textgen built successfully" fi } function start_service() { llm_service_port=9000 unset http_proxy - docker run -d --name=test-comps-llm-pg-server \ + docker run -d --name=test-comps-llm-textgen-pg-server \ -e http_proxy= -e https_proxy= \ -e PREDICTIONGUARD_API_KEY=${PREDICTIONGUARD_API_KEY} \ - -p 9000:9000 --ipc=host opea/llm-pg:comps + -e LLM_COMPONENT_NAME="OPEATextGen_Predictionguard" \ + -e LOGFLAG=True \ + -p 9000:9000 --ipc=host opea/llm-textgen:comps sleep 60 # Sleep for 1 minute to allow the service to start } @@ -36,20 +38,20 @@ function validate_microservice() { llm_service_port=9000 result=$(http_proxy="" curl http://${ip_address}:${llm_service_port}/v1/chat/completions \ -X POST \ - -d '{"model": "Hermes-2-Pro-Llama-3-8B", "query": "What is AI?", "stream": false, "max_tokens": 100, "temperature": 0.7, "top_p": 1.0, "top_k": 50}' \ + -d '{"model": "Hermes-2-Pro-Llama-3-8B", "messages": "What is AI?", "stream": false, "max_tokens": 100, "temperature": 0.7, "top_p": 1.0, "top_k": 50}' \ -H 'Content-Type: application/json') - if [[ $result == *"text"* ]]; then + if [[ $result == *"content"* ]]; then echo "Service response is correct." else echo "Result wrong. Received was $result" - docker logs test-comps-llm-pg-server + docker logs test-comps-llm-textgen-pg-server exit 1 fi } function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-llm-pg-*") + cid=$(docker ps -aq --filter "name=test-comps-llm-textgen-pg-*") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi }