Skip to content

Commit

Permalink
Add chatqna wrapper for multiple model selection (#1144)
Browse files Browse the repository at this point in the history
Signed-off-by: lvliang-intel <[email protected]>
Co-authored-by: Ying Hu <[email protected]>
Co-authored-by: chen, suyue <[email protected]>
  • Loading branch information
3 people authored Nov 18, 2024
1 parent b1bb6db commit fb514bb
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 53 deletions.
32 changes: 32 additions & 0 deletions ChatQnA/Dockerfile.wrapper
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Image for the ChatQnA wrapper megaservice: installs GenAIComps from source
# and runs chatqna_wrapper.py (copied in as chatqna.py) as the entrypoint.
FROM python:3.11-slim

# System packages needed at build/run time. The apt package index is removed
# in the same layer so it does not end up in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    git && \
    rm -rf /var/lib/apt/lists/*

# Unprivileged account the service runs as (see USER below).
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

# Fetch the GenAIComps sources that provide the `comps` package.
WORKDIR /home/user/
RUN git clone https://github.com/opea-project/GenAIComps.git

# Install the Python dependencies listed by GenAIComps.
WORKDIR /home/user/GenAIComps
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt

# The wrapper script is installed under the name the ENTRYPOINT expects.
COPY ./chatqna_wrapper.py /home/user/chatqna.py

# Make the cloned GenAIComps checkout importable.
ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps

USER user

WORKDIR /home/user

# Appends a soft open-file limit to the user's ~/.bashrc.
# NOTE(review): the exec-form ENTRYPOINT below starts python directly, so this
# presumably only affects bash sessions opened in the container - confirm.
RUN echo 'ulimit -S -n 999999' >> ~/.bashrc

ENTRYPOINT ["python", "chatqna.py"]
68 changes: 68 additions & 0 deletions ChatQnA/chatqna_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType

# Address of the megaservice itself.
MEGA_SERVICE_HOST_IP = os.environ.get("MEGA_SERVICE_HOST_IP", "0.0.0.0")
MEGA_SERVICE_PORT = int(os.environ.get("MEGA_SERVICE_PORT", "8888"))

# Addresses of the remote microservices, one host/port pair per stage.
# Each value can be overridden through the environment variable of the same name.
EMBEDDING_SERVICE_HOST_IP = os.environ.get("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0")
EMBEDDING_SERVICE_PORT = int(os.environ.get("EMBEDDING_SERVICE_PORT", "6000"))

RETRIEVER_SERVICE_HOST_IP = os.environ.get("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0")
RETRIEVER_SERVICE_PORT = int(os.environ.get("RETRIEVER_SERVICE_PORT", "7000"))

RERANK_SERVICE_HOST_IP = os.environ.get("RERANK_SERVICE_HOST_IP", "0.0.0.0")
RERANK_SERVICE_PORT = int(os.environ.get("RERANK_SERVICE_PORT", "8000"))

LLM_SERVICE_HOST_IP = os.environ.get("LLM_SERVICE_HOST_IP", "0.0.0.0")
LLM_SERVICE_PORT = int(os.environ.get("LLM_SERVICE_PORT", "9000"))


class ChatQnAService:
    """Wire the ChatQnA pipeline out of remote microservices and expose it via a gateway.

    The stages are chained as embedding -> retriever -> rerank -> llm.
    """

    def __init__(self, host="0.0.0.0", port=8000):
        """Store the service address and create an empty orchestrator.

        Args:
            host: Address kept on the instance as ``self.host``.
            port: Port kept as ``self.port`` and later passed to the gateway.
        """
        self.host, self.port = host, port
        self.megaservice = ServiceOrchestrator()

    def add_remote_service(self):
        """Register the four remote stages, link them in order and create the gateway.

        Host and port of every stage come from the module-level ``*_SERVICE_HOST_IP``
        and ``*_SERVICE_PORT`` settings. The resulting gateway is stored on
        ``self.gateway``.
        """
        # (name, host, port, endpoint, service type) for each stage, in pipeline order.
        stage_specs = (
            ("embedding", EMBEDDING_SERVICE_HOST_IP, EMBEDDING_SERVICE_PORT, "/v1/embeddings", ServiceType.EMBEDDING),
            ("retriever", RETRIEVER_SERVICE_HOST_IP, RETRIEVER_SERVICE_PORT, "/v1/retrieval", ServiceType.RETRIEVER),
            ("rerank", RERANK_SERVICE_HOST_IP, RERANK_SERVICE_PORT, "/v1/reranking", ServiceType.RERANK),
            ("llm", LLM_SERVICE_HOST_IP, LLM_SERVICE_PORT, "/v1/chat/completions", ServiceType.LLM),
        )
        stages = [
            MicroService(
                name=stage_name,
                host=stage_host,
                port=stage_port,
                endpoint=stage_endpoint,
                use_remote_service=True,
                service_type=stage_type,
            )
            for stage_name, stage_host, stage_port, stage_endpoint, stage_type in stage_specs
        ]

        # Register every stage, chaining on the value returned by each add() call.
        registry = self.megaservice
        for stage in stages:
            registry = registry.add(stage)

        # Connect each stage to the one that follows it.
        for upstream, downstream in zip(stages, stages[1:]):
            self.megaservice.flow_to(upstream, downstream)

        # The gateway is always given "0.0.0.0" as host; self.host is not used here.
        self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)


if __name__ == "__main__":
    # Build the service from the environment-derived settings, then register
    # the remote stages and create the gateway.
    service = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT)
    service.add_remote_service()
6 changes: 6 additions & 0 deletions ChatQnA/docker_image_build/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ services:
context: ../
dockerfile: ./Dockerfile
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
# Build entry for the ChatQnA wrapper image, built from Dockerfile.wrapper.
chatqna-wrapper:
build:
context: ../
dockerfile: ./Dockerfile.wrapper
# Extends the chatqna service entry; only the build file and image tag differ here.
extends: chatqna
image: ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest}
chatqna-guardrails:
build:
context: ../
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1184,13 +1184,8 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
<<<<<<< HEAD
image: "ghcr.io/huggingface/tgi-gaudi:2.0.6"
imagePullPolicy: IfNotPresent
=======
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
imagePullPolicy: Always
>>>>>>> e3187be819ad088c24bf1b2cbb419255af0f2be3
volumeMounts:
- mountPath: /data
name: model-volume
Expand Down
113 changes: 65 additions & 48 deletions ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ metadata:
app.kubernetes.io/managed-by: Helm
data:
TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei"
HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
https_proxy: ""
no_proxy: ""
Expand Down Expand Up @@ -70,9 +71,8 @@ data:
no_proxy: ""
LOGFLAG: ""
vLLM_ENDPOINT: "http://chatqna-vllm"
HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
LLM_MODEL: "meta-llama/Llama-3.1-70B-Instruct"
MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
LLM_MODEL: "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
---
# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
Expand Down Expand Up @@ -145,7 +145,6 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Source: chatqna/charts/teirerank/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
Expand All @@ -170,6 +169,7 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
Expand All @@ -183,7 +183,7 @@ metadata:
app.kubernetes.io/instance: chatqna
app.kubernetes.io/version: "2.1.0"
data:
MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
PORT: "2080"
HF_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
Expand All @@ -194,6 +194,12 @@ data:
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
HF_HOME: "/tmp/.cache/huggingface"
GPU_MEMORY_UTILIZATION: "0.5"
DTYPE: "auto"
TENSOR_PARALLEL_SIZE: "1"
BLOCK_SIZE: "128"
MAX_NUM_SEQS: "256"
MAX_SEQ_LEN_TO_CAPTURE: "2048"
---
# Source: chatqna/templates/nginx-deployment.yaml
apiVersion: v1
Expand Down Expand Up @@ -649,7 +655,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/dataprep-redis:v0.9"
image: "opea/dataprep-redis:latest"
imagePullPolicy: Always
ports:
- name: data-prep
Expand Down Expand Up @@ -1103,10 +1109,8 @@ spec:
- configMapRef:
name: chatqna-tei-config
securityContext:
privileged: true
capabilities:
add: ["SYS_NICE"]
image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
{}
image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
imagePullPolicy: IfNotPresent
args:
- "--auto-truncate"
Expand Down Expand Up @@ -1140,16 +1144,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
limits:
habana.ai/gaudi: 1
cpu: 10
memory: 100Gi
hugepages-2Mi: 9800Mi
requests:
habana.ai/gaudi: 1
cpu: 10
memory: 100Gi
hugepages-2Mi: 9800Mi
{}

volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
Expand Down Expand Up @@ -1191,11 +1187,17 @@ spec:
- configMapRef:
name: chatqna-teirerank-config
securityContext:
{}
image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
imagePullPolicy: Always
args:
- "--auto-truncate"
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
name: model-volume
Expand Down Expand Up @@ -1228,7 +1230,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
{}
limits:
habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
Expand All @@ -1242,6 +1245,7 @@ spec:
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


apiVersion: apps/v1
kind: Deployment
metadata:
Expand Down Expand Up @@ -1271,16 +1275,36 @@ spec:
- configMapRef:
name: chatqna-vllm-config
securityContext:
privileged: true
allowPrivilegeEscalation: false
capabilities:
add: ["SYS_NICE"]
image: "opea/llm-vllm-hpu:latest"
command:
- /bin/bash
- -c
- |
export VLLM_CPU_KVCACHE_SPACE=40 && \
python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/vllm-hpu:latest"
args:
- "--enforce-eager"
- "--model"
- "$(MODEL_ID)"
- "--tensor-parallel-size"
- "1"
- "--gpu-memory-utilization"
- "$(GPU_MEMORY_UTILIZATION)"
- "--dtype"
- "$(DTYPE)"
- "--max-num-seqs"
- "$(MAX_NUM_SEQS)"
- "--block-size"
- "$(BLOCK_SIZE)"
- "--max-seq-len-to-capture"
- "$(MAX_SEQ_LEN_TO_CAPTURE)"
- "--host"
- "0.0.0.0"
- "--port"
- "$(PORT)"
imagePullPolicy: Always
volumeMounts:
- mountPath: /data
Expand All @@ -1293,20 +1317,13 @@ spec:
protocol: TCP
resources:
limits:
habana.ai/gaudi: 8
cpu: 40
memory: 400Gi
hugepages-2Mi: 9800Mi
requests:
habana.ai/gaudi: 8
cpu: 40
memory: 400Gi
hugepages-2Mi: 9800Mi
habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
- name: tmp
emptyDir: {}

---
# Source: chatqna/templates/deployment.yaml
# Copyright (C) 2024 Intel Corporation
Expand Down Expand Up @@ -1350,8 +1367,8 @@ spec:
value: chatqna-retriever-usvc
- name: EMBEDDING_SERVICE_HOST_IP
value: chatqna-embedding-usvc
- name: GUARDRAIL_SERVICE_HOST_IP
value: chatqna-guardrails-usvc
- name: MODEL_ID
value: "meta-llama/Meta-Llama-3-8B-Instruct"
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand All @@ -1362,8 +1379,8 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/chatqna:latest"
imagePullPolicy: Always
image: "opea/chatqna-wrapper:latest"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /tmp
name: tmp
Expand Down
Loading

0 comments on commit fb514bb

Please sign in to comment.