From fb514bb8ba82ee63ab6db83813f11a2b824fa75e Mon Sep 17 00:00:00 2001 From: Liang Lv Date: Mon, 18 Nov 2024 10:48:09 +0800 Subject: [PATCH] Add chatqna wrapper for multiple model selection (#1144) Signed-off-by: lvliang-intel Co-authored-by: Ying Hu Co-authored-by: chen, suyue --- ChatQnA/Dockerfile.wrapper | 32 +++++ ChatQnA/chatqna_wrapper.py | 68 +++++++++++ ChatQnA/docker_image_build/build.yaml | 6 + .../gaudi/manifest/chatqna-guardrails.yaml | 5 - .../hpu/gaudi/manifest/chatqna-vllm.yaml | 113 ++++++++++-------- ChatQnA/tests/test_manifest_on_gaudi.sh | 91 ++++++++++++++ 6 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 ChatQnA/Dockerfile.wrapper create mode 100644 ChatQnA/chatqna_wrapper.py diff --git a/ChatQnA/Dockerfile.wrapper b/ChatQnA/Dockerfile.wrapper new file mode 100644 index 0000000000..a9e4fb5444 --- /dev/null +++ b/ChatQnA/Dockerfile.wrapper @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +WORKDIR /home/user/ +RUN git clone https://github.com/opea-project/GenAIComps.git + +WORKDIR /home/user/GenAIComps +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + +COPY ./chatqna_wrapper.py /home/user/chatqna.py + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps + +USER user + +WORKDIR /home/user + +RUN echo 'ulimit -S -n 999999' >> ~/.bashrc + +ENTRYPOINT ["python", "chatqna.py"] diff --git a/ChatQnA/chatqna_wrapper.py b/ChatQnA/chatqna_wrapper.py new file mode 100644 index 0000000000..09062b5d27 --- /dev/null +++ b/ChatQnA/chatqna_wrapper.py @@ -0,0 +1,68 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") +MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) +EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") +EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) +RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") +RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) +RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") +RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) +LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") +LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) + + +class ChatQnAService: + def __init__(self, host="0.0.0.0", port=8000): + self.host = host + self.port = port + self.megaservice = ServiceOrchestrator() + + def add_remote_service(self): + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVICE_HOST_IP, + port=EMBEDDING_SERVICE_PORT, + endpoint="/v1/embeddings", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + rerank = MicroService( + name="rerank", + host=RERANK_SERVICE_HOST_IP, + port=RERANK_SERVICE_PORT, + endpoint="/v1/reranking", + use_remote_service=True, + 
service_type=ServiceType.RERANK, + ) + llm = MicroService( + name="llm", + host=LLM_SERVICE_HOST_IP, + port=LLM_SERVICE_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + self.megaservice.add(embedding).add(retriever).add(rerank).add(llm) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, rerank) + self.megaservice.flow_to(rerank, llm) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + +if __name__ == "__main__": + chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) + chatqna.add_remote_service() diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index 7be5141ead..aab333ec8b 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -11,6 +11,12 @@ services: context: ../ dockerfile: ./Dockerfile image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} + chatqna-wrapper: + build: + context: ../ + dockerfile: ./Dockerfile.wrapper + extends: chatqna + image: ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest} chatqna-guardrails: build: context: ../ diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml index a802889f8b..a96b7ad7fb 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml @@ -1184,13 +1184,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault -<<<<<<< HEAD image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" - imagePullPolicy: IfNotPresent -======= - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: Always ->>>>>>> e3187be819ad088c24bf1b2cbb419255af0f2be3 volumeMounts: - mountPath: /data name: model-volume diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml index 949e7cd8ea..ec7a542264 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml @@ -43,6 +43,7 @@ metadata: app.kubernetes.io/managed-by: Helm data: TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" https_proxy: "" no_proxy: "" @@ -70,9 +71,8 @@ data: no_proxy: "" LOGFLAG: "" vLLM_ENDPOINT: "http://chatqna-vllm" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - LLM_MODEL: "meta-llama/Llama-3.1-70B-Instruct" - MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct" + LLM_MODEL: "meta-llama/Meta-Llama-3-8B-Instruct" + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" --- # Source: chatqna/charts/reranking-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation @@ -145,7 +145,6 @@ data: NUMBA_CACHE_DIR: "/tmp" TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" - MAX_WARMUP_SEQUENCE_LENGTH: "512" --- # Source: chatqna/charts/teirerank/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation @@ -170,6 +169,7 @@ data: NUMBA_CACHE_DIR: "/tmp" TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" --- # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -183,7 +183,7 @@ metadata: app.kubernetes.io/instance: chatqna app.kubernetes.io/version: "2.1.0" data: - MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct" + MODEL_ID: 
"meta-llama/Meta-Llama-3-8B-Instruct" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -194,6 +194,12 @@ data: PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" OMPI_MCA_btl_vader_single_copy_mechanism: "none" HF_HOME: "/tmp/.cache/huggingface" + GPU_MEMORY_UTILIZATION: "0.5" + DTYPE: "auto" + TENSOR_PARALLEL_SIZE: "1" + BLOCK_SIZE: "128" + MAX_NUM_SEQS: "256" + MAX_SEQ_LEN_TO_CAPTURE: "2048" --- # Source: chatqna/templates/nginx-deployment.yaml apiVersion: v1 @@ -649,7 +655,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "opea/dataprep-redis:v0.9" + image: "opea/dataprep-redis:latest" imagePullPolicy: Always ports: - name: data-prep @@ -1103,10 +1109,8 @@ spec: - configMapRef: name: chatqna-tei-config securityContext: - privileged: true - capabilities: - add: ["SYS_NICE"] - image: "ghcr.io/huggingface/tei-gaudi:1.5.0" + {} + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" imagePullPolicy: IfNotPresent args: - "--auto-truncate" @@ -1140,16 +1144,8 @@ spec: initialDelaySeconds: 5 periodSeconds: 5 resources: - limits: - habana.ai/gaudi: 1 - cpu: 10 - memory: 100Gi - hugepages-2Mi: 9800Mi - requests: - habana.ai/gaudi: 1 - cpu: 10 - memory: 100Gi - hugepages-2Mi: 9800Mi + {} + volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} @@ -1191,11 +1187,17 @@ spec: - configMapRef: name: chatqna-teirerank-config securityContext: - {} - image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: Always - args: - - "--auto-truncate" + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tei-gaudi:1.5.0" + imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data name: model-volume @@ -1228,7 +1230,8 @@ spec: initialDelaySeconds: 5 periodSeconds: 5 resources: - {} + limits: + habana.ai/gaudi: 1 volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} @@ -1242,6 +1245,7 @@ spec: # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + apiVersion: apps/v1 kind: Deployment metadata: @@ -1271,16 +1275,36 @@ spec: - configMapRef: name: chatqna-vllm-config securityContext: - privileged: true + allowPrivilegeEscalation: false capabilities: - add: ["SYS_NICE"] - image: "opea/llm-vllm-hpu:latest" - command: - - /bin/bash - - -c - - | - export VLLM_CPU_KVCACHE_SPACE=40 && \ - python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/vllm-hpu:latest" + args: + - "--enforce-eager" + - "--model" + - "$(MODEL_ID)" + - "--tensor-parallel-size" + - "1" + - "--gpu-memory-utilization" + - "$(GPU_MEMORY_UTILIZATION)" + - "--dtype" + - "$(DTYPE)" + - "--max-num-seqs" + - "$(MAX_NUM_SEQS)" + - "--block-size" + - "$(BLOCK_SIZE)" + - "--max-seq-len-to-capture" + - "$(MAX_SEQ_LEN_TO_CAPTURE)" + - "--host" + - "0.0.0.0" + - "--port" + - "$(PORT)" imagePullPolicy: Always volumeMounts: - mountPath: /data @@ -1293,20 +1317,13 @@ spec: protocol: TCP resources: limits: - habana.ai/gaudi: 8 - cpu: 40 - memory: 400Gi - hugepages-2Mi: 9800Mi - requests: - habana.ai/gaudi: 8 - cpu: 40 - memory: 
400Gi - hugepages-2Mi: 9800Mi + habana.ai/gaudi: 1 volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} - name: tmp emptyDir: {} + --- # Source: chatqna/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation @@ -1350,8 +1367,8 @@ spec: value: chatqna-retriever-usvc - name: EMBEDDING_SERVICE_HOST_IP value: chatqna-embedding-usvc - - name: GUARDRAIL_SERVICE_HOST_IP - value: chatqna-guardrails-usvc + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" securityContext: allowPrivilegeEscalation: false capabilities: @@ -1362,8 +1379,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "opea/chatqna:latest" - imagePullPolicy: Always + image: "opea/chatqna-wrapper:latest" + imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /tmp name: tmp diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh index 8bcccab377..170f04541d 100755 --- a/ChatQnA/tests/test_manifest_on_gaudi.sh +++ b/ChatQnA/tests/test_manifest_on_gaudi.sh @@ -99,6 +99,69 @@ function validate_chatqna() { } +function validate_chatqna_vllm() { + local ns=$1 + local log=$2 + max_retry=20 + # make sure microservice retriever-usvc is ready + # try to curl retriever-svc for max_retry times + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + for ((i=1; i<=max_retry; i++)) + do + endpoint_url=$(get_end_point "chatqna-retriever-usvc" $ns) + curl http://$endpoint_url/v1/retrieval -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ + -H 'Content-Type: application/json' && break + sleep 30 + done + # if i is bigger than max_retry, then exit with error + if [ $i -gt $max_retry ]; then + echo "Microservice retriever failed, exit with error." + return 1 + fi + + # make sure microservice vllm-svc is ready + for ((i=1; i<=max_retry; i++)) + do + endpoint_url=$(get_end_point "chatqna-vllm" $ns) + curl http://$endpoint_url/v1/chat/completions -X POST \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \ + -H 'Content-Type: application/json' && break + sleep 30 + done + # if i is bigger than max_retry, then exit with error + if [ $i -gt $max_retry ]; then + echo "Microservice vllm failed, exit with error." + return 1 + fi + + # check megaservice works + # generate a random logfile name to avoid conflict among multiple runners + LOGFILE=$LOG_PATH/curlmega_$log.log + endpoint_url=$(get_end_point "chatqna" $ns) + curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE + exit_code=$? + if [ $exit_code -ne 0 ]; then + echo "Megaservice failed, please check the logs in $LOGFILE!" + return ${exit_code} + fi + + echo "Checking response results, make sure the output is reasonable. " + local status=false + if [[ -f $LOGFILE ]] && + [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then + status=true + fi + if [ $status == false ]; then + echo "Response check failed, please check the logs in artifacts!" + return 1 + else + echo "Response check succeed!" 
+ fi
+ return 0
+}
+
+
 function _cleanup_ns() {
 local ns=$1
 if kubectl get ns $ns; then
@@ -133,6 +196,30 @@ function install_and_validate_chatqna_guardrail() {
 fi
 }
 
+function install_and_validate_chatqna_vllm() {
+ echo "Testing manifests chatqna_vllm"
+ local ns=${NAMESPACE}
+ _cleanup_ns $ns
+ kubectl create namespace $ns
+ # install chatqna-vllm manifest
+ kubectl apply -f chatqna-vllm.yaml -n $ns
+ # Sleep long enough for chatqna_vllm to be ready; vLLM warmup takes about 5 minutes
+ sleep 280
+ if kubectl rollout status deployment -n "$ns" --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then
+ echo "Waiting for chatqna_vllm pod ready done!"
+ else
+ echo "Timeout waiting for chatqna_vllm pod ready!"
+ exit 1
+ fi
+
+ # validate chatqna-vllm
+ validate_chatqna_vllm $ns chatqna-vllm
+ local ret=$?
+ if [ $ret -ne 0 ]; then
+ exit 1
+ fi
+}
+
 if [ $# -eq 0 ]; then
 echo "Usage: $0 "
 exit 1
@@ -161,7 +248,11 @@ case "$1" in
 pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
 install_and_validate_chatqna_guardrail
 popd
+ pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
+ install_and_validate_chatqna_vllm
+ popd
 ;;
+
 *) echo "Unknown function: $1" ;;