From fb514bb8ba82ee63ab6db83813f11a2b824fa75e Mon Sep 17 00:00:00 2001 From: Liang Lv Date: Mon, 18 Nov 2024 10:48:09 +0800 Subject: [PATCH] Add chatqna wrapper for multiple model selection (#1144) Signed-off-by: lvliang-intel Co-authored-by: Ying Hu Co-authored-by: chen, suyue --- ChatQnA/Dockerfile.wrapper | 32 +++++ ChatQnA/chatqna_wrapper.py | 68 +++++++++++ ChatQnA/docker_image_build/build.yaml | 6 + .../gaudi/manifest/chatqna-guardrails.yaml | 5 - .../hpu/gaudi/manifest/chatqna-vllm.yaml | 113 ++++++++++-------- ChatQnA/tests/test_manifest_on_gaudi.sh | 91 ++++++++++++++ 6 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 ChatQnA/Dockerfile.wrapper create mode 100644 ChatQnA/chatqna_wrapper.py diff --git a/ChatQnA/Dockerfile.wrapper b/ChatQnA/Dockerfile.wrapper new file mode 100644 index 0000000000..a9e4fb5444 --- /dev/null +++ b/ChatQnA/Dockerfile.wrapper @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +WORKDIR /home/user/ +RUN git clone https://github.com/opea-project/GenAIComps.git + +WORKDIR /home/user/GenAIComps +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + +COPY ./chatqna_wrapper.py /home/user/chatqna.py + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps + +USER user + +WORKDIR /home/user + +RUN echo 'ulimit -S -n 999999' >> ~/.bashrc + +ENTRYPOINT ["python", "chatqna.py"] diff --git a/ChatQnA/chatqna_wrapper.py b/ChatQnA/chatqna_wrapper.py new file mode 100644 index 0000000000..09062b5d27 --- /dev/null +++ b/ChatQnA/chatqna_wrapper.py @@ -0,0 +1,68 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") +MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) +EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") +EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) +RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") +RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) +RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") +RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) +LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") +LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) + + +class ChatQnAService: + def __init__(self, host="0.0.0.0", port=8000): + self.host = host + self.port = port + self.megaservice = ServiceOrchestrator() + + def add_remote_service(self): + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVICE_HOST_IP, + port=EMBEDDING_SERVICE_PORT, + endpoint="/v1/embeddings", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + rerank = MicroService( + name="rerank", + host=RERANK_SERVICE_HOST_IP, + port=RERANK_SERVICE_PORT, + endpoint="/v1/reranking", + use_remote_service=True, + 
service_type=ServiceType.RERANK, + ) + llm = MicroService( + name="llm", + host=LLM_SERVICE_HOST_IP, + port=LLM_SERVICE_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + self.megaservice.add(embedding).add(retriever).add(rerank).add(llm) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, rerank) + self.megaservice.flow_to(rerank, llm) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + +if __name__ == "__main__": + chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) + chatqna.add_remote_service() diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index 7be5141ead..aab333ec8b 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -11,6 +11,12 @@ services: context: ../ dockerfile: ./Dockerfile image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} + chatqna-wrapper: + build: + context: ../ + dockerfile: ./Dockerfile.wrapper + extends: chatqna + image: ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest} chatqna-guardrails: build: context: ../ diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml index a802889f8b..a96b7ad7fb 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml @@ -1184,13 +1184,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault -<<<<<<< HEAD image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" - imagePullPolicy: IfNotPresent -======= - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" imagePullPolicy: Always ->>>>>>> e3187be819ad088c24bf1b2cbb419255af0f2be3 volumeMounts: - mountPath: /data name: model-volume diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml index 949e7cd8ea..ec7a542264 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml @@ -43,6 +43,7 @@ metadata: app.kubernetes.io/managed-by: Helm data: TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" https_proxy: "" no_proxy: "" @@ -70,9 +71,8 @@ data: no_proxy: "" LOGFLAG: "" vLLM_ENDPOINT: "http://chatqna-vllm" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - LLM_MODEL: "meta-llama/Llama-3.1-70B-Instruct" - MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct" + LLM_MODEL: "meta-llama/Meta-Llama-3-8B-Instruct" + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" --- # Source: chatqna/charts/reranking-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation @@ -145,7 +145,6 @@ data: NUMBA_CACHE_DIR: "/tmp" TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" - MAX_WARMUP_SEQUENCE_LENGTH: "512" --- # Source: chatqna/charts/teirerank/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation @@ -170,6 +169,7 @@ data: NUMBA_CACHE_DIR: "/tmp" TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" --- # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -183,7 +183,7 @@ metadata: app.kubernetes.io/instance: chatqna app.kubernetes.io/version: "2.1.0" data: - MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct" + MODEL_ID: 
"meta-llama/Meta-Llama-3-8B-Instruct" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -194,6 +194,12 @@ data: PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" OMPI_MCA_btl_vader_single_copy_mechanism: "none" HF_HOME: "/tmp/.cache/huggingface" + GPU_MEMORY_UTILIZATION: "0.5" + DTYPE: "auto" + TENSOR_PARALLEL_SIZE: "1" + BLOCK_SIZE: "128" + MAX_NUM_SEQS: "256" + MAX_SEQ_LEN_TO_CAPTURE: "2048" --- # Source: chatqna/templates/nginx-deployment.yaml apiVersion: v1 @@ -649,7 +655,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "opea/dataprep-redis:v0.9" + image: "opea/dataprep-redis:latest" imagePullPolicy: Always ports: - name: data-prep @@ -1103,10 +1109,8 @@ spec: - configMapRef: name: chatqna-tei-config securityContext: - privileged: true - capabilities: - add: ["SYS_NICE"] - image: "ghcr.io/huggingface/tei-gaudi:1.5.0" + {} + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" imagePullPolicy: IfNotPresent args: - "--auto-truncate" @@ -1140,16 +1144,8 @@ spec: initialDelaySeconds: 5 periodSeconds: 5 resources: - limits: - habana.ai/gaudi: 1 - cpu: 10 - memory: 100Gi - hugepages-2Mi: 9800Mi - requests: - habana.ai/gaudi: 1 - cpu: 10 - memory: 100Gi - hugepages-2Mi: 9800Mi + {} + volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} @@ -1191,11 +1187,17 @@ spec: - configMapRef: name: chatqna-teirerank-config securityContext: - {} - image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: Always - args: - - "--auto-truncate" + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tei-gaudi:1.5.0" + imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data name: model-volume @@ -1228,7 +1230,8 @@ spec: initialDelaySeconds: 5 periodSeconds: 5 resources: - {} + limits: + habana.ai/gaudi: 1 volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} @@ -1242,6 +1245,7 @@ spec: # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + apiVersion: apps/v1 kind: Deployment metadata: @@ -1271,16 +1275,36 @@ spec: - configMapRef: name: chatqna-vllm-config securityContext: - privileged: true + allowPrivilegeEscalation: false capabilities: - add: ["SYS_NICE"] - image: "opea/llm-vllm-hpu:latest" - command: - - /bin/bash - - -c - - | - export VLLM_CPU_KVCACHE_SPACE=40 && \ - python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "opea/vllm-hpu:latest" + args: + - "--enforce-eager" + - "--model" + - "$(MODEL_ID)" + - "--tensor-parallel-size" + - "1" + - "--gpu-memory-utilization" + - "$(GPU_MEMORY_UTILIZATION)" + - "--dtype" + - "$(DTYPE)" + - "--max-num-seqs" + - "$(MAX_NUM_SEQS)" + - "--block-size" + - "$(BLOCK_SIZE)" + - "--max-seq-len-to-capture" + - "$(MAX_SEQ_LEN_TO_CAPTURE)" + - "--host" + - "0.0.0.0" + - "--port" + - "$(PORT)" imagePullPolicy: Always volumeMounts: - mountPath: /data @@ -1293,20 +1317,13 @@ spec: protocol: TCP resources: limits: - habana.ai/gaudi: 8 - cpu: 40 - memory: 400Gi - hugepages-2Mi: 9800Mi - requests: - habana.ai/gaudi: 8 - cpu: 40 - memory: 
400Gi - hugepages-2Mi: 9800Mi + habana.ai/gaudi: 1 volumes: - name: model-volume # Replace with Persistent volume claim/ host directory emptyDir: {} - name: tmp emptyDir: {} + --- # Source: chatqna/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation @@ -1350,8 +1367,8 @@ spec: value: chatqna-retriever-usvc - name: EMBEDDING_SERVICE_HOST_IP value: chatqna-embedding-usvc - - name: GUARDRAIL_SERVICE_HOST_IP - value: chatqna-guardrails-usvc + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" securityContext: allowPrivilegeEscalation: false capabilities: @@ -1362,8 +1379,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "opea/chatqna:latest" - imagePullPolicy: Always + image: "opea/chatqna-wrapper:latest" + imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /tmp name: tmp diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh index 8bcccab377..170f04541d 100755 --- a/ChatQnA/tests/test_manifest_on_gaudi.sh +++ b/ChatQnA/tests/test_manifest_on_gaudi.sh @@ -99,6 +99,69 @@ function validate_chatqna() { } +function validate_chatqna_vllm() { + local ns=$1 + local log=$2 + max_retry=20 + # make sure microservice retriever-usvc is ready + # try to curl retriever-svc for max_retry times + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + for ((i=1; i<=max_retry; i++)) + do + endpoint_url=$(get_end_point "chatqna-retriever-usvc" $ns) + curl http://$endpoint_url/v1/retrieval -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ + -H 'Content-Type: application/json' && break + sleep 30 + done + # if i is bigger than max_retry, then exit with error + if [ $i -gt $max_retry ]; then + echo "Microservice retriever failed, exit with error." + return 1 + fi + + # make sure microservice vllm-svc is ready + for ((i=1; i<=max_retry; i++)) + do + endpoint_url=$(get_end_point "chatqna-vllm" $ns) + curl http://$endpoint_url/v1/chat/completions -X POST \ + -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \ + -H 'Content-Type: application/json' && break + sleep 30 + done + # if i is bigger than max_retry, then exit with error + if [ $i -gt $max_retry ]; then + echo "Microservice vllm failed, exit with error." + return 1 + fi + + # check megaservice works + # generate a random logfile name to avoid conflict among multiple runners + LOGFILE=$LOG_PATH/curlmega_$log.log + endpoint_url=$(get_end_point "chatqna" $ns) + curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE + exit_code=$? + if [ $exit_code -ne 0 ]; then + echo "Megaservice failed, please check the logs in $LOGFILE!" + return ${exit_code} + fi + + echo "Checking response results, make sure the output is reasonable. " + local status=false + if [[ -f $LOGFILE ]] && + [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then + status=true + fi + if [ $status == false ]; then + echo "Response check failed, please check the logs in artifacts!" + return 1 + else + echo "Response check succeed!" 
+ fi
+ return 0
+}
+
+
 function _cleanup_ns() {
 local ns=$1
 if kubectl get ns $ns; then
@@ -133,6 +196,30 @@ function install_and_validate_chatqna_guardrail() {
 fi
 }
 
+function install_and_validate_chatqna_vllm() {
+ echo "Testing manifests chatqna_vllm"
+ local ns=${NAMESPACE}
+ _cleanup_ns $ns
+ kubectl create namespace $ns
+ # install chatqna-vllm manifest
+ kubectl apply -f chatqna-vllm.yaml -n $ns
+ # Sleep long enough for chatqna_vllm to be ready; vLLM warmup takes about 5 minutes
+ sleep 280
+ if kubectl rollout status deployment -n "$ns" --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then
+ echo "Waiting for chatqna_vllm pod ready done!"
+ else
+ echo "Timeout waiting for chatqna_vllm pod ready!"
+ exit 1
+ fi
+
+ # validate chatqna-vllm
+ validate_chatqna_vllm $ns chatqna-vllm
+ local ret=$?
+ if [ $ret -ne 0 ]; then
+ exit 1
+ fi
+}
+
 if [ $# -eq 0 ]; then
 echo "Usage: $0 "
 exit 1
@@ -161,7 +248,11 @@ case "$1" in
 pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
 install_and_validate_chatqna_guardrail
 popd
+ pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
+ install_and_validate_chatqna_vllm
+ popd
 ;;
+
 *) echo "Unknown function: $1" ;;