diff --git a/FaqGen/docker/gaudi/README.md b/FaqGen/docker/gaudi/README.md
index 12124cd1c..509cbface 100644
--- a/FaqGen/docker/gaudi/README.md
+++ b/FaqGen/docker/gaudi/README.md
@@ -16,7 +16,7 @@ cd GenAIComps
 As TGI Gaudi has been officially published as a Docker image, we simply need to pull it:

 ```bash
-docker pull ghcr.io/huggingface/tgi-gaudi:1.2.1
+docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
 ```

 ### 2. Build LLM Image
@@ -56,7 +56,7 @@ docker build -t opea/faqgen-react-ui:latest --build-arg https_proxy=$https_proxy

 Then run the command `docker images`, you will have the following Docker Images:

-1. `ghcr.io/huggingface/tgi-gaudi:1.2.1`
+1. `ghcr.io/huggingface/tgi-gaudi:2.0.1`
 2. `opea/llm-faqgen-tgi:latest`
 3. `opea/faqgen:latest`
 4. `opea/faqgen-ui:latest`
diff --git a/FaqGen/docker/gaudi/compose.yaml b/FaqGen/docker/gaudi/compose.yaml
index 6a5ec4db6..602105ec2 100644
--- a/FaqGen/docker/gaudi/compose.yaml
+++ b/FaqGen/docker/gaudi/compose.yaml
@@ -17,12 +17,14 @@ services:
       https_proxy: ${https_proxy}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      PREFILL_BATCH_BUCKET_SIZE: 1
+      BATCH_BUCKET_SIZE: 8
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
+    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 4096
   llm_faqgen:
     image: opea/llm-faqgen-tgi:latest
     container_name: llm-faqgen-server
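After `docker compose up`, the retuned TGI Gaudi container needs time to load the model before it serves requests. A minimal smoke test against TGI's native `/generate` route is a sketch only; the host port 8008 is an assumption based on the tgi_service port mapping elsewhere in this compose file, which the hunk above does not show.

```bash
# Smoke-test TGI directly (assumed host port mapping 8008:80 for tgi_service).
curl http://localhost:8008/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}'
```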
diff --git a/FaqGen/kubernetes/manifests/README.md b/FaqGen/kubernetes/manifests/README.md
index 37bab882c..dc0c06b5f 100644
--- a/FaqGen/kubernetes/manifests/README.md
+++ b/FaqGen/kubernetes/manifests/README.md
@@ -23,13 +23,24 @@ sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" faqg
 kubectl apply -f faqgen.yaml
 ```

+## Deploy UI
+
+```
+cd GenAIExamples/FaqGen/kubernetes/manifests/
+kubectl get svc # get the service address
+ip_address="" # set this according to your svc address
+sed -i "s/insert_your_ip_here/${ip_address}/g" ui.yaml
+kubectl apply -f ui.yaml
+```
+
 ## Verify Services

 Make sure all the pods are running, and restart the faqgen-xxxx pod if necessary.

 ```
 kubectl get pods
-curl http://${host_ip}:8888/v1/faqgen -H "Content-Type: application/json" -d '{
+port=7779 # 7779 for gaudi, 7778 for xeon
+curl http://${host_ip}:${port}/v1/faqgen -H "Content-Type: application/json" -d '{
   "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
 }'
 ```
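If the gateway port is not reachable from the machine running `curl`, `kubectl port-forward` gives an equivalent path to the mega-server service. This is a sketch, assuming the gaudi service name `faq-mega-server-svc` defined in the manifest below:

```bash
# Wait for the FaqGen pods to become Ready, then tunnel the gateway locally.
kubectl wait --for=condition=Ready pod --all --timeout=300s
kubectl port-forward svc/faq-mega-server-svc 7779:7779 &
curl http://localhost:7779/v1/faqgen -H "Content-Type: application/json" \
  -d '{"messages": "Text Embeddings Inference (TEI) is a toolkit for serving text embedding models."}'
```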
diff --git a/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml b/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
index 5e3c17771..24581e8a4 100644
--- a/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
+++ b/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
@@ -1,216 +1,186 @@
 ---
-# Source: codegen/charts/llm-uservice/charts/tgi/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen-tgi
-  labels:
-    helm.sh/chart: tgi-0.1.0
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.4"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 80
-      targetPort: 80
-      protocol: TCP
-      name: tgi
-  selector:
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen-llm-uservice
-  labels:
-    helm.sh/chart: llm-uservice-0.1.0
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 9000
-      targetPort: 9000
-      protocol: TCP
-      name: llm-uservice
-  selector:
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen
-  labels:
-    helm.sh/chart: faqgen-0.1.0
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 8888
-      targetPort: 8888
-      protocol: TCP
-      name: faqgen
-  selector:
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen-tgi
-  labels:
-    helm.sh/chart: tgi-0.1.0
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.4"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-tgi-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: tgi
-      app.kubernetes.io/instance: faqgen
+      app: faq-tgi-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
       labels:
-        app.kubernetes.io/name: tgi
-        app.kubernetes.io/instance: faqgen
+        app: faq-tgi-deploy
     spec:
-      securityContext: {}
+      hostIPC: true
       containers:
-        - name: tgi
-          env:
-            - name: MODEL_ID
-              value: Intel/neural-chat-7b-v3-3
-            - name: PORT
-              value: "80"
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: {}
-          image: "ghcr.io/huggingface/text-generation-inference:1.4"
-          imagePullPolicy: IfNotPresent
-          volumeMounts:
-            - mountPath: /data
-              name: model-volume
-          ports:
-            - name: http
-              containerPort: 80
-              protocol: TCP
-          resources: {}
+        - name: faq-tgi-deploy-demo
+          env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              value: "insert-your-huggingface-token-here"
+            - name: OMPI_MCA_btl_vader_single_copy_mechanism
+              value: none
+            - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
+              value: 'true'
+            - name: runtime
+              value: habana
+            - name: HABANA_VISIBLE_DEVICES
+              value: all
+            - name: PREFILL_BATCH_BUCKET_SIZE
+              value: "1"
+            - name: BATCH_BUCKET_SIZE
+              value: "8"
+            - name: PORT
+              value: "80"
+          image: ghcr.io/huggingface/tgi-gaudi:2.0.1
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            capabilities:
+              add:
+                - SYS_NICE
+          args:
+            - --model-id
+            - 'meta-llama/Meta-Llama-3-8B-Instruct'
+            - --max-input-length
+            - '3096'
+            - --max-total-tokens
+            - '4096'
+            - --max-batch-total-tokens
+            - '65536'
+            - --max-batch-prefill-tokens
+            - '4096'
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+          ports:
+            - containerPort: 80
+          resources:
+            limits:
+              habana.ai/gaudi: 1
+      serviceAccountName: default
       volumes:
-        - name: model-volume
-          hostPath:
-            path: /mnt
-            type: Directory
+        - name: model-volume
+          hostPath:
+            path: /home/sdp/cesg
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-tgi-svc
+spec:
+  type: ClusterIP
+  selector:
+    app: faq-tgi-deploy
+  ports:
+    - name: service
+      port: 8010
+      targetPort: 80
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen-llm-uservice
-  labels:
-    helm.sh/chart: llm-uservice-0.1.0
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-micro-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: llm-uservice
-      app.kubernetes.io/instance: faqgen
+      app: faq-micro-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
       labels:
-        app.kubernetes.io/name: llm-uservice
-        app.kubernetes.io/instance: faqgen
+        app: faq-micro-deploy
     spec:
-      securityContext: {}
+      hostIPC: true
       containers:
-        - name: faqgen
+        - name: faq-micro-deploy
           env:
             - name: TGI_LLM_ENDPOINT
-              value: "http://faqgen-tgi:80"
+              value: "http://faq-tgi-svc.default.svc.cluster.local:8010"
             - name: HUGGINGFACEHUB_API_TOKEN
               value: "insert-your-huggingface-token-here"
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: {}
-          image: "opea/llm-faqgen-tgi:latest"
+          image: opea/llm-faqgen-tgi:latest
           imagePullPolicy: IfNotPresent
+          args: null
           ports:
-            - name: llm-uservice
-              containerPort: 9000
-              protocol: TCP
-          startupProbe:
-            exec:
-              command:
-                - curl
-                - http://faqgen-tgi:80
-            initialDelaySeconds: 5
-            periodSeconds: 5
-            failureThreshold: 120
-          resources: {}
+            - containerPort: 9000
+      serviceAccountName: default
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-micro-svc
+spec:
+  type: ClusterIP
+  selector:
+    app: faq-micro-deploy
+  ports:
+    - name: service
+      port: 9003
+      targetPort: 9000
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen
-  labels:
-    helm.sh/chart: faqgen-0.1.0
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-mega-server-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: faqgen
-      app.kubernetes.io/instance: faqgen
+      app: faq-mega-server-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
       labels:
-        app.kubernetes.io/name: faqgen
-        app.kubernetes.io/instance: faqgen
+        app: faq-mega-server-deploy
     spec:
-      securityContext: null
+      hostIPC: true
       containers:
-        - name: faqgen
+        - name: faq-mega-server-deploy
           env:
             - name: LLM_SERVICE_HOST_IP
-              value: faqgen-llm-uservice
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: null
-          image: "opea/faqgen:latest"
+              value: faq-micro-svc
+            - name: LLM_SERVICE_PORT
+              value: "9003"
+            - name: MEGA_SERVICE_HOST_IP
+              value: faq-mega-server-svc
+            - name: MEGA_SERVICE_PORT
+              value: "7777"
+          image: opea/faqgen:latest
           imagePullPolicy: IfNotPresent
+          args: null
           ports:
-            - name: faqgen
-              containerPort: 8888
-              protocol: TCP
-          resources: null
+            - containerPort: 7777
+      serviceAccountName: default
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-mega-server-svc
+spec:
+  type: NodePort
+  selector:
+    app: faq-mega-server-deploy
+  ports:
+    - name: service
+      port: 7779
+      targetPort: 7777
+      nodePort: 30779
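The gaudi manifest pins the TGI pod to a single `habana.ai/gaudi` device, so the pod stays Pending if the Habana device plugin is missing. A quick check, assuming the plugin is installed and using the `app=faq-tgi-deploy` label from the manifest above:

```bash
# Confirm the nodes advertise Gaudi devices to the scheduler.
kubectl describe nodes | grep -A2 'habana.ai/gaudi'
# Verify the TGI pod was scheduled and pulled ghcr.io/huggingface/tgi-gaudi:2.0.1.
kubectl get pods -l app=faq-tgi-deploy -o wide
```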
diff --git a/FaqGen/kubernetes/manifests/ui.yaml b/FaqGen/kubernetes/manifests/ui.yaml
new file mode 100644
index 000000000..f74299a09
--- /dev/null
+++ b/FaqGen/kubernetes/manifests/ui.yaml
@@ -0,0 +1,46 @@
+---
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: faq-mega-ui-deploy
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: faq-mega-ui-deploy
+  template:
+    metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
+      labels:
+        app: faq-mega-ui-deploy
+    spec:
+      hostIPC: true
+      containers:
+        - name: faq-mega-ui-deploy
+          env:
+            - name: DOC_BASE_URL
+              value: http://{insert_your_ip_here}:7779/v1/faqgen
+          image: opea/faqgen-ui:latest
+          imagePullPolicy: IfNotPresent
+          args: null
+          ports:
+            - containerPort: 5173
+      serviceAccountName: default
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-mega-ui-svc
+spec:
+  type: NodePort
+  selector:
+    app: faq-mega-ui-deploy
+  ports:
+    - name: service
+      port: 5175
+      targetPort: 5173
+      nodePort: 30175
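Once `ui.yaml` is applied, the frontend is published as a NodePort (30175 forwarding to containerPort 5173) and posts to the gateway address substituted into `DOC_BASE_URL`. A minimal access check, with the node IP left as your own value:

```bash
kubectl get svc faq-mega-ui-svc  # expect PORT(S) 5175:30175/TCP
# Then browse to http://<node-ip>:30175; the UI calls DOC_BASE_URL,
# i.e. the http://{insert_your_ip_here}:7779/v1/faqgen endpoint set above.
```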
diff --git a/FaqGen/kubernetes/manifests/xeon/faqgen.yaml b/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
index 5e3c17771..b1d102df9 100644
--- a/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
+++ b/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
@@ -1,216 +1,165 @@
 ---
-# Source: codegen/charts/llm-uservice/charts/tgi/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen-tgi
-  labels:
-    helm.sh/chart: tgi-0.1.0
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.4"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 80
-      targetPort: 80
-      protocol: TCP
-      name: tgi
-  selector:
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen-llm-uservice
-  labels:
-    helm.sh/chart: llm-uservice-0.1.0
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 9000
-      targetPort: 9000
-      protocol: TCP
-      name: llm-uservice
-  selector:
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: faqgen
-  labels:
-    helm.sh/chart: faqgen-0.1.0
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 8888
-      targetPort: 8888
-      protocol: TCP
-      name: faqgen
-  selector:
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen-tgi
-  labels:
-    helm.sh/chart: tgi-0.1.0
-    app.kubernetes.io/name: tgi
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.4"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-tgi-cpu-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: tgi
-      app.kubernetes.io/instance: faqgen
+      app: faq-tgi-cpu-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
       labels:
-        app.kubernetes.io/name: tgi
-        app.kubernetes.io/instance: faqgen
+        app: faq-tgi-cpu-deploy
     spec:
+      hostIPC: true
       securityContext: {}
       containers:
-        - name: tgi
-          env:
-            - name: MODEL_ID
-              value: Intel/neural-chat-7b-v3-3
-            - name: PORT
-              value: "80"
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: {}
-          image: "ghcr.io/huggingface/text-generation-inference:1.4"
-          imagePullPolicy: IfNotPresent
-          volumeMounts:
-            - mountPath: /data
-              name: model-volume
-          ports:
-            - name: http
-              containerPort: 80
-              protocol: TCP
-          resources: {}
+        - name: faq-tgi-cpu-deploy-demo
+          env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              value: "insert-your-huggingface-token-here"
+            - name: PORT
+              value: "80"
+          image: ghcr.io/huggingface/text-generation-inference:1.4
+          imagePullPolicy: IfNotPresent
+          securityContext: {}
+          args:
+            - --model-id
+            - 'meta-llama/Meta-Llama-3-8B-Instruct'
+            - --max-input-length
+            - '3096'
+            - --max-total-tokens
+            - '4096'
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+          ports:
+            - containerPort: 80
+      serviceAccountName: default
       volumes:
-        - name: model-volume
-          hostPath:
-            path: /mnt
-            type: Directory
+        - name: model-volume
+          hostPath:
+            path: /home/sdp/cesg
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-tgi-cpu-svc
+spec:
+  type: ClusterIP
+  selector:
+    app: faq-tgi-cpu-deploy
+  ports:
+    - name: service
+      port: 8011
+      targetPort: 80
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen-llm-uservice
-  labels:
-    helm.sh/chart: llm-uservice-0.1.0
-    app.kubernetes.io/name: llm-uservice
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-micro-cpu-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: llm-uservice
-      app.kubernetes.io/instance: faqgen
+      app: faq-micro-cpu-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
      labels:
-        app.kubernetes.io/name: llm-uservice
-        app.kubernetes.io/instance: faqgen
+        app: faq-micro-cpu-deploy
     spec:
-      securityContext: {}
+      hostIPC: true
       containers:
-        - name: faqgen
+        - name: faq-micro-cpu-deploy
           env:
             - name: TGI_LLM_ENDPOINT
-              value: "http://faqgen-tgi:80"
+              value: "http://faq-tgi-cpu-svc.default.svc.cluster.local:8011"
             - name: HUGGINGFACEHUB_API_TOKEN
               value: "insert-your-huggingface-token-here"
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: {}
-          image: "opea/llm-faqgen-tgi:latest"
+          image: opea/llm-faqgen-tgi:latest
           imagePullPolicy: IfNotPresent
+          args: null
           ports:
-            - name: llm-uservice
-              containerPort: 9000
-              protocol: TCP
-          startupProbe:
-            exec:
-              command:
-                - curl
-                - http://faqgen-tgi:80
-            initialDelaySeconds: 5
-            periodSeconds: 5
-            failureThreshold: 120
-          resources: {}
+            - containerPort: 9000
+      serviceAccountName: default
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-micro-cpu-svc
+spec:
+  type: ClusterIP
+  selector:
+    app: faq-micro-cpu-deploy
+  ports:
+    - name: service
+      port: 9004
+      targetPort: 9000
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: faqgen
-  labels:
-    helm.sh/chart: faqgen-0.1.0
-    app.kubernetes.io/name: faqgen
-    app.kubernetes.io/instance: faqgen
-    app.kubernetes.io/version: "1.0.0"
-    app.kubernetes.io/managed-by: Helm
+  name: faq-mega-server-cpu-deploy
+  namespace: default
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app.kubernetes.io/name: faqgen
-      app.kubernetes.io/instance: faqgen
+      app: faq-mega-server-cpu-deploy
   template:
     metadata:
+      annotations:
+        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
       labels:
-        app.kubernetes.io/name: faqgen
-        app.kubernetes.io/instance: faqgen
+        app: faq-mega-server-cpu-deploy
     spec:
-      securityContext: null
+      hostIPC: true
       containers:
-        - name: faqgen
+        - name: faq-mega-server-cpu-deploy
           env:
             - name: LLM_SERVICE_HOST_IP
-              value: faqgen-llm-uservice
-            - name: http_proxy
-              value:
-            - name: https_proxy
-              value:
-            - name: no_proxy
-              value:
-          securityContext: null
-          image: "opea/faqgen:latest"
+              value: faq-micro-cpu-svc
+            - name: LLM_SERVICE_PORT
+              value: "9004"
+            - name: MEGA_SERVICE_HOST_IP
+              value: faq-mega-server-cpu-svc
+            - name: MEGA_SERVICE_PORT
+              value: "7777"
+          image: opea/faqgen:latest
           imagePullPolicy: IfNotPresent
+          args: null
           ports:
-            - name: faqgen
-              containerPort: 8888
-              protocol: TCP
-          resources: null
+            - containerPort: 7777
+      serviceAccountName: default
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: faq-mega-server-cpu-svc
+spec:
+  type: NodePort
+  selector:
+    app: faq-mega-server-cpu-deploy
+  ports:
+    - name: service
+      port: 7778
+      targetPort: 7777
+      nodePort: 30778
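On Xeon, the gateway service publishes port 7778 in-cluster and nodePort 30778 on every node, so from outside the cluster the NodePort is the working entry point. A closing sanity check, assuming a single-node cluster whose address lands in `host_ip`:

```bash
host_ip=$(hostname -I | awk '{print $1}')  # assumption: single-node cluster
curl http://${host_ip}:30778/v1/faqgen -H "Content-Type: application/json" \
  -d '{"messages": "Text Embeddings Inference (TEI) is a toolkit for serving text embedding models."}'
```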