diff --git a/ChatQnA/benchmark/four_gaudi/chatqna_config_map.yaml b/ChatQnA/benchmark/four_gaudi/chatqna_config_map.yaml new file mode 100644 index 0000000000..24b8e72df0 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/chatqna_config_map.yaml @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + RERANK_MODEL_ID: BAAI/bge-reranker-base + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: '{HF_TOKEN}'  # placeholder; quoted so YAML loads a string instead of a flow mapping + EMBEDDING_SERVICE_HOST_IP: embedding-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea + LLM_SERVICE_HOST_IP: llm-svc diff --git a/ChatQnA/benchmark/four_gaudi/chatqna_mega_service_run.yaml b/ChatQnA/benchmark/four_gaudi/chatqna_mega_service_run.yaml new file mode 100644 index 0000000000..6e93eb867f --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/chatqna_mega_service_run.yaml @@ -0,0 +1,62 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +
labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + args: null + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 4000Mi + requests: + cpu: 8 + memory: 4000Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: chaqna-backend-server-svc +spec: + type: NodePort + selector: + app: chatqna-backend-server-deploy + ports: + - name: service + port: 8888 + targetPort: 8888 + nodePort: 30888 diff --git a/ChatQnA/benchmark/four_gaudi/dataprep-microservice_run.yaml b/ChatQnA/benchmark/four_gaudi/dataprep-microservice_run.yaml new file mode 100644 index 0000000000..14fae684c0 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/dataprep-microservice_run.yaml @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: dataprep-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + args: null + ports: + - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + 
name: dataprep-svc +spec: + type: ClusterIP + selector: + app: dataprep-deploy + ports: + - name: port1 + port: 6007 + targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 diff --git a/ChatQnA/benchmark/four_gaudi/embedding-dependency_run.yaml b/ChatQnA/benchmark/four_gaudi/embedding-dependency_run.yaml new file mode 100644 index 0000000000..7dcb10342a --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/embedding-dependency_run.yaml @@ -0,0 +1,69 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 6 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + containers: + - envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + name: embedding-dependency-deploy + args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /home/sdp/cesg + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-dependency-svc +spec: + type: ClusterIP + selector: + app: embedding-dependency-deploy + ports: + - name: service + port: 6006 + targetPort: 80 diff --git a/ChatQnA/benchmark/four_gaudi/embedding-microservice_run.yaml b/ChatQnA/benchmark/four_gaudi/embedding-microservice_run.yaml new file mode 100644 index 0000000000..f23ba0b4fa --- /dev/null 
+++ b/ChatQnA/benchmark/four_gaudi/embedding-microservice_run.yaml @@ -0,0 +1,59 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: embedding-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/embedding-tei:latest + imagePullPolicy: IfNotPresent + name: embedding-deploy + args: null + ports: + - containerPort: 6000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: embedding-svc +spec: + type: ClusterIP + selector: + app: embedding-deploy + ports: + - name: service + port: 6000 + targetPort: 6000 diff --git a/ChatQnA/benchmark/four_gaudi/llm-dependency_run.yaml b/ChatQnA/benchmark/four_gaudi/llm-dependency_run.yaml new file mode 100644 index 0000000000..17766cd483 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/llm-dependency_run.yaml @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 31 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: tgi_gaudi:2.0.1 + name: 
llm-dependency-deploy-demo + securityContext: + capabilities: + add: + - SYS_NICE + args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: $(HUGGINGFACEHUB_API_TOKEN)  # expands the token key that envFrom loads from qna-config + serviceAccountName: default + volumes: + - name: model-volume + hostPath: + path: /home/sdp/cesg + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-dependency-svc +spec: + type: ClusterIP + selector: + app: llm-dependency-deploy + ports: + - name: service + port: 9009 + targetPort: 80 diff --git a/ChatQnA/benchmark/four_gaudi/llm-microservice_run.yaml b/ChatQnA/benchmark/four_gaudi/llm-microservice_run.yaml new file mode 100644 index 0000000000..1d9e291122 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/llm-microservice_run.yaml @@ -0,0 +1,59 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: llm-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image:
opea/llm-tgi:latest + imagePullPolicy: IfNotPresent + name: llm-deploy + args: null + ports: + - containerPort: 9000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: llm-svc +spec: + type: ClusterIP + selector: + app: llm-deploy + ports: + - name: service + port: 9000 + targetPort: 9000 diff --git a/ChatQnA/benchmark/four_gaudi/reranking-dependency_run.yaml b/ChatQnA/benchmark/four_gaudi/reranking-dependency_run.yaml new file mode 100644 index 0000000000..3f595ae1e6 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/reranking-dependency_run.yaml @@ -0,0 +1,85 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: reranking-dependency-deploy + containers: + - envFrom: + - configMapRef: + name: qna-config + image: tei_gaudi:rerank + name: reranking-dependency-deploy + args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: $(HUGGINGFACEHUB_API_TOKEN)  # expands the token key that envFrom loads from qna-config + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + serviceAccountName: default + volumes: +
- name: model-volume + hostPath: + path: /home/sdp/cesg + type: Directory + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi +--- +kind: Service +apiVersion: v1 +metadata: + name: reranking-dependency-svc +spec: + type: ClusterIP + selector: + app: reranking-dependency-deploy + ports: + - name: service + port: 8808 + targetPort: 80 diff --git a/ChatQnA/benchmark/four_gaudi/reranking-microservice_run.yaml b/ChatQnA/benchmark/four_gaudi/reranking-microservice_run.yaml new file mode 100644 index 0000000000..25f6a00b38 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/reranking-microservice_run.yaml @@ -0,0 +1,59 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: reranking-deploy + hostIPC: true + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/reranking-tei:latest + imagePullPolicy: IfNotPresent + name: reranking-deploy + args: null + ports: + - containerPort: 8000 + resources: + limits: + cpu: 4 + requests: + cpu: 4 + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: reranking-svc +spec: + type: ClusterIP + selector: + app: reranking-deploy + ports: + - name: service + port: 8000 + targetPort: 8000 diff --git a/ChatQnA/benchmark/four_gaudi/retrieval-microservice_run.yaml b/ChatQnA/benchmark/four_gaudi/retrieval-microservice_run.yaml new file mode 100644 index 0000000000..40040ee5bd --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/retrieval-microservice_run.yaml @@ -0,0 +1,69 @@ 
+# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: retriever-deploy + hostIPC: true + containers: + - env: + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: qna-config + key: REDIS_URL + - name: INDEX_NAME + valueFrom: + configMapKeyRef: + name: qna-config + key: INDEX_NAME + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + args: null + ports: + - containerPort: 7000 + resources: + limits: + cpu: 8 + memory: 2500Mi + requests: + cpu: 8 + memory: 2500Mi + serviceAccountName: default +--- +kind: Service +apiVersion: v1 +metadata: + name: retriever-svc +spec: + type: ClusterIP + selector: + app: retriever-deploy + ports: + - name: service + port: 7000 + targetPort: 7000 diff --git a/ChatQnA/benchmark/four_gaudi/vector-db_run.yaml b/ChatQnA/benchmark/four_gaudi/vector-db_run.yaml new file mode 100644 index 0000000000..e04e8c5fe7 --- /dev/null +++ b/ChatQnA/benchmark/four_gaudi/vector-db_run.yaml @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + labels: + app: vector-db + spec: + nodeSelector: + node-type: chatqna-opea + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: vector-db + containers: 
+ - name: vector-db + image: redis/redis-stack:7.2.0-v9 + ports: + - containerPort: 6379 + - containerPort: 8001 +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db +spec: + type: ClusterIP + selector: + app: vector-db + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 diff --git a/ChatQnA/benchmark/single_gaudi/chatqna_config_map.yaml b/ChatQnA/benchmark/single_gaudi/chatqna_config_map.yaml index 0fbc55dbce..24b8e72df0 100644 --- a/ChatQnA/benchmark/single_gaudi/chatqna_config_map.yaml +++ b/ChatQnA/benchmark/single_gaudi/chatqna_config_map.yaml @@ -15,7 +15,9 @@ data: TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 INDEX_NAME: rag-redis + HUGGINGFACEHUB_API_TOKEN: '{HF_TOKEN}' EMBEDDING_SERVICE_HOST_IP: embedding-svc RETRIEVER_SERVICE_HOST_IP: retriever-svc RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea LLM_SERVICE_HOST_IP: llm-svc diff --git a/ChatQnA/benchmark/single_gaudi/chatqna_mega_service_run.yaml b/ChatQnA/benchmark/single_gaudi/chatqna_mega_service_run.yaml index e255b17e7f..6e93eb867f 100644 --- a/ChatQnA/benchmark/single_gaudi/chatqna_mega_service_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/chatqna_mega_service_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: chatqna-backend-server-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/dataprep-microservice_run.yaml b/ChatQnA/benchmark/single_gaudi/dataprep-microservice_run.yaml index de4d0716a7..14fae684c0 100644 --- a/ChatQnA/benchmark/single_gaudi/dataprep-microservice_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/dataprep-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: dataprep-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1
topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/embedding-dependency_run.yaml b/ChatQnA/benchmark/single_gaudi/embedding-dependency_run.yaml index 11d35cfcf5..d692876aaf 100644 --- a/ChatQnA/benchmark/single_gaudi/embedding-dependency_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/embedding-dependency_run.yaml @@ -7,7 +7,7 @@ metadata: name: embedding-dependency-deploy namespace: default spec: - replicas: 4 + replicas: 1 selector: matchLabels: app: embedding-dependency-deploy @@ -18,11 +18,13 @@ spec: labels: app: embedding-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea containers: - envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 name: embedding-dependency-deploy args: - --model-id @@ -37,11 +39,11 @@ spec: - containerPort: 80 resources: limits: - cpu: 24 - memory: 4000Mi + cpu: 80 + memory: 20000Mi requests: - cpu: 24 - memory: 4000Mi + cpu: 80 + memory: 20000Mi serviceAccountName: default volumes: - name: model-volume diff --git a/ChatQnA/benchmark/single_gaudi/embedding-microservice_run.yaml b/ChatQnA/benchmark/single_gaudi/embedding-microservice_run.yaml index 2427872ffb..f23ba0b4fa 100644 --- a/ChatQnA/benchmark/single_gaudi/embedding-microservice_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/embedding-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: embedding-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/llm-dependency_run.yaml b/ChatQnA/benchmark/single_gaudi/llm-dependency_run.yaml index fbc0eac7e6..eb49bdfdff 100644 --- a/ChatQnA/benchmark/single_gaudi/llm-dependency_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/llm-dependency_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: llm-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea hostIPC: 
true containers: - envFrom: @@ -33,8 +35,12 @@ spec: - --model-id - $(LLM_MODEL_ID) - --max-input-length - - '2048' + - '1024' - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens - '4096' volumeMounts: - mountPath: /data diff --git a/ChatQnA/benchmark/single_gaudi/llm-microservice_run.yaml b/ChatQnA/benchmark/single_gaudi/llm-microservice_run.yaml index 8afbc34673..1d9e291122 100644 --- a/ChatQnA/benchmark/single_gaudi/llm-microservice_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/llm-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: llm-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/reranking-dependency_run.yaml b/ChatQnA/benchmark/single_gaudi/reranking-dependency_run.yaml index 0fac53c970..3f595ae1e6 100644 --- a/ChatQnA/benchmark/single_gaudi/reranking-dependency_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/reranking-dependency_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: reranking-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/reranking-microservice_run.yaml b/ChatQnA/benchmark/single_gaudi/reranking-microservice_run.yaml index 24fab63fe4..25f6a00b38 100644 --- a/ChatQnA/benchmark/single_gaudi/reranking-microservice_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/reranking-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: reranking-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/retrieval-microservice_run.yaml b/ChatQnA/benchmark/single_gaudi/retrieval-microservice_run.yaml index e16505fcc6..40040ee5bd 100644 --- a/ChatQnA/benchmark/single_gaudi/retrieval-microservice_run.yaml +++ 
b/ChatQnA/benchmark/single_gaudi/retrieval-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: retriever-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/single_gaudi/vector-db_run.yaml b/ChatQnA/benchmark/single_gaudi/vector-db_run.yaml index 704d79d32b..e04e8c5fe7 100644 --- a/ChatQnA/benchmark/single_gaudi/vector-db_run.yaml +++ b/ChatQnA/benchmark/single_gaudi/vector-db_run.yaml @@ -15,6 +15,8 @@ spec: labels: app: vector-db spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/chatqna_config_map.yaml b/ChatQnA/benchmark/two_gaudi/chatqna_config_map.yaml index 5449307edd..24b8e72df0 100644 --- a/ChatQnA/benchmark/two_gaudi/chatqna_config_map.yaml +++ b/ChatQnA/benchmark/two_gaudi/chatqna_config_map.yaml @@ -15,8 +15,9 @@ data: TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 INDEX_NAME: rag-redis - HUGGINGFACEHUB_API_TOKEN: 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 + HUGGINGFACEHUB_API_TOKEN: '{HF_TOKEN}' EMBEDDING_SERVICE_HOST_IP: embedding-svc RETRIEVER_SERVICE_HOST_IP: retriever-svc RERANK_SERVICE_HOST_IP: reranking-svc + NODE_SELECTOR: chatqna-opea LLM_SERVICE_HOST_IP: llm-svc diff --git a/ChatQnA/benchmark/two_gaudi/chatqna_mega_service_run.yaml b/ChatQnA/benchmark/two_gaudi/chatqna_mega_service_run.yaml index e255b17e7f..6e93eb867f 100644 --- a/ChatQnA/benchmark/two_gaudi/chatqna_mega_service_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/chatqna_mega_service_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: chatqna-backend-server-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/dataprep-microservice_run.yaml
b/ChatQnA/benchmark/two_gaudi/dataprep-microservice_run.yaml index a9542d79de..14fae684c0 100644 --- a/ChatQnA/benchmark/two_gaudi/dataprep-microservice_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/dataprep-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: dataprep-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname @@ -44,6 +46,8 @@ spec: args: null ports: - containerPort: 6007 + - containerPort: 6008 + - containerPort: 6009 serviceAccountName: default --- kind: Service @@ -58,3 +62,9 @@ spec: - name: port1 port: 6007 targetPort: 6007 + - name: port2 + port: 6008 + targetPort: 6008 + - name: port3 + port: 6009 + targetPort: 6009 diff --git a/ChatQnA/benchmark/two_gaudi/embedding-dependency_run.yaml b/ChatQnA/benchmark/two_gaudi/embedding-dependency_run.yaml index f8e8bbed3d..89d40715e6 100644 --- a/ChatQnA/benchmark/two_gaudi/embedding-dependency_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/embedding-dependency_run.yaml @@ -7,7 +7,7 @@ metadata: name: embedding-dependency-deploy namespace: default spec: - replicas: 10 + replicas: 3 selector: matchLabels: app: embedding-dependency-deploy @@ -18,11 +18,13 @@ spec: labels: app: embedding-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea containers: - envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 name: embedding-dependency-deploy args: - --model-id @@ -37,11 +39,11 @@ spec: - containerPort: 80 resources: limits: - cpu: 24 - memory: 4000Mi + cpu: 80 + memory: 20000Mi requests: - cpu: 24 - memory: 4000Mi + cpu: 80 + memory: 20000Mi serviceAccountName: default volumes: - name: model-volume diff --git a/ChatQnA/benchmark/two_gaudi/embedding-microservice_run.yaml b/ChatQnA/benchmark/two_gaudi/embedding-microservice_run.yaml index 2427872ffb..f23ba0b4fa 100644 --- 
a/ChatQnA/benchmark/two_gaudi/embedding-microservice_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/embedding-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: embedding-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/llm-dependency_run.yaml b/ChatQnA/benchmark/two_gaudi/llm-dependency_run.yaml index 409a151b8c..6191a9522b 100644 --- a/ChatQnA/benchmark/two_gaudi/llm-dependency_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/llm-dependency_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: llm-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea hostIPC: true containers: - envFrom: @@ -33,8 +35,12 @@ spec: - --model-id - $(LLM_MODEL_ID) - --max-input-length - - '2048' + - '1024' - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens - '4096' volumeMounts: - mountPath: /data diff --git a/ChatQnA/benchmark/two_gaudi/llm-microservice_run.yaml b/ChatQnA/benchmark/two_gaudi/llm-microservice_run.yaml index 8afbc34673..1d9e291122 100644 --- a/ChatQnA/benchmark/two_gaudi/llm-microservice_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/llm-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: llm-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/reranking-dependency_run.yaml b/ChatQnA/benchmark/two_gaudi/reranking-dependency_run.yaml index 0fac53c970..3f595ae1e6 100644 --- a/ChatQnA/benchmark/two_gaudi/reranking-dependency_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/reranking-dependency_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: reranking-dependency-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/reranking-microservice_run.yaml 
b/ChatQnA/benchmark/two_gaudi/reranking-microservice_run.yaml index 24fab63fe4..25f6a00b38 100644 --- a/ChatQnA/benchmark/two_gaudi/reranking-microservice_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/reranking-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: reranking-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/retrieval-microservice_run.yaml b/ChatQnA/benchmark/two_gaudi/retrieval-microservice_run.yaml index e16505fcc6..40040ee5bd 100644 --- a/ChatQnA/benchmark/two_gaudi/retrieval-microservice_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/retrieval-microservice_run.yaml @@ -18,6 +18,8 @@ spec: labels: app: retriever-deploy spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/ChatQnA/benchmark/two_gaudi/vector-db_run.yaml b/ChatQnA/benchmark/two_gaudi/vector-db_run.yaml index 704d79d32b..e04e8c5fe7 100644 --- a/ChatQnA/benchmark/two_gaudi/vector-db_run.yaml +++ b/ChatQnA/benchmark/two_gaudi/vector-db_run.yaml @@ -15,6 +15,8 @@ spec: labels: app: vector-db spec: + nodeSelector: + node-type: chatqna-opea topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname