Align HPU parameters with docker compose (#528)

Signed-off-by: Dolpher Du <[email protected]>
opea-project · Nov 8, 2024 · cb0bd53
1 parent 809347a
commit cb0bd53
Show file tree

Hide file tree

Showing 11 changed files with 149 additions and 60 deletions.
diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-values.yaml
@@ -13,9 +13,16 @@ tgi:
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
+  # higher limits are needed with extra input tokens added by rerank
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -34,22 +41,8 @@ tgi:
 # (i.e. query context docs have been uploaded with data-prep)
 teirerank:
   accelDevice: "gaudi"
-  image:
-    repository: opea/tei-gaudi
-    tag: "latest"
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-  securityContext:
-    readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
-  readinessProbe:
-    timeoutSeconds: 1
-
-# Embedding: Second largest bottleneck without rerank
-tei:
-  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
     repository: ghcr.io/huggingface/tei-gaudi
     tag: 1.5.0
@@ -62,3 +55,22 @@ tei:
     timeoutSeconds: 1
   readinessProbe:
     timeoutSeconds: 1
+
+# Embedding: Second largest bottleneck without rerank
+# By default tei on gaudi is disabled.
+# tei:
+#   accelDevice: "gaudi"
+#   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+#   MAX_WARMUP_SEQUENCE_LENGTH: "512"
+#   image:
+#     repository: ghcr.io/huggingface/tei-gaudi
+#     tag: 1.5.0
+#   resources:
+#     limits:
+#       habana.ai/gaudi: 1
+#   securityContext:
+#     readOnlyRootFilesystem: false
+#   livenessProbe:
+#     timeoutSeconds: 1
+#   readinessProbe:
+#     timeoutSeconds: 1
diff --git a/helm-charts/chatqna/guardrails-gaudi-values.yaml b/helm-charts/chatqna/guardrails-gaudi-values.yaml
@@ -12,26 +12,29 @@ guardrails-usvc:
   SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
 
 # gaudi related config
-tei:
-  accelDevice: "gaudi"
-  image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: 1.5.0
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-  securityContext:
-    readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
-  readinessProbe:
-    timeoutSeconds: 1
+# tei runs on CPU by default; uncomment the block below to run it on Gaudi.
+# tei:
+#   accelDevice: "gaudi"
+#   image:
+#     repository: ghcr.io/huggingface/tei-gaudi
+#     tag: 1.5.0
+#   resources:
+#     limits:
+#       habana.ai/gaudi: 1
+#   securityContext:
+#     readOnlyRootFilesystem: false
+#   livenessProbe:
+#     timeoutSeconds: 1
+#   readinessProbe:
+#     timeoutSeconds: 1
 
 teirerank:
   accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: opea/tei-gaudi
-    tag: "latest"
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: "1.5.0"
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -50,9 +53,15 @@ tgi:
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
+  # higher limits are needed with extra input tokens added by rerank
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -79,6 +88,11 @@ tgi-guardrails:
   MAX_INPUT_LENGTH: "1024"
   MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5

diff --git a/helm-charts/common/tei/gaudi-values.yaml b/helm-charts/common/tei/gaudi-values.yaml
@@ -7,6 +7,8 @@
 
 accelDevice: "gaudi"
 
+OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+MAX_WARMUP_SEQUENCE_LENGTH: "512"
 image:
   repository: ghcr.io/huggingface/tei-gaudi
   tag: 1.5.0

diff --git a/helm-charts/common/tei/templates/configmap.yaml b/helm-charts/common/tei/templates/configmap.yaml
@@ -19,4 +19,16 @@ data:
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}}
   {{- end }}
-  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  {{- if .Values.HF_HUB_DISABLE_PROGRESS_BARS }}
+  HF_HUB_DISABLE_PROGRESS_BARS: {{ .Values.HF_HUB_DISABLE_PROGRESS_BARS | quote }}
+  {{- end }}
+  {{- if .Values.HF_HUB_ENABLE_HF_TRANSFER }}
+  HF_HUB_ENABLE_HF_TRANSFER: {{ .Values.HF_HUB_ENABLE_HF_TRANSFER | quote }}
+  {{- end }}
+  # More options for HPU
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }}
+  {{- end }}
+  {{- if .Values.MAX_WARMUP_SEQUENCE_LENGTH }}
+  MAX_WARMUP_SEQUENCE_LENGTH: {{ .Values.MAX_WARMUP_SEQUENCE_LENGTH | quote }}
+  {{- end }}
diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml
@@ -18,6 +18,9 @@ autoscaling:
 port: 2081
 shmSize: 1Gi
 EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+HF_HUB_DISABLE_PROGRESS_BARS: "1"
+HF_HUB_ENABLE_HF_TRANSFER: "0"
+
 image:
   repository: ghcr.io/huggingface/text-embeddings-inference
   pullPolicy: IfNotPresent

diff --git a/helm-charts/common/teirerank/gaudi-values.yaml b/helm-charts/common/teirerank/gaudi-values.yaml
@@ -7,9 +7,11 @@
 
 accelDevice: "gaudi"
 
+OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+MAX_WARMUP_SEQUENCE_LENGTH: "512"
 image:
-  repository: opea/tei-gaudi
-  tag: "latest"
+  repository: ghcr.io/huggingface/tei-gaudi
+  tag: 1.5.0
 
 securityContext:
   readOnlyRootFilesystem: false

diff --git a/helm-charts/common/teirerank/templates/configmap.yaml b/helm-charts/common/teirerank/templates/configmap.yaml
@@ -19,4 +19,16 @@ data:
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}}
   {{- end }}
-  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  {{- if .Values.HF_HUB_DISABLE_PROGRESS_BARS }}
+  HF_HUB_DISABLE_PROGRESS_BARS: {{ .Values.HF_HUB_DISABLE_PROGRESS_BARS | quote }}
+  {{- end }}
+  {{- if .Values.HF_HUB_ENABLE_HF_TRANSFER }}
+  HF_HUB_ENABLE_HF_TRANSFER: {{ .Values.HF_HUB_ENABLE_HF_TRANSFER | quote }}
+  {{- end }}
+  # More options for HPU
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }}
+  {{- end }}
+  {{- if .Values.MAX_WARMUP_SEQUENCE_LENGTH }}
+  MAX_WARMUP_SEQUENCE_LENGTH: {{ .Values.MAX_WARMUP_SEQUENCE_LENGTH | quote }}
+  {{- end }}
diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml
@@ -18,6 +18,9 @@ autoscaling:
 port: 2082
 shmSize: 1Gi
 RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+HF_HUB_DISABLE_PROGRESS_BARS: "1"
+HF_HUB_ENABLE_HF_TRANSFER: "0"
+
 image:
   repository: ghcr.io/huggingface/text-embeddings-inference
   pullPolicy: IfNotPresent

diff --git a/helm-charts/common/tgi/gaudi-values.yaml b/helm-charts/common/tgi/gaudi-values.yaml
@@ -14,6 +14,11 @@ image:
 MAX_INPUT_LENGTH: "1024"
 MAX_TOTAL_TOKENS: "2048"
 CUDA_GRAPHS: ""
+OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ENABLE_HPU_GRAPH: "true"
+LIMIT_HPU_GRAPH: "true"
+USE_FLASH_ATTENTION: "true"
+FLASH_ATTENTION_RECOMPUTE: "true"
 
 resources:
   limits:

diff --git a/helm-charts/common/tgi/templates/configmap.yaml b/helm-charts/common/tgi/templates/configmap.yaml
@@ -29,3 +29,25 @@ data:
   {{- if .Values.CUDA_GRAPHS }}
   CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }}
   {{- end }}
+  {{- if .Values.HF_HUB_DISABLE_PROGRESS_BARS }}
+  HF_HUB_DISABLE_PROGRESS_BARS: {{ .Values.HF_HUB_DISABLE_PROGRESS_BARS | quote }}
+  {{- end }}
+  {{- if .Values.HF_HUB_ENABLE_HF_TRANSFER }}
+  HF_HUB_ENABLE_HF_TRANSFER: {{ .Values.HF_HUB_ENABLE_HF_TRANSFER | quote }}
+  {{- end }}
+  # More options for HPU
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }}
+  {{- end }}
+  {{- if .Values.ENABLE_HPU_GRAPH }}
+  ENABLE_HPU_GRAPH: {{ .Values.ENABLE_HPU_GRAPH | quote }}
+  {{- end }}
+  {{- if .Values.LIMIT_HPU_GRAPH }}
+  LIMIT_HPU_GRAPH: {{ .Values.LIMIT_HPU_GRAPH | quote }}
+  {{- end }}
+  {{- if .Values.USE_FLASH_ATTENTION }}
+  USE_FLASH_ATTENTION: {{ .Values.USE_FLASH_ATTENTION | quote }}
+  {{- end }}
+  {{- if .Values.FLASH_ATTENTION_RECOMPUTE }}
+  FLASH_ATTENTION_RECOMPUTE: {{ .Values.FLASH_ATTENTION_RECOMPUTE | quote }}
+  {{- end }}
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
@@ -88,26 +88,26 @@ startupProbe:
   periodSeconds: 5
   failureThreshold: 180
   timeoutSeconds: 2
-#livenessProbe:
-#  httpGet:
-#    path: /health
-#    port: http
-#  initialDelaySeconds: 5
-#  periodSeconds: 5
-#  failureThreshold: 24
-#readinessProbe:
-#  httpGet:
-#    path: /health
-#    port: http
-#  initialDelaySeconds: 5
-#  periodSeconds: 5
-#startupProbe:
-#  httpGet:
-#    path: /health
-#    port: http
-#  initialDelaySeconds: 5
-#  periodSeconds: 5
-#  failureThreshold: 120
+# livenessProbe:
+#   httpGet:
+#     path: /health
+#     port: http
+#   initialDelaySeconds: 5
+#   periodSeconds: 5
+#   failureThreshold: 24
+# readinessProbe:
+#   httpGet:
+#     path: /health
+#     port: http
+#   initialDelaySeconds: 5
+#   periodSeconds: 5
+# startupProbe:
+#   httpGet:
+#     path: /health
+#     port: http
+#   initialDelaySeconds: 5
+#   periodSeconds: 5
+#   failureThreshold: 120
 
 nodeSelector: {}
 
@@ -120,6 +120,8 @@ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
 MAX_INPUT_LENGTH: ""
 MAX_TOTAL_TOKENS: ""
 CUDA_GRAPHS: "0"
+HF_HUB_DISABLE_PROGRESS_BARS: "1"
+HF_HUB_ENABLE_HF_TRANSFER: "0"
 
 global:
   http_proxy: ""