Add HPA support to embedding, reranking, tgi services

Signed-off-by: Alexey Fomenko <[email protected]>
opea-project · Aug 21, 2024 · 0bebde7 · 0bebde7
1 parent b1182c4
commit 0bebde7
Show file tree

Hide file tree

Showing 17 changed files with 313 additions and 0 deletions.
diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: v1
+data:
+  config.yaml: |
+      rules:
+      - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
+        # Average request latency from TGI histograms, over 1 min
+        # (0.001 divider add is to make sure there's always a valid value)
+        metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^tgi_request_inference_duration_sum
+          as: "tgi_request_latency"
+        resources:
+          # HPA needs both namespace + suitable object resource for its query paths:
+          # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+          # (pod is not suitable object type for matching as each instance has different name)
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
+        # Average request latency from TEI histograms, over 1 min
+        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^te_request_inference_duration_sum
+          as: "reranking_request_latency"
+        resources:
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
+        # Average request latency from TEI histograms, over 1 min
+        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^te_request_inference_duration_sum
+          as: "embedding_request_latency"
+        resources:
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+kind: ConfigMap
+metadata:
+  name: adapter-config  # same name as the PrometheusAdapter ConfigMap this is meant to overwrite
+  namespace: monitoring  # fixed namespace; not derived from the Helm release namespace
+{{- end }}
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
@@ -7,6 +7,14 @@
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
+#   for embedding, reranking, tgi services
+# Upstream default configMap:
+#  - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
+horizontalPodAutoscaler:
+  enabled: false
+
 image:
   repository: opea/chatqna
   pullPolicy: IfNotPresent

diff --git a/helm-charts/common/embedding-usvc/templates/_helpers.tpl b/helm-charts/common/embedding-usvc/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "embedding-usvc.fullname" . }}
 {{- end }}
 
 {{/*

diff --git a/helm-charts/common/embedding-usvc/templates/deployment.yaml b/helm-charts/common/embedding-usvc/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "embedding-usvc.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
@@ -62,6 +65,9 @@ spec:
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+      {{- if .Values.horizontalPodAutoscaler.enabled }}
+      terminationGracePeriodSeconds: 60  # PodSpec field (not a container field): same level as "volumes"
+      {{- end }}
       volumes:
         - name: tmp
           emptyDir: {}

diff --git a/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,50 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "embedding-usvc.fullname" . }}
+spec:
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "embedding-usvc.fullname" . }}
+  metrics:
+    - type: Object
+      object:
+        # Metric is looked up on the named Service object, in the HPA's own namespace.
+        describedObject:
+          apiVersion: v1
+          kind: Service
+          name: tei-embedding-svc
+        metric:
+          # tei-embedding request latency, expressed in seconds.
+          name: embedding_request_latency
+        target:
+          # The metric is already an average over all the TEI pods, hence a plain
+          # "Value" target instead of an "AverageValue" one.
+          # 4s is the maximum tei-embedding wait time targeted on ICL Xeon.
+          type: Value
+          value: 4
+  behavior:
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 15
+        - type: Pods
+          value: 2
+          periodSeconds: 15
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+        - type: Percent
+          value: 25
+          periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "embedding-usvc.fullname" . }}
+spec:
+  endpoints:
+    - port: service
+      scheme: http
+      interval: 4s
+  selector:
+    matchLabels:
+      svc: {{ include "embedding-usvc.fullname" . }}  # "svc" label is added in this chart's _helpers.tpl
+{{- end }}
diff --git a/helm-charts/common/embedding-usvc/values.yaml b/helm-charts/common/embedding-usvc/values.yaml
@@ -10,6 +10,14 @@ autodependency:
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore above replica count, as it will be controlled by HPA
+# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Require custom metrics ConfigMap available in the main application chart
+horizontalPodAutoscaler:
+  enabled: false
+  maxReplicas: 2
+
 TEI_EMBEDDING_ENDPOINT: ""
 image:
   repository: opea/embedding-tei

diff --git a/helm-charts/common/reranking-usvc/templates/_helpers.tpl b/helm-charts/common/reranking-usvc/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "reranking-usvc.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "reranking-usvc.fullname" . }}
 {{- end }}
 
 {{/*

diff --git a/helm-charts/common/reranking-usvc/templates/deployment.yaml b/helm-charts/common/reranking-usvc/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "reranking-usvc.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "reranking-usvc.selectorLabels" . | nindent 6 }}
@@ -62,6 +65,9 @@ spec:
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+      {{- if .Values.horizontalPodAutoscaler.enabled }}
+      terminationGracePeriodSeconds: 60  # PodSpec field (not a container field): same level as "volumes"
+      {{- end }}
       volumes:
         - name: tmp
           emptyDir: {}

diff --git a/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,50 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "reranking-usvc.fullname" . }}
+spec:
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "reranking-usvc.fullname" . }}
+  metrics:
+    - type: Object
+      object:
+        # Metric is looked up on the named Service object, in the HPA's own namespace.
+        describedObject:
+          apiVersion: v1
+          kind: Service
+          name: tei-reranking-svc
+        metric:
+          # tei-reranking request latency, expressed in seconds.
+          name: reranking_request_latency
+        target:
+          # The metric is already an average over all the TEI pods, hence a plain
+          # "Value" target instead of an "AverageValue" one.
+          # 4s is the maximum tei-reranking wait time targeted on ICL Xeon.
+          type: Value
+          value: 4
+  behavior:
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 15
+        - type: Pods
+          value: 2
+          periodSeconds: 15
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+        - type: Percent
+          value: 25
+          periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml b/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "reranking-usvc.fullname" . }}
+spec:
+  endpoints:
+    - port: service
+      scheme: http
+      interval: 4s
+  selector:
+    matchLabels:
+      svc: {{ include "reranking-usvc.fullname" . }}  # "svc" label is added in this chart's _helpers.tpl
+{{- end }}
diff --git a/helm-charts/common/reranking-usvc/values.yaml b/helm-charts/common/reranking-usvc/values.yaml
@@ -10,6 +10,14 @@ autodependency:
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore above replica count, as it will be controlled by HPA
+# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Require custom metrics ConfigMap available in the main application chart
+horizontalPodAutoscaler:
+  enabled: false
+  maxReplicas: 3
+
 TEI_RERANKING_ENDPOINT: ""
 image:
   repository: opea/reranking-tei

diff --git a/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tgi.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: tgi-svc
+      target:
+        # tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+        #  https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      svc: {{ include "tgi.fullname" . }}  # "svc" label is added in this chart's templates/_helpers.tpl
+  endpoints:
+  - interval: 4s
+    port: service
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tgi/templates/_helpers.tpl b/helm-charts/common/tgi/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "tgi.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "tgi.fullname" . }}
 {{- end }}
 
 {{/*