From 8b9cfdefb2f028b52699251bf9129aa80e32d664 Mon Sep 17 00:00:00 2001
From: Alexey Fomenko <alexey.fomenko@intel.com>
Date: Tue, 20 Aug 2024 19:44:17 +0300
Subject: [PATCH] Add HPA support to embedding, reranking, tgi services

Signed-off-by: Alexey Fomenko <alexey.fomenko@intel.com>
---
 .../chatqna/templates/customMetrics.yaml      | 53 +++++++++++++++++++
 helm-charts/chatqna/values.yaml               |  8 +++
 .../embedding-usvc/templates/_helpers.tpl     |  1 +
 .../embedding-usvc/templates/deployment.yaml  |  7 +++
 .../templates/horizontalPodAutoscaler.yaml    | 51 ++++++++++++++++++
 .../templates/servicemonitor.yaml             | 17 ++++++
 helm-charts/common/embedding-usvc/values.yaml |  8 +++
 .../reranking-usvc/templates/_helpers.tpl     |  1 +
 .../reranking-usvc/templates/deployment.yaml  |  7 +++
 .../templates/horizontalPodAutoscaler.yaml    | 51 ++++++++++++++++++
 .../templates/servicemonitor.yaml             | 17 ++++++
 helm-charts/common/reranking-usvc/values.yaml |  8 +++
 helm-charts/common/tgi/servicemonitor.yaml    | 22 ++++++++
 helm-charts/common/tgi/templates/_helpers.tpl |  1 +
 .../common/tgi/templates/deployment.yaml      |  7 +++
 .../templates/horizontalPorAutoscaler.yaml    | 51 ++++++++++++++++++
 helm-charts/common/tgi/values.yaml            |  8 +++
 17 files changed, 318 insertions(+)
 create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml
 create mode 100644 helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/tgi/servicemonitor.yaml
 create mode 100644 helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml

diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml
new file mode 100644
index 000000000..e4dacbdf1
--- /dev/null
+++ b/helm-charts/chatqna/templates/customMetrics.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: v1
+data:  # single key: the PrometheusAdapter rules file with the TGI/TEI latency metrics used by the HPA objects
+  config.yaml: |
+      rules:
+      - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
+        # Average request latency from TGI histograms, over 1 min
+        # (0.001 divider add is to make sure there's always a valid value)
+        metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^tgi_request_inference_duration_sum
+          as: "tgi_request_latency"
+        resources:
+          # HPA needs both namespace + suitable object resource for its query paths:
+          # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+          # (pod is not suitable object type for matching as each instance has different name)
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
+        # Average request latency from TEI histograms, over 1 min
+        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^te_request_inference_duration_sum
+          as: "reranking_request_latency"
+        resources:
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
+        # Average request latency from TEI histograms, over 1 min
+        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
+        name:
+          matches: ^te_request_inference_duration_sum
+          as: "embedding_request_latency"
+        resources:
+          overrides:
+            namespace:
+              resource: namespace
+            service:
+              resource: service
+kind: ConfigMap
+metadata:
+  name: adapter-config  # same name as the PrometheusAdapter rules ConfigMap, which this one overwrites
+  namespace: monitoring  # hard-coded; presumably the namespace PrometheusAdapter runs in - verify for your cluster
+{{- end }}
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
index f848b209e..b062d6c03 100644
--- a/helm-charts/chatqna/values.yaml
+++ b/helm-charts/chatqna/values.yaml
@@ -7,6 +7,14 @@
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Overwrite the existing PrometheusAdapter "adapter-config" ConfigMap (in the "monitoring" namespace)
+#   with ChatQnA-specific custom metric queries for the embedding, reranking and TGI services
+# Upstream default ConfigMap:
+# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
+horizontalPodAutoscaler:
+  enabled: false
+
 image:
   repository: opea/chatqna
   pullPolicy: IfNotPresent
diff --git a/helm-charts/common/embedding-usvc/templates/_helpers.tpl b/helm-charts/common/embedding-usvc/templates/_helpers.tpl
index 229f1a5b5..31a4095d7 100644
--- a/helm-charts/common/embedding-usvc/templates/_helpers.tpl
+++ b/helm-charts/common/embedding-usvc/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "embedding-usvc.fullname" . }}  # label the chart's ServiceMonitor selects on (matchLabels.svc)
 {{- end }}
 
 {{/*
diff --git a/helm-charts/common/embedding-usvc/templates/deployment.yaml b/helm-charts/common/embedding-usvc/templates/deployment.yaml
index 26f5a76fa..6c2f013ca 100644
--- a/helm-charts/common/embedding-usvc/templates/deployment.yaml
+++ b/helm-charts/common/embedding-usvc/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "embedding-usvc.labels" . | nindent 4 }}
 spec:
+  # use an explicit replica count only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
@@ -62,6 +65,10 @@ spec:
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+      {{- if .Values.horizontalPodAutoscaler.enabled }}
+      # pod-level field (not a container field): extra time to finish buffered requests before a scaled-down pod is killed
+      terminationGracePeriodSeconds: 60
+      {{- end }}
       volumes:
         - name: tmp
           emptyDir: {}
diff --git a/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 000000000..62089e190
--- /dev/null
+++ b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "embedding-usvc.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "embedding-usvc.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # custom metric served by PrometheusAdapter; tei-embedding time metrics are in seconds
+        name: embedding_request_latency
+      describedObject:
+        apiVersion: v1
+        # read the metric from the named object of the given kind (in the same namespace)
+        kind: Service
+        name: tei-embedding-svc
+      target:
+        # embedding_request_latency is the average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses the "Value"
+        # target type (replicas = ceil(currentReplicas * metric.value / target.value)) instead of "AverageValue":
+        #  https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4  # target latency, in seconds (the metric's unit)
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180  # act on the highest recommendation of the last 3 minutes
+      policies:
+      - type: Percent
+        value: 25  # remove at most 25% of the current replicas per period
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max  # apply whichever policy below allows the larger increase
+      stabilizationWindowSeconds: 0  # scale up without delay
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
new file mode 100644
index 000000000..0d5e6e776
--- /dev/null
+++ b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "embedding-usvc.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      svc: {{ include "embedding-usvc.fullname" . }}  # label added by the chart's common labels helper
+  endpoints:
+  - interval: 4s  # scrape interval
+    port: service  # name of the Service port to scrape
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/embedding-usvc/values.yaml b/helm-charts/common/embedding-usvc/values.yaml
index f3b1f9e89..8f766d0c9 100644
--- a/helm-charts/common/embedding-usvc/values.yaml
+++ b/helm-charts/common/embedding-usvc/values.yaml
@@ -10,6 +10,14 @@ autodependency:
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore the replica count above, as the number of replicas is then controlled by HPA
+# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Require the custom metrics ConfigMap provided by the main application chart
+horizontalPodAutoscaler:
+  enabled: false
+  maxReplicas: 2  # upper bound for the number of replicas HPA may create
+
 TEI_EMBEDDING_ENDPOINT: ""
 image:
   repository: opea/embedding-tei
diff --git a/helm-charts/common/reranking-usvc/templates/_helpers.tpl b/helm-charts/common/reranking-usvc/templates/_helpers.tpl
index 9247fe13b..1970e7cbd 100644
--- a/helm-charts/common/reranking-usvc/templates/_helpers.tpl
+++ b/helm-charts/common/reranking-usvc/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "reranking-usvc.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "reranking-usvc.fullname" . }}  # label the chart's ServiceMonitor selects on (matchLabels.svc)
 {{- end }}
 
 {{/*
diff --git a/helm-charts/common/reranking-usvc/templates/deployment.yaml b/helm-charts/common/reranking-usvc/templates/deployment.yaml
index 9fc05bc0c..c7bbbbaff 100644
--- a/helm-charts/common/reranking-usvc/templates/deployment.yaml
+++ b/helm-charts/common/reranking-usvc/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "reranking-usvc.labels" . | nindent 4 }}
 spec:
+  # use an explicit replica count only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "reranking-usvc.selectorLabels" . | nindent 6 }}
@@ -62,6 +65,10 @@ spec:
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+      {{- if .Values.horizontalPodAutoscaler.enabled }}
+      # pod-level field (not a container field): extra time to finish buffered requests before a scaled-down pod is killed
+      terminationGracePeriodSeconds: 60
+      {{- end }}
       volumes:
         - name: tmp
           emptyDir: {}
diff --git a/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 000000000..72db38086
--- /dev/null
+++ b/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "reranking-usvc.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "reranking-usvc.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # custom metric served by PrometheusAdapter; tei-reranking time metrics are in seconds
+        name: reranking_request_latency
+      describedObject:
+        apiVersion: v1
+        # read the metric from the named object of the given kind (in the same namespace)
+        kind: Service
+        name: tei-reranking-svc
+      target:
+        # reranking_request_latency is the average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses the "Value"
+        # target type (replicas = ceil(currentReplicas * metric.value / target.value)) instead of "AverageValue":
+        #  https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4  # target latency, in seconds (the metric's unit)
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180  # act on the highest recommendation of the last 3 minutes
+      policies:
+      - type: Percent
+        value: 25  # remove at most 25% of the current replicas per period
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max  # apply whichever policy below allows the larger increase
+      stabilizationWindowSeconds: 0  # scale up without delay
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml b/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
new file mode 100644
index 000000000..9462d9664
--- /dev/null
+++ b/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "reranking-usvc.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      svc: {{ include "reranking-usvc.fullname" . }}  # label added by the chart's common labels helper
+  endpoints:
+  - interval: 4s  # scrape interval
+    port: service  # name of the Service port to scrape
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/reranking-usvc/values.yaml b/helm-charts/common/reranking-usvc/values.yaml
index c011cf309..3bbc07dab 100644
--- a/helm-charts/common/reranking-usvc/values.yaml
+++ b/helm-charts/common/reranking-usvc/values.yaml
@@ -10,6 +10,14 @@ autodependency:
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore the replica count above, as the number of replicas is then controlled by HPA
+# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Require the custom metrics ConfigMap provided by the main application chart
+horizontalPodAutoscaler:
+  enabled: false
+  maxReplicas: 3  # upper bound for the number of replicas HPA may create
+
 TEI_RERANKING_ENDPOINT: ""
 image:
   repository: opea/reranking-tei
diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml
new file mode 100644
index 000000000..6f96aff89
--- /dev/null
+++ b/helm-charts/common/tgi/templates/servicemonitor.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      svc: {{ include "tgi.fullname" . }}  # label added by the chart's common labels helper
+  endpoints:
+  - interval: 4s  # scrape interval
+    port: service  # name of the Service port to scrape
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tgi/templates/_helpers.tpl b/helm-charts/common/tgi/templates/_helpers.tpl
index 6e98919c1..0a95d784b 100644
--- a/helm-charts/common/tgi/templates/_helpers.tpl
+++ b/helm-charts/common/tgi/templates/_helpers.tpl
@@ -40,6 +40,7 @@ helm.sh/chart: {{ include "tgi.chart" . }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
+svc: {{ include "tgi.fullname" . }}  # label the chart's ServiceMonitor selects on (matchLabels.svc)
 {{- end }}
 
 {{/*
diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml
index 2ef224b59..1c00a4f37 100644
--- a/helm-charts/common/tgi/templates/deployment.yaml
+++ b/helm-charts/common/tgi/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "tgi.labels" . | nindent 4 }}
 spec:
+  # use an explicit replica count only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "tgi.selectorLabels" . | nindent 6 }}
@@ -68,6 +71,10 @@ spec:
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+      {{- if .Values.horizontalPodAutoscaler.enabled }}
+      # pod-level field (not a container field): extra time to finish buffered requests before a scaled-down pod is killed
+      terminationGracePeriodSeconds: 120
+      {{- end }}
       volumes:
         - name: model-volume
           {{- if .Values.global.modelUsePVC }}
diff --git a/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml
new file mode 100644
index 000000000..bae813e16
--- /dev/null
+++ b/helm-charts/common/tgi/templates/horizontalPorAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tgi.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # custom metric served by PrometheusAdapter; TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # read the metric from the named object of the given kind (in the same namespace)
+        kind: Service
+        name: tgi-svc
+      target:
+        # tgi_request_latency is the average over all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than the HPA evaluation period, this uses the "Value"
+        # target type (replicas = ceil(currentReplicas * metric.value / target.value)) instead of "AverageValue":
+        #  https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4  # target latency, in seconds (the metric's unit)
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180  # act on the highest recommendation of the last 3 minutes
+      policies:
+      - type: Percent
+        value: 25  # remove at most 25% of the current replicas per period
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max  # apply whichever policy below allows the larger increase
+      stabilizationWindowSeconds: 0  # scale up without delay
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
index dff877f5b..2716522f0 100644
--- a/helm-charts/common/tgi/values.yaml
+++ b/helm-charts/common/tgi/values.yaml
@@ -7,6 +7,14 @@
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore the replica count above, as the number of replicas is then controlled by HPA
+# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Require the custom metrics ConfigMap provided by the main application chart
+horizontalPodAutoscaler:
+  enabled: false  # key must be "enabled": that is what the chart templates test
+  maxReplicas: 6  # upper bound for the number of replicas HPA may create
+
 port: 2080
 
 image: