Add HPA support to embedding, reranking, tgi services
Signed-off-by: Alexey Fomenko <[email protected]>
byako committed Aug 21, 2024
1 parent b1182c4 commit 939976d
Showing 17 changed files with 501 additions and 11 deletions.
53 changes: 53 additions & 0 deletions helm-charts/chatqna/templates/customMetrics.yaml
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
apiVersion: v1
data:
  config.yaml: |
    rules:
    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
      # Average request latency from TGI histograms, over 1 min
      # (the 0.001 added to the divider ensures there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "tgi_request_latency"
      resources:
        # HPA needs both namespace + a suitable object resource for its query paths:
        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
        # (pod is not a suitable object type for matching, as each instance has a different name)
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "reranking_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "embedding_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
{{- end }}
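These adapter rules turn the TGI/TEI histogram counters into per-service average-latency metrics. As a sanity check, the same PromQL can be run directly against Prometheus before relying on the adapter; a sketch, assuming the kube-prometheus `prometheus-k8s` service in the `monitoring` namespace (the same endpoint the chart READMEs below use):

```console
prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
curl --no-progress-meter "$prom_url/api/v1/query" --data-urlencode \
  'query=rate(tgi_request_inference_duration_sum[1m]) / (0.001+rate(tgi_request_inference_duration_count[1m]))'
```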
8 changes: 8 additions & 0 deletions helm-charts/chatqna/values.yaml
@@ -7,6 +7,14 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite the existing PrometheusAdapter "adapter-config" configMap with ChatQnA-specific custom metric queries
#   for the embedding, reranking and tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
  enabled: false

image:
  repository: opea/chatqna
  pullPolicy: IfNotPresent
68 changes: 63 additions & 5 deletions helm-charts/common/embedding-usvc/README.md
@@ -27,6 +27,34 @@ helm dependency update
helm install embedding-usvc . --set autodependency.enabled=true
```

## HorizontalPodAutoscaler (HPA) support

The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
it should be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
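
For example, a minimal monitoring stack install could look like this (a sketch; the release name is an arbitrary choice, and note that the helm chart names its services differently than the kube-prometheus manifests assumed in the verification steps below):

```console
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring --create-namespace
```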

### Gotchas

Why HPA is opt-in:
* Enabling the chart's `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
  `PrometheusAdapter` configuration with its own custom metrics configuration.
  Take a copy of the existing one before install, if that matters:
  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
  asked to install OPEA services to some other namespace, those rules need to be updated accordingly.
* The provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
  (underlying HW, used models, OPEA version, etc.).
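
With those caveats in mind, enabling HPA is a single chart option; a minimal sketch, assuming the `embedding-usvc` release name used in the install example above:

```console
helm install embedding-usvc . --set horizontalPodAutoscaler.enabled=true
```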

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -42,10 +70,40 @@ curl http://localhost:6000/v1/embeddings \
-H 'Content-Type: application/json'
```

### Verify HPA metrics

To verify that the metrics required by the `horizontalPodAutoscaler` option work, check that:

Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:

```console
prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*embedding
```

The Prometheus adapter provides custom metrics for that data:

```console
kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
```

And those custom metrics have valid values for HPA rules:

```console
ns=default; # OPEA namespace
url=/apis/custom.metrics.k8s.io/v1beta1;
for m in $(kubectl get --raw $url | jq .resources[].name | tr -d '"' | grep namespaces | sed "s%/%/${ns}/metrics/%"); do
kubectl get --raw $url/$m | jq;
done | grep -e metricName -e value
```

NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they've processed their first request!
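
Once the metrics flow, the autoscaler status itself can be watched with standard kubectl; the HPA object name below is an assumption based on the chart fullname from the install example:

```console
kubectl get hpa
kubectl describe hpa embedding-usvc
```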

## Values

| Key | Type | Default | Description |
| ---------------------- | ------ | ---------------------- | ----------- |
| image.repository | string | `"opea/embedding-tei"` | |
| service.port | string | `"6000"` | |
| TEI_EMBEDDING_ENDPOINT | string | `""` | |
| Key | Type | Default | Description |
| ------------------------------- | ------ | ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| image.repository | string | `"opea/embedding-tei"` | |
| service.port | string | `"6000"` | |
| TEI_EMBEDDING_ENDPOINT | string | `""` | |
| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! |
7 changes: 7 additions & 0 deletions helm-charts/common/embedding-usvc/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
  labels:
    {{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
  # use explicit replica count only if HorizontalPodAutoscaler is disabled
  {{- if not .Values.horizontalPodAutoscaler.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
@@ -77,3 +80,7 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if .Values.horizontalPodAutoscaler.enabled }}
      # extra time to finish processing buffered requests before HPA forcibly terminates pod
      terminationGracePeriodSeconds: 60
      {{- end }}
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
  - type: Object
    object:
      metric:
        # tei-embedding time metrics are in seconds
        name: embedding_request_latency
      describedObject:
        apiVersion: v1
        # get metric for named object of given type (in same namespace)
        kind: Service
        name: tei-embedding-svc
      target:
        # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
        # TEI startup + request processing takes longer than HPA evaluation period, this uses
        # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
        type: Value
        value: 4
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
      - type: Percent
        value: 25
        periodSeconds: 15
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
      - type: Percent
        value: 50
        periodSeconds: 15
      - type: Pods
        value: 2
        periodSeconds: 15
{{- end }}
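As a worked example of the `Value` algorithm with these settings: if the measured average latency (`embedding_request_latency`) were 9 seconds against the target of 4, the HPA would compute `desiredReplicas = ceil(9 / 4) = 3`, capped by `maxReplicas` (assumed numbers, for illustration only).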
17 changes: 17 additions & 0 deletions helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "embedding-usvc.fullname" . }}
  endpoints:
  - interval: 4s
    port: service
    scheme: http
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/common/embedding-usvc/values.yaml
@@ -10,6 +10,14 @@ autodependency:

replicaCount: 1

# Enabling HPA will:
# - Ignore the above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require the custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
  enabled: false
  maxReplicas: 2

TEI_EMBEDDING_ENDPOINT: ""
image:
  repository: opea/embedding-tei
70 changes: 64 additions & 6 deletions helm-charts/common/teirerank/README.md
@@ -21,6 +21,34 @@ MODELDIR=/mnt/opea-models

MODELNAME="/data/BAAI/bge-reranker-base"

## HorizontalPodAutoscaler (HPA) support

The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
it should be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
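
A quick way to check whether the operator is already present (standard kubectl; the CRD name comes from prometheus-operator):

```console
kubectl get crd prometheuses.monitoring.coreos.com
kubectl -n monitoring get pods
```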

### Gotchas

Why HPA is opt-in:
* Enabling the chart's `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
  `PrometheusAdapter` configuration with its own custom metrics configuration.
  Take a copy of the existing one before install, if that matters:
  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
  asked to install OPEA services to some other namespace, those rules need to be updated accordingly.
* The provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
  (underlying HW, used models, OPEA version, etc.).
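
After accounting for the caveats above, a minimal sketch of enabling HPA for this chart, assuming it exposes the same `horizontalPodAutoscaler.maxReplicas` knob as the embedding chart:

```console
helm install teirerank . --set horizontalPodAutoscaler.enabled=true --set horizontalPodAutoscaler.maxReplicas=3
```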

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -36,11 +64,41 @@ curl http://localhost:2082/rerank \
-H 'Content-Type: application/json'
```

### Verify HPA metrics

To verify that the metrics required by the `horizontalPodAutoscaler` option work, check that:

Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:

```console
prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank
```

The Prometheus adapter provides custom metrics for that data:

```console
kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
```

And those custom metrics have valid values for HPA rules:

```console
ns=default; # OPEA namespace
url=/apis/custom.metrics.k8s.io/v1beta1;
for m in $(kubectl get --raw $url | jq .resources[].name | tr -d '"' | grep namespaces | sed "s%/%/${ns}/metrics/%"); do
kubectl get --raw $url/$m | jq;
done | grep -e metricName -e value
```

NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they've processed their first request!
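
Since the metrics appear only after the first request, a warm-up request can be sent to make them show up; a sketch, with a payload shaped after TEI's `/rerank` API (exact fields depend on the TEI version):

```console
curl http://localhost:2082/rerank \
    -X POST \
    -d '{"query":"What is Deep Learning?", "texts":["Deep Learning is not...", "Deep learning is..."]}' \
    -H 'Content-Type: application/json'
```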

## Values

| Key | Type | Default | Description |
| ----------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory |
| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
| image.tag | string | `"cpu-1.5"` | |
| Key | Type | Default | Description |
| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory |
| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
| image.tag | string | `"cpu-1.5"` | |
| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! |
7 changes: 7 additions & 0 deletions helm-charts/common/teirerank/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
  labels:
    {{- include "teirerank.labels" . | nindent 4 }}
spec:
  # use explicit replica count only if HorizontalPodAutoscaler is disabled
  {{- if not .Values.horizontalPodAutoscaler.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "teirerank.selectorLabels" . | nindent 6 }}
@@ -102,3 +105,7 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if .Values.horizontalPodAutoscaler.enabled }}
      # extra time to finish processing buffered requests before HPA forcibly terminates pod
      terminationGracePeriodSeconds: 60
      {{- end }}