From 8b9cfdefb2f028b52699251bf9129aa80e32d664 Mon Sep 17 00:00:00 2001 From: Alexey Fomenko Date: Tue, 20 Aug 2024 19:44:17 +0300 Subject: [PATCH] Add HPA support to embedding, reranking, tgi services Add optional HorizontalPodAutoscaler and ServiceMonitor templates to the embedding-usvc, reranking-usvc and tgi charts, and a PrometheusAdapter "adapter-config" ConfigMap with the matching custom metric rules to the chatqna chart. Everything is gated by horizontalPodAutoscaler.enabled, which defaults to false. Signed-off-by: Alexey Fomenko --- .../chatqna/templates/customMetrics.yaml | 53 +++++++++++++++++++ helm-charts/chatqna/values.yaml | 8 +++ .../embedding-usvc/templates/_helpers.tpl | 1 + .../embedding-usvc/templates/deployment.yaml | 7 +++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++++++ .../templates/servicemonitor.yaml | 17 ++++++ helm-charts/common/embedding-usvc/values.yaml | 8 +++ .../reranking-usvc/templates/_helpers.tpl | 1 + .../reranking-usvc/templates/deployment.yaml | 7 +++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++++++ .../templates/servicemonitor.yaml | 17 ++++++ helm-charts/common/reranking-usvc/values.yaml | 8 +++ .../common/tgi/templates/servicemonitor.yaml | 22 ++++++++ helm-charts/common/tgi/templates/_helpers.tpl | 1 + .../common/tgi/templates/deployment.yaml | 7 +++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++++++ helm-charts/common/tgi/values.yaml | 8 +++ 17 files changed, 318 insertions(+) create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml create mode 100644 helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/embedding-usvc/templates/servicemonitor.yaml create mode 100644 helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/reranking-usvc/templates/servicemonitor.yaml create mode 100644 helm-charts/common/tgi/templates/servicemonitor.yaml create mode 100644 helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml new file mode 100644 index 000000000..e4dacbdf1 --- /dev/null +++ b/helm-charts/chatqna/templates/customMetrics.yaml @@ -0,0 +1,53 @@ +# 
Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: v1 +data: + config.yaml: | + rules: + - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}' + # Average request latency from TGI histograms, over 1 min + # (0.001 is added to the divisor to make sure there's always a valid value) + metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))' + name: + matches: ^tgi_request_inference_duration_sum + as: "tgi_request_latency" + resources: + # HPA needs both namespace + suitable object resource for its query paths: + # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency + # (pod is not suitable object type for matching as each instance has different name) + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + 
namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +{{- end }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index f848b209e..b062d6c03 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -7,6 +7,14 @@ replicaCount: 1 +# Enabling HPA will: +# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries +# for embedding, reranking, tgi services +# Upstream default configMap: +# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml +horizontalPodAutoscaler: + enabled: false + image: repository: opea/chatqna pullPolicy: IfNotPresent diff --git a/helm-charts/common/embedding-usvc/templates/_helpers.tpl b/helm-charts/common/embedding-usvc/templates/_helpers.tpl index 229f1a5b5..31a4095d7 100644 --- a/helm-charts/common/embedding-usvc/templates/_helpers.tpl +++ b/helm-charts/common/embedding-usvc/templates/_helpers.tpl @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} app.kubernetes.io/managed-by: {{ .Release.Service }} +svc: {{ include "embedding-usvc.fullname" . }} {{- end }} {{/* diff --git a/helm-charts/common/embedding-usvc/templates/deployment.yaml b/helm-charts/common/embedding-usvc/templates/deployment.yaml index 26f5a76fa..6c2f013ca 100644 --- a/helm-charts/common/embedding-usvc/templates/deployment.yaml +++ b/helm-charts/common/embedding-usvc/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "embedding-usvc.labels" . | nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "embedding-usvc.selectorLabels" . 
| nindent 6 }} @@ -62,6 +65,10 @@ spec: {{- end }} resources: {{- toYaml .Values.resources | nindent 12 }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} volumes: - name: tmp emptyDir: {} diff --git a/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..62089e190 --- /dev/null +++ b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "embedding-usvc.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "embedding-usvc.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-embedding time metrics are in seconds + name: embedding_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tei-embedding-svc + target: + # embedding_request_latency is average for all TEI pods. 
To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml new file mode 100644 index 000000000..0d5e6e776 --- /dev/null +++ b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "embedding-usvc.fullname" . }} +spec: + selector: + matchLabels: + svc: {{ include "embedding-usvc.fullname" . 
}} + endpoints: + - interval: 4s + port: service + scheme: http +{{- end }} diff --git a/helm-charts/common/embedding-usvc/values.yaml b/helm-charts/common/embedding-usvc/values.yaml index f3b1f9e89..8f766d0c9 100644 --- a/helm-charts/common/embedding-usvc/values.yaml +++ b/helm-charts/common/embedding-usvc/values.yaml @@ -10,6 +10,14 @@ autodependency: replicaCount: 1 +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 2 + TEI_EMBEDDING_ENDPOINT: "" image: repository: opea/embedding-tei diff --git a/helm-charts/common/reranking-usvc/templates/_helpers.tpl b/helm-charts/common/reranking-usvc/templates/_helpers.tpl index 9247fe13b..1970e7cbd 100644 --- a/helm-charts/common/reranking-usvc/templates/_helpers.tpl +++ b/helm-charts/common/reranking-usvc/templates/_helpers.tpl @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "reranking-usvc.chart" . }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} app.kubernetes.io/managed-by: {{ .Release.Service }} +svc: {{ include "reranking-usvc.fullname" . }} {{- end }} {{/* diff --git a/helm-charts/common/reranking-usvc/templates/deployment.yaml b/helm-charts/common/reranking-usvc/templates/deployment.yaml index 9fc05bc0c..c7bbbbaff 100644 --- a/helm-charts/common/reranking-usvc/templates/deployment.yaml +++ b/helm-charts/common/reranking-usvc/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "reranking-usvc.labels" . | nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "reranking-usvc.selectorLabels" . 
| nindent 6 }} @@ -62,6 +65,10 @@ spec: {{- end }} resources: {{- toYaml .Values.resources | nindent 12 }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} volumes: - name: tmp emptyDir: {} diff --git a/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..72db38086 --- /dev/null +++ b/helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "reranking-usvc.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "reranking-usvc.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-reranking time metrics are in seconds + name: reranking_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tei-reranking-svc + target: + # reranking_request_latency is average for all TEI pods. 
To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml b/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml new file mode 100644 index 000000000..9462d9664 --- /dev/null +++ b/helm-charts/common/reranking-usvc/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "reranking-usvc.fullname" . }} +spec: + selector: + matchLabels: + svc: {{ include "reranking-usvc.fullname" . 
}} + endpoints: + - interval: 4s + port: service + scheme: http +{{- end }} diff --git a/helm-charts/common/reranking-usvc/values.yaml b/helm-charts/common/reranking-usvc/values.yaml index c011cf309..3bbc07dab 100644 --- a/helm-charts/common/reranking-usvc/values.yaml +++ b/helm-charts/common/reranking-usvc/values.yaml @@ -10,6 +10,14 @@ autodependency: replicaCount: 1 +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 3 + TEI_RERANKING_ENDPOINT: "" image: repository: opea/reranking-tei diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml new file mode 100644 index 000000000..6f96aff89 --- /dev/null +++ b/helm-charts/common/tgi/templates/servicemonitor.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "tgi.fullname" . }} +spec: + selector: + matchLabels: + svc: {{ include "tgi.fullname" . }} + endpoints: + - interval: 4s + port: service + scheme: http +{{- end }} diff --git a/helm-charts/common/tgi/templates/_helpers.tpl b/helm-charts/common/tgi/templates/_helpers.tpl index 6e98919c1..0a95d784b 100644 --- a/helm-charts/common/tgi/templates/_helpers.tpl +++ b/helm-charts/common/tgi/templates/_helpers.tpl @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "tgi.chart" . 
}} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} app.kubernetes.io/managed-by: {{ .Release.Service }} +svc: {{ include "tgi.fullname" . }} {{- end }} {{/* diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml index 2ef224b59..1c00a4f37 100644 --- a/helm-charts/common/tgi/templates/deployment.yaml +++ b/helm-charts/common/tgi/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "tgi.labels" . | nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "tgi.selectorLabels" . | nindent 6 }} @@ -68,6 +71,10 @@ spec: {{- end }} resources: {{- toYaml .Values.resources | nindent 12 }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 120 + {{- end }} volumes: - name: model-volume {{- if .Values.global.modelUsePVC }} diff --git a/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..bae813e16 --- /dev/null +++ b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "tgi.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "tgi.fullname" . 
}} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # TGI time metrics are in seconds + name: tgi_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tgi-svc + target: + # tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when + # TGI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml index dff877f5b..2716522f0 100644 --- a/helm-charts/common/tgi/values.yaml +++ b/helm-charts/common/tgi/values.yaml @@ -7,6 +7,14 @@ replicaCount: 1 +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 6 + port: 2080 image: