Skip to content

Commit

Permalink
Add HPA support to embedding, reranking, tgi services
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <[email protected]>
  • Loading branch information
byako committed Aug 21, 2024
1 parent b1182c4 commit 8b9cfde
Show file tree
Hide file tree
Showing 17 changed files with 318 additions and 0 deletions.
53 changes: 53 additions & 0 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Prometheus adapter configuration with the ChatQnA custom metric rules.
# Rendered only when horizontalPodAutoscaler.enabled is set.
#
# Each rule below turns a "<metric>_sum" / "<metric>_count" histogram pair into
# an average request latency over 1 minute, and exposes it under the name given
# in "as:", associated with the "namespace" and "service" resources.
# Exposed metric names:
#   - tgi_request_latency
#   - reranking_request_latency
#   - embedding_request_latency
#
# NOTE: the content of "config.yaml" is a literal block scalar; comments inside
# it are part of the ConfigMap data, not of this manifest.
apiVersion: v1
data:
  config.yaml: |
    rules:
    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
      # Average request latency from TGI histograms, over 1 min
      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "tgi_request_latency"
      resources:
        # HPA needs both namespace + suitable object resource for its query paths:
        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "reranking_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "embedding_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
kind: ConfigMap
metadata:
  name: adapter-config
  # NOTE(review): namespace is hard-coded; presumably where the Prometheus
  # adapter is deployed - confirm for the target cluster.
  namespace: monitoring
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite the existing PrometheusAdapter "adapter-config" ConfigMap with
#   ChatQnA specific custom metric queries for embedding, reranking, tgi services
# Upstream default ConfigMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
  enabled: false

image:
repository: opea/chatqna
pullPolicy: IfNotPresent
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/embedding-usvc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "embedding-usvc.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/embedding-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -62,6 +65,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: tmp
emptyDir: {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the embedding-usvc Deployment.
# Rendered only when horizontalPodAutoscaler.enabled is set; scales between
# 1 and horizontalPodAutoscaler.maxReplicas replicas based on the
# "embedding_request_latency" custom metric of the "tei-embedding-svc" Service.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-embedding time metrics are in seconds
          name: embedding_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          # NOTE(review): service name is hard-coded rather than templated -
          # confirm it matches the deployed TEI embedding Service name
          name: tei-embedding-svc
        target:
          # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
          # TEI startup + request processing takes longer than HPA evaluation period, this uses
          # "Value" type, where the metric is compared to the target as-is
          # (desiredReplicas = ceil(currentReplicas * metric.value / target.value)),
          # instead of "AverageValue" type, where the metric is first divided by the number of pods:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    scaleDown:
      # act on the highest recommendation of the last 3 minutes,
      # then remove at most 25% of the replicas per 15s period
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # scale up immediately, using whichever policy allows the larger change:
      # +50% of the replicas or +2 pods per 15s period
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Prometheus Operator ServiceMonitor for embedding-usvc.
# Rendered only when horizontalPodAutoscaler.enabled is set.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  selector:
    matchLabels:
      # matches the "svc" label emitted by the chart's labels helper
      # NOTE(review): assumes the Service object includes those labels - confirm
      svc: {{ include "embedding-usvc.fullname" . }}
  endpoints:
    # scrape over plain HTTP every 4 seconds
    # NOTE(review): "service" is presumably the Service's named port - confirm
    - interval: 4s
      port: service
      scheme: http
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/common/embedding-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ autodependency:

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
  enabled: false
  # upper bound for the replica count set by the HPA
  maxReplicas: 2

TEI_EMBEDDING_ENDPOINT: ""
image:
repository: opea/embedding-tei
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/reranking-usvc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "reranking-usvc.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "reranking-usvc.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/reranking-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "reranking-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "reranking-usvc.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -62,6 +65,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: tmp
emptyDir: {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the reranking-usvc Deployment.
# Rendered only when horizontalPodAutoscaler.enabled is set; scales between
# 1 and horizontalPodAutoscaler.maxReplicas replicas based on the
# "reranking_request_latency" custom metric of the "tei-reranking-svc" Service.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "reranking-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "reranking-usvc.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-reranking time metrics are in seconds
          name: reranking_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          # NOTE(review): service name is hard-coded rather than templated -
          # confirm it matches the deployed TEI reranking Service name
          name: tei-reranking-svc
        target:
          # reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
          # TEI startup + request processing takes longer than HPA evaluation period, this uses
          # "Value" type, where the metric is compared to the target as-is
          # (desiredReplicas = ceil(currentReplicas * metric.value / target.value)),
          # instead of "AverageValue" type, where the metric is first divided by the number of pods:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    scaleDown:
      # act on the highest recommendation of the last 3 minutes,
      # then remove at most 25% of the replicas per 15s period
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # scale up immediately, using whichever policy allows the larger change:
      # +50% of the replicas or +2 pods per 15s period
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Prometheus Operator ServiceMonitor for reranking-usvc.
# Rendered only when horizontalPodAutoscaler.enabled is set.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "reranking-usvc.fullname" . }}
spec:
  selector:
    matchLabels:
      # matches the "svc" label emitted by the chart's labels helper
      # NOTE(review): assumes the Service object includes those labels - confirm
      svc: {{ include "reranking-usvc.fullname" . }}
  endpoints:
    # scrape over plain HTTP every 4 seconds
    # NOTE(review): "service" is presumably the Service's named port - confirm
    - interval: 4s
      port: service
      scheme: http
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/common/reranking-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ autodependency:

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
  enabled: false
  # upper bound for the replica count set by the HPA
  maxReplicas: 3

TEI_RERANKING_ENDPOINT: ""
image:
repository: opea/reranking-tei
Expand Down
22 changes: 22 additions & 0 deletions helm-charts/common/tgi/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
# Dashboard for the exposed TGI metrics:
# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
# Metric descriptions:
# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Prometheus Operator ServiceMonitor for TGI.
# Rendered only when horizontalPodAutoscaler.enabled is set.
# NOTE(review): this file appears to be added at the chart root
# (helm-charts/common/tgi/servicemonitor.yaml) rather than under templates/;
# Helm only renders manifests placed in templates/ - confirm the location.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "tgi.fullname" . }}
spec:
  selector:
    matchLabels:
      # matches the "svc" label emitted by the chart's labels helper
      # NOTE(review): assumes the Service object includes those labels - confirm
      svc: {{ include "tgi.fullname" . }}
  endpoints:
    # scrape over plain HTTP every 4 seconds
    # NOTE(review): "service" is presumably the Service's named port - confirm
    - interval: 4s
      port: service
      scheme: http
{{- end }}
1 change: 1 addition & 0 deletions helm-charts/common/tgi/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "tgi.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "tgi.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/tgi/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "tgi.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "tgi.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -68,6 +71,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 120
{{- end }}
volumes:
- name: model-volume
{{- if .Values.global.modelUsePVC }}
Expand Down
Loading

0 comments on commit 8b9cfde

Please sign in to comment.