-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add HPA support to embedding, reranking, tgi services
Signed-off-by: Alexey Fomenko <[email protected]>
- Loading branch information
Showing
17 changed files
with
313 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# The embedded config.yaml defines three rules that turn the TGI / TEI request-duration | ||
# histograms into the tgi_request_latency, reranking_request_latency and | ||
# embedding_request_latency metrics, each mapped to namespace + service resources. | ||
# NOTE(review): presumably consumed by prometheus-adapter; the ConfigMap name and the | ||
# "monitoring" namespace below are hard-coded and have to match that deployment -- confirm. | ||
# NOTE(review): the TEI rules filter on fixed service names (tei-reranking-svc, | ||
# tei-embedding-svc) -- verify they match the Service names actually deployed. | ||
apiVersion: v1 | ||
data: | ||
config.yaml: | | ||
rules: | ||
- seriesQuery: '{__name__="tgi_request_inference_duration_sum"}' | ||
# Average request latency from TGI histograms, over 1 min | ||
# (0.001 divider add is to make sure there's always a valid value) | ||
metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^tgi_request_inference_duration_sum | ||
as: "tgi_request_latency" | ||
resources: | ||
# HPA needs both namespace + suitable object resource for its query paths: | ||
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency | ||
# (pod is not suitable object type for matching as each instance has different name) | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}' | ||
# Average request latency from TEI histograms, over 1 min | ||
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^te_request_inference_duration_sum | ||
as: "reranking_request_latency" | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}' | ||
# Average request latency from TEI histograms, over 1 min | ||
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^te_request_inference_duration_sum | ||
as: "embedding_request_latency" | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
kind: ConfigMap | ||
metadata: | ||
name: adapter-config | ||
namespace: monitoring | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# Scales the embedding-usvc Deployment between 1 and horizontalPodAutoscaler.maxReplicas | ||
# replicas, driven by the embedding_request_latency metric of the tei-embedding-svc Service. | ||
apiVersion: autoscaling/v2 | ||
kind: HorizontalPodAutoscaler | ||
metadata: | ||
name: {{ include "embedding-usvc.fullname" . }} | ||
spec: | ||
scaleTargetRef: | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
name: {{ include "embedding-usvc.fullname" . }} | ||
# NOTE(review): lower bound is fixed at 1; only the upper bound comes from values. | ||
minReplicas: 1 | ||
maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} | ||
metrics: | ||
- type: Object | ||
object: | ||
metric: | ||
# tei-embedding time metrics are in seconds | ||
name: embedding_request_latency | ||
describedObject: | ||
apiVersion: v1 | ||
# get metric for named object of given type (in same namespace) | ||
kind: Service | ||
# NOTE(review): Service name is hard-coded rather than templated -- verify it | ||
# matches the TEI embedding Service name in every release. | ||
name: tei-embedding-svc | ||
target: | ||
# embedding_request_latency is already average for all the TEI pods, | ||
# so this uses Value instead of averageValue. | ||
# On ICL Xeon, max tei-embedding wait time target = 4s. | ||
type: Value | ||
value: 4 | ||
behavior: | ||
# Scale down cautiously: use the highest recommendation of the last 180 s, | ||
# then remove at most 25% of the current replicas per 15 s period. | ||
scaleDown: | ||
stabilizationWindowSeconds: 180 | ||
policies: | ||
- type: Percent | ||
value: 25 | ||
periodSeconds: 15 | ||
# Scale up immediately (no stabilization window); per 15 s period allow whichever | ||
# policy permits the larger change: +50% of current replicas or +2 pods. | ||
scaleUp: | ||
selectPolicy: Max | ||
stabilizationWindowSeconds: 0 | ||
policies: | ||
- type: Percent | ||
value: 50 | ||
periodSeconds: 15 | ||
- type: Pods | ||
value: 2 | ||
periodSeconds: 15 | ||
{{- end }} |
17 changes: 17 additions & 0 deletions
17
helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# ServiceMonitor that selects Services labeled svc=<embedding-usvc fullname> and | ||
# scrapes their port named "service" over plain HTTP every 4 seconds. | ||
# NOTE(review): assumes the chart's Service carries that "svc" label and names its | ||
# port "service" -- confirm against the Service template. | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: ServiceMonitor | ||
metadata: | ||
name: {{ include "embedding-usvc.fullname" . }} | ||
spec: | ||
selector: | ||
matchLabels: | ||
svc: {{ include "embedding-usvc.fullname" . }} | ||
endpoints: | ||
- interval: 4s | ||
port: service | ||
scheme: http | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
helm-charts/common/reranking-usvc/templates/horizontalPodAutoscaler.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# Scales the reranking-usvc Deployment between 1 and horizontalPodAutoscaler.maxReplicas | ||
# replicas, driven by the reranking_request_latency metric of the tei-reranking-svc Service. | ||
apiVersion: autoscaling/v2 | ||
kind: HorizontalPodAutoscaler | ||
metadata: | ||
name: {{ include "reranking-usvc.fullname" . }} | ||
spec: | ||
scaleTargetRef: | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
name: {{ include "reranking-usvc.fullname" . }} | ||
# NOTE(review): lower bound is fixed at 1; only the upper bound comes from values. | ||
minReplicas: 1 | ||
maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} | ||
metrics: | ||
- type: Object | ||
object: | ||
metric: | ||
# tei-reranking time metrics are in seconds | ||
name: reranking_request_latency | ||
describedObject: | ||
apiVersion: v1 | ||
# get metric for named object of given type (in same namespace) | ||
kind: Service | ||
# NOTE(review): Service name is hard-coded rather than templated -- verify it | ||
# matches the TEI reranking Service name in every release. | ||
name: tei-reranking-svc | ||
target: | ||
# reranking_request_latency is already average for all the TEI pods, | ||
# so this uses Value instead of averageValue. | ||
# On ICL Xeon, max tei-reranking wait time target = 4s. | ||
type: Value | ||
value: 4 | ||
behavior: | ||
# Scale down cautiously: use the highest recommendation of the last 180 s, | ||
# then remove at most 25% of the current replicas per 15 s period. | ||
scaleDown: | ||
stabilizationWindowSeconds: 180 | ||
policies: | ||
- type: Percent | ||
value: 25 | ||
periodSeconds: 15 | ||
# Scale up immediately (no stabilization window); per 15 s period allow whichever | ||
# policy permits the larger change: +50% of current replicas or +2 pods. | ||
scaleUp: | ||
selectPolicy: Max | ||
stabilizationWindowSeconds: 0 | ||
policies: | ||
- type: Percent | ||
value: 50 | ||
periodSeconds: 15 | ||
- type: Pods | ||
value: 2 | ||
periodSeconds: 15 | ||
{{- end }} |
17 changes: 17 additions & 0 deletions
17
helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# ServiceMonitor that selects Services labeled svc=<reranking-usvc fullname> and | ||
# scrapes their port named "service" over plain HTTP every 4 seconds. | ||
# NOTE(review): assumes the chart's Service carries that "svc" label and names its | ||
# port "service" -- confirm against the Service template. | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: ServiceMonitor | ||
metadata: | ||
name: {{ include "reranking-usvc.fullname" . }} | ||
spec: | ||
selector: | ||
matchLabels: | ||
svc: {{ include "reranking-usvc.fullname" . }} | ||
endpoints: | ||
- interval: 4s | ||
port: service | ||
scheme: http | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# Scales the TGI Deployment between 1 and horizontalPodAutoscaler.maxReplicas replicas, | ||
# driven by the tgi_request_latency metric of the tgi-svc Service. | ||
apiVersion: autoscaling/v2 | ||
kind: HorizontalPodAutoscaler | ||
metadata: | ||
name: {{ include "tgi.fullname" . }} | ||
spec: | ||
scaleTargetRef: | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
name: {{ include "tgi.fullname" . }} | ||
# NOTE(review): lower bound is fixed at 1; only the upper bound comes from values. | ||
minReplicas: 1 | ||
maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} | ||
metrics: | ||
- type: Object | ||
object: | ||
metric: | ||
# TGI time metrics are in seconds | ||
name: tgi_request_latency | ||
describedObject: | ||
apiVersion: v1 | ||
# get metric for named object of given type (in same namespace) | ||
kind: Service | ||
# NOTE(review): Service name is hard-coded rather than templated -- verify it | ||
# matches the TGI Service name in every release. | ||
name: tgi-svc | ||
target: | ||
# tgi_request_latency is the average over all the TGI pods. To avoid replica fluctuations | ||
# when TGI startup + request processing takes longer than the HPA evaluation period, this | ||
# uses the "Value" target type instead of "averageValue". | ||
# NOTE(review): per the linked algorithm, the current replica count is scaled by | ||
# metric.value / target.value -- confirm that matches the intent here: | ||
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details | ||
type: Value | ||
value: 4 | ||
behavior: | ||
# Scale down cautiously: use the highest recommendation of the last 180 s, | ||
# then remove at most 25% of the current replicas per 15 s period. | ||
scaleDown: | ||
stabilizationWindowSeconds: 180 | ||
policies: | ||
- type: Percent | ||
value: 25 | ||
periodSeconds: 15 | ||
# Scale up immediately (no stabilization window); per 15 s period allow whichever | ||
# policy permits the larger change: +50% of current replicas or +2 pods. | ||
scaleUp: | ||
selectPolicy: Max | ||
stabilizationWindowSeconds: 0 | ||
policies: | ||
- type: Percent | ||
value: 50 | ||
periodSeconds: 15 | ||
- type: Pods | ||
value: 2 | ||
periodSeconds: 15 | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Dashboard for the exposed TGI metrics: | ||
# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ | ||
# Metric descriptions: | ||
# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 | ||
|
||
{{- if .Values.horizontalPodAutoscaler.enabled }} | ||
# Rendered only when horizontalPodAutoscaler.enabled is set. | ||
# ServiceMonitor that selects Services labeled svc=<tgi fullname> and scrapes their | ||
# port named "service" over plain HTTP every 4 seconds. | ||
# NOTE(review): assumes the chart's Service carries that "svc" label and names its | ||
# port "service" -- confirm against the Service template. | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: ServiceMonitor | ||
metadata: | ||
name: {{ include "tgi.fullname" . }} | ||
spec: | ||
selector: | ||
matchLabels: | ||
svc: {{ include "tgi.fullname" . }} | ||
endpoints: | ||
- interval: 4s | ||
port: service | ||
scheme: http | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.