-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add HPA support to embedding, reranking, tgi services
Signed-off-by: Alexey Fomenko <[email protected]>
- Loading branch information
Showing
17 changed files
with
501 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Custom-metrics rule set, rendered only when horizontalPodAutoscaler.enabled
# is set. Each rule derives an average request latency from a Prometheus
# histogram (_sum rate divided by _count rate over 1 minute) and exposes it
# under a new metric name, associated with the namespace and service resources:
#   tgi_request_latency        <- tgi_request_inference_duration_*
#   reranking_request_latency  <- te_request_inference_duration_* (tei-reranking-svc)
#   embedding_request_latency  <- te_request_inference_duration_* (tei-embedding-svc)
# NOTE(review): presumably consumed by a Prometheus custom-metrics adapter
# deployed in the "monitoring" namespace - confirm against the cluster setup.
apiVersion: v1
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
data:
  config.yaml: |
    rules:
    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
      # Average request latency from TGI histograms, over 1 min
      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "tgi_request_latency"
      resources:
        # HPA needs both namespace + suitable object resource for its query paths:
        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "reranking_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "embedding_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
{{- end }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the embedding-usvc Deployment, rendered only when
# horizontalPodAutoscaler.enabled is set. Scaling is driven by the
# embedding_request_latency custom metric read from the tei-embedding-svc
# Service object.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  # Lower bound is configurable via horizontalPodAutoscaler.minReplicas;
  # it falls back to 1 when the value is not set.
  minReplicas: {{ .Values.horizontalPodAutoscaler.minReplicas | default 1 }}
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
  - type: Object
    object:
      metric:
        # tei-embedding time metrics are in seconds
        name: embedding_request_latency
      describedObject:
        apiVersion: v1
        # get metric for named object of given type (in same namespace)
        kind: Service
        name: tei-embedding-svc
      target:
        # embedding_request_latency is an average over all TEI pods. To avoid replica
        # fluctuations when TEI startup + request processing takes longer than the HPA
        # evaluation period, this uses the "Value" target type, where the metric is
        # compared directly to the target:
        #   desiredReplicas = ceil(currentReplicas * metric.value / target.value)
        # instead of "AverageValue", which first divides the metric by the pod count:
        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
        type: Value
        value: 4
  behavior:
    # Scale down cautiously: wait 180s for the metric to stabilize, then remove
    # at most 25% of the replicas per 15s period.
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
      - type: Percent
        value: 25
        periodSeconds: 15
    # Scale up immediately, using whichever policy allows the larger change:
    # +50% of the replicas or +2 pods per 15s period.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
      - type: Percent
        value: 50
        periodSeconds: 15
      - type: Pods
        value: 2
        periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions
17
helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# ServiceMonitor for embedding-usvc, rendered only when
# horizontalPodAutoscaler.enabled is set. It selects Services by their
# app.kubernetes.io/name label and scrapes the "service" port over HTTP
# every 4 seconds.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  endpoints:
  # NOTE(review): "service" must match a named port on the selected Service -
  # confirm against the chart's service template.
  - port: service
    scheme: http
    interval: 4s
  selector:
    matchLabels:
      # NOTE(review): this uses the "fullname" helper for the
      # app.kubernetes.io/name label; verify that the Service is labelled with
      # the same value, otherwise no target will be selected.
      app.kubernetes.io/name: {{ include "embedding-usvc.fullname" . }}
{{- end }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.