Support alternative metrics on accelerated TGI / TEI instances (#454)
* Add tgi.accelDevice to rest of top-level gaudi-values.yaml files

DocSum defaults to the same model as ChatQnA, and the default model used by
CodeGen + CodeTrans is also a 7B one, so the tgi.accelDevice impact is
assumed to be close enough.

Signed-off-by: Eero Tamminen <[email protected]>

* Different TGI/TEI custom metrics & HPA rules for accelerated devices

Signed-off-by: Eero Tamminen <[email protected]>

---------

Signed-off-by: Eero Tamminen <[email protected]>
eero-t authored Sep 27, 2024
1 parent cdd47a5 commit cdd3585
Showing 7 changed files with 122 additions and 71 deletions.
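The accelerated-metric path introduced here is taken only when the subchart's accelDevice value is set in addition to enabling the HPA. A minimal top-level values sketch using the keys these templates check (mirroring the gaudi-values.yaml files in this commit):

tgi:
  # both keys are read by the custom-metrics ConfigMap and HPA templates below
  accelDevice: "gaudi"
  horizontalPodAutoscaler:
    enabled: true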
121 changes: 71 additions & 50 deletions helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -12,54 +12,75 @@ metadata:
app.kubernetes.io/name: prometheus-adapter
data:
config.yaml: |
rules:
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency | jq
#
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
# (0.001 divider add is to make sure there's always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
# (pod is not suitable object type for matching as each instance has different name)
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
resources:
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
resources:
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
rules:
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
#
{{- if .Values.tgi.accelDevice }}
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# TGI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
name:
matches: ^tgi_queue_size
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
# (0.001 divider add is to make sure there's always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
{{- end }}
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
# (pod is not suitable object type for matching as each instance has different name)
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
{{- if .Values.teirerank.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
{{- if .Values.tei.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- end }}
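With accelDevice set, the ChatQnA rule above renders to a plain sum over tgi_queue_size instead of the latency rate expression. A sketch of the rendered output, assuming (for illustration only) that the TGI service name resolves to chatqna-tgi and the metric prefix to chatqna_tgi; both depend on the Helm release name:

rules:
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/chatqna_tgi_queue_size_sum | jq
- seriesQuery: '{__name__="tgi_queue_size",service="chatqna-tgi"}'
  # TGI instances queue_size sum
  metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="chatqna-tgi",<<.LabelMatchers>>})'
  name:
    matches: ^tgi_queue_size
    as: "chatqna_tgi_queue_size_sum"
  resources:
    overrides:
      namespace: {resource: "namespace"}
      service: {resource: "service"}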
1 change: 1 addition & 0 deletions helm-charts/codegen/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"
1 change: 1 addition & 0 deletions helm-charts/codetrans/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"
23 changes: 16 additions & 7 deletions helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TEI time metrics are in seconds
name: {{ include "tei.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "tei.fullname" . }}
target:
# embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
# TEI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "tei.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
23 changes: 16 additions & 7 deletions helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TEI time metrics are in seconds
name: {{ include "teirerank.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "teirerank.fullname" . }}
target:
# reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
# TEI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "teirerank.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
23 changes: 16 additions & 7 deletions helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TGI time metrics are in seconds
name: {{ include "tgi.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "tgi.fullname" . }}
target:
# tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
# TGI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "tgi.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
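In the accelerated case the HPA therefore targets the queue-size sum with an "AverageValue" target; because the metric is divided by the pod count before comparison, the desired replica count works out to roughly ceil(queue_size_sum / 15), e.g. a summed queue size of 45 asks for 3 replicas. A sketch of the rendered TGI metrics block under the same illustrative chatqna-tgi / chatqna_tgi naming assumption (the tei and teirerank templates render analogously):

metrics:
- type: Object
  object:
    describedObject:
      apiVersion: v1
      # get metric for named object of given type (in same namespace)
      kind: Service
      name: chatqna-tgi
    target:
      # sum of queue sizes across TGI pods, divided by pod count by the HPA
      type: AverageValue
      averageValue: 15
    metric:
      name: chatqna_tgi_queue_size_sum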
1 change: 1 addition & 0 deletions helm-charts/docsum/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"
