diff --git a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
index 17b23903..c02b07bf 100644
--- a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
+++ b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -12,54 +12,75 @@ metadata:
     app.kubernetes.io/name: prometheus-adapter
 data:
   config.yaml: |
-    rules:
-    {{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
-    # check metric with:
-    # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency | jq
-    #
-    - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
-      # Average request latency from TGI histograms, over 1 min
-      # (0.001 divider add is to make sure there's always a valid value)
-      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
-      name:
-        matches: ^tgi_request_inference_duration_sum
-        as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
-      resources:
-        # HPA needs both namespace + suitable object resource for its query paths:
-        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
-        # (pod is not suitable object type for matching as each instance has different name)
-        overrides:
-          namespace:
-            resource: namespace
-          service:
-            resource: service
-    {{- end }}
-    {{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
-    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
-      # Average request latency from TEI histograms, over 1 min
-      metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
-      name:
-        matches: ^te_request_inference_duration_sum
-        as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
-      resources:
-        overrides:
-          namespace:
-            resource: namespace
-          service:
-            resource: service
-    {{- end }}
-    {{- if .Values.tei.horizontalPodAutoscaler.enabled }}
-    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
-      # Average request latency from TEI histograms, over 1 min
-      metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
-      name:
-        matches: ^te_request_inference_duration_sum
-        as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
-      resources:
-        overrides:
-          namespace:
-            resource: namespace
-          service:
-            resource: service
-    {{- end }}
+    rules:
+    {{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
+    # check metric with:
+    # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ | jq
+    #
+    {{- if .Values.tgi.accelDevice }}
+    - seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
+      # TGI instances queue_size sum
+      metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
+      name:
+        matches: ^tgi_queue_size
+        as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
+    {{- else }}
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (0.001 divider add is to make sure there's always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
+    {{- end }}
+      resources:
+        # HPA needs both namespace + suitable object resource for its query paths:
+        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/
+        # (pod is not suitable object type for matching as each instance has different name)
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
+    {{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
+    {{- if .Values.teirerank.accelDevice }}
+    - seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
+      # TEI instances queue_size sum
+      metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
+      name:
+        matches: ^te_queue_size
+        as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
+    {{- else }}
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
+    {{- end }}
+      resources:
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
+    {{- if .Values.tei.horizontalPodAutoscaler.enabled }}
+    {{- if .Values.tei.accelDevice }}
+    - seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
+      # TEI instances queue_size sum
+      metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
+      name:
+        matches: ^te_queue_size
+        as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
+    {{- else }}
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
+    {{- end }}
+      resources:
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
 {{- end }}
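For reference, this is roughly what the new Gaudi branch of the rule renders to. A sketch only: the `chatqna-tgi` service name and the bare `tgi` metric prefix are assumed stand-ins for whatever the `tgi.fullname` and `tgi.metricPrefix` includes expand to for a given release.

```yaml
# Illustrative rendering of the queue-size rule; names are assumptions.
- seriesQuery: '{__name__="tgi_queue_size",service="chatqna-tgi"}'
  metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="chatqna-tgi",<<.LabelMatchers>>})'
  name:
    matches: ^tgi_queue_size
    as: "tgi_queue_size_sum"
  resources:
    overrides:
      namespace: {resource: "namespace"}
      service: {resource: "service"}
# check with (path pattern as in the template comment above):
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_queue_size_sum | jq
```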
diff --git a/helm-charts/codegen/gaudi-values.yaml b/helm-charts/codegen/gaudi-values.yaml
index e78ab1de..526271ca 100644
--- a/helm-charts/codegen/gaudi-values.yaml
+++ b/helm-charts/codegen/gaudi-values.yaml
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
     tag: "2.0.1"
diff --git a/helm-charts/codetrans/gaudi-values.yaml b/helm-charts/codetrans/gaudi-values.yaml
index e78ab1de..526271ca 100644
--- a/helm-charts/codetrans/gaudi-values.yaml
+++ b/helm-charts/codetrans/gaudi-values.yaml
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
     tag: "2.0.1"
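Setting `accelDevice` on its own only switches which prometheus-adapter rule and HPA target get templated; scaling itself still has to be turned on. A minimal values sketch, assuming the `horizontalPodAutoscaler.enabled` flag tested by the templates above is the intended switch:

```yaml
tgi:
  accelDevice: "gaudi"          # selects the queue-size based scaling path
  horizontalPodAutoscaler:
    enabled: true               # without this, neither adapter rule nor HPA is rendered
```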
diff --git a/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
index 0da41daf..277184ee 100644
--- a/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
+++ b/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
   metrics:
   - type: Object
     object:
-      metric:
-        # TEI time metrics are in seconds
-        name: {{ include "tei.metricPrefix" . }}_request_latency
       describedObject:
         apiVersion: v1
         # get metric for named object of given type (in same namespace)
         kind: Service
         name: {{ include "tei.fullname" . }}
       target:
-        # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
-        # TEI startup + request processing takes longer than HPA evaluation period, this uses
-        # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+{{- if .Values.accelDevice }}
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
         # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+        averageValue: 15
+        metric:
+          name: {{ include "tei.metricPrefix" . }}_queue_size_sum
+{{- else }}
+        # Metric is average for all the pods. To avoid replica fluctuation when pod
+        # startup + request processing takes longer than HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
         type: Value
-        value: 4
+        value: 4 # seconds
+        metric:
+          name: {{ include "tei.metricPrefix" . }}_request_latency
+{{- end }}
   behavior:
     scaleDown:
       stabilizationWindowSeconds: 180
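To make the `AverageValue` arithmetic concrete: per the comments above and the linked algorithm docs, the HPA divides the Object metric value by the current replica count before comparing it to `averageValue`, which reduces to `desiredReplicas = ceil(metricValue / averageValue)`. A worked example against the target of 15 (the queue depth is made up):

```yaml
# <prefix>_queue_size_sum returned by the adapter: 120, over 4 running pods
# per-pod average: 120 / 4 = 30  -> exceeds the averageValue target of 15
# desiredReplicas = ceil(4 * (30 / 15)) = ceil(120 / 15) = 8
```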
diff --git a/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
index c5914fca..f54dc070 100644
--- a/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
+++ b/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
   metrics:
   - type: Object
     object:
-      metric:
-        # TEI time metrics are in seconds
-        name: {{ include "teirerank.metricPrefix" . }}_request_latency
       describedObject:
         apiVersion: v1
         # get metric for named object of given type (in same namespace)
         kind: Service
         name: {{ include "teirerank.fullname" . }}
       target:
-        # reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
-        # TEI startup + request processing takes longer than HPA evaluation period, this uses
-        # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+{{- if .Values.accelDevice }}
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
         # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+        averageValue: 15
+        metric:
+          name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
+{{- else }}
+        # Metric is average for all the pods. To avoid replica fluctuation when pod
+        # startup + request processing takes longer than HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
         type: Value
-        value: 4
+        value: 4 # seconds
+        metric:
+          name: {{ include "teirerank.metricPrefix" . }}_request_latency
+{{- end }}
   behavior:
     scaleDown:
       stabilizationWindowSeconds: 180
diff --git a/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
index 646ea9cc..276ff067 100644
--- a/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
+++ b/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
   metrics:
   - type: Object
     object:
-      metric:
-        # TGI time metrics are in seconds
-        name: {{ include "tgi.metricPrefix" . }}_request_latency
       describedObject:
         apiVersion: v1
         # get metric for named object of given type (in same namespace)
         kind: Service
         name: {{ include "tgi.fullname" . }}
       target:
-        # tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
-        # TGI startup + request processing takes longer than HPA evaluation period, this uses
-        # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+{{- if .Values.accelDevice }}
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
         # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+        averageValue: 15
+        metric:
+          name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
+{{- else }}
+        # Metric is average for all the pods. To avoid replica fluctuation when pod
+        # startup + request processing takes longer than HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
         type: Value
-        value: 4
+        value: 4 # seconds
+        metric:
+          name: {{ include "tgi.metricPrefix" . }}_request_latency
+{{- end }}
   behavior:
     scaleDown:
       stabilizationWindowSeconds: 180
diff --git a/helm-charts/docsum/gaudi-values.yaml b/helm-charts/docsum/gaudi-values.yaml
index e78ab1de..526271ca 100644
--- a/helm-charts/docsum/gaudi-values.yaml
+++ b/helm-charts/docsum/gaudi-values.yaml
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
     tag: "2.0.1"
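Once deployed with one of these gaudi-values files, the chain can be checked end to end. A sketch, with placeholder names hedged as assumptions (the exact metric name and HPA object name depend on the release):

```yaml
# 1. adapter exposes the summed metric:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ | jq
# 2. HPA sees it; for the AverageValue case the TARGETS column should read like "30/15 (avg)":
# kubectl get hpa
# 3. scaling decisions and events for a given HPA object:
# kubectl describe hpa <hpa-name>
```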