Adding worker autoscaling support with KEDA
sdaberdaku committed Dec 15, 2024
1 parent 424580a commit 21bfb02
Showing 10 changed files with 219 additions and 12 deletions.
64 changes: 64 additions & 0 deletions charts/trino/README.md
@@ -108,6 +108,70 @@ Fast distributed SQL query engine for big data analytics that helps you explore
periodSeconds: 15
selectPolicy: Max
```
* `server.keda` - object, default: `{"advanced":{},"annotations":{},"cooldownPeriod":300,"enabled":false,"fallback":{},"initialCooldownPeriod":0,"maxReplicaCount":5,"minReplicaCount":0,"pollingInterval":30,"triggers":[]}`

Configure [KEDA](https://keda.sh/) for workers.
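For orientation, a minimal values override that enables KEDA-driven worker scaling could look like the sketch below; the Prometheus address, service label, metric name, and query are placeholders rather than chart defaults, and the individual options are described in the entries that follow.
```yaml
server:
  keda:
    enabled: true
    minReplicaCount: 0
    maxReplicaCount: 5
    triggers:
      - type: prometheus
        metricType: Value
        metadata:
          # Placeholder endpoint; point this at a Prometheus instance that scrapes the Trino JMX metrics.
          serverAddress: "http://prometheus.example.com"
          threshold: "1"
          metricName: required_workers
          # Placeholder query; the service label must match the rendered fullname of the release.
          query: >-
            sum by (service)
            (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service="my-trino"}[30s]))
```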
* `server.keda.cooldownPeriod` - int, default: `300`

Period to wait after the last trigger reported active before scaling the resource back to 0
* `server.keda.initialCooldownPeriod` - int, default: `0`

The delay before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
* `server.keda.minReplicaCount` - int, default: `0`

Minimum number of replicas KEDA will scale the resource down to. By default, it scales to zero, but any other value can be used as well.
* `server.keda.maxReplicaCount` - int, default: `5`

This setting is passed to the HPA definition that KEDA will create for a given resource and holds the maximum number of replicas of the target resource.
* `server.keda.fallback` - object, default: `{}`

Defines the number of replicas to fall back to if a scaler is in an error state.
Example:
```yaml
fallback: # Optional. Section to specify fallback options
  failureThreshold: 3 # Mandatory if fallback section is included
  replicas: 6 # Mandatory if fallback section is included
```
* `server.keda.advanced` - object, default: `{}`

Specifies HPA-related options.
Example:
```yaml
advanced:
  horizontalPodAutoscalerConfig:
    behavior:
      scaleDown:
        stabilizationWindowSeconds: 300
        policies:
          - type: Percent
            value: 100
            periodSeconds: 15
```
* `server.keda.triggers` - list, default: `[]`

List of triggers to activate scaling of the target resource. Trigger definitions are rendered through Helm's `tpl` function, so template expressions such as `{{ include "trino.fullname" . }}` in the example below are resolved at install time.
Example:
```yaml
triggers:
  - type: prometheus
    metricType: Value
    metadata:
      serverAddress: "http://prometheus.example.com"
      threshold: "1"
      metricName: running_queries
      query: >-
        sum by (service)
        (avg_over_time(trino_execution_resourcegroups_InternalResourceGroup_RunningQueries{service={{ include "trino.fullname" . | quote }}}[5s]))
```
* `server.keda.annotations` - object, default: `{}`

Annotations to apply to the ScaledObject.
Example:
```yaml
annotations:
  autoscaling.keda.sh/paused-replicas: "0"
  autoscaling.keda.sh/paused: "true"
```
* `accessControl` - object, default: `{}`

[System access control](https://trino.io/docs/current/security/built-in-system-access-control.html) configuration.
2 changes: 1 addition & 1 deletion charts/trino/templates/autoscaler.yaml
@@ -1,4 +1,4 @@
{{- if .Values.server.autoscaling.enabled -}}
{{- if and .Values.server.autoscaling.enabled (not .Values.server.keda.enabled) -}}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
2 changes: 1 addition & 1 deletion charts/trino/templates/configmap-coordinator.yaml
@@ -50,7 +50,7 @@ data:

config.properties: |
coordinator=true
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
node-scheduler.include-coordinator=false
{{- else }}
node-scheduler.include-coordinator=true
2 changes: 1 addition & 1 deletion charts/trino/templates/configmap-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: v1
kind: ConfigMap
metadata:
4 changes: 2 additions & 2 deletions charts/trino/templates/deployment-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -19,7 +19,7 @@ spec:
revisionHistoryLimit: {{ .Values.worker.deployment.revisionHistoryLimit }}
strategy:
{{- toYaml .Values.worker.deployment.strategy | nindent 4 }}
{{- if not .Values.server.autoscaling.enabled }}
{{- if and (not .Values.server.autoscaling.enabled) (not .Values.server.keda.enabled) }}
replicas: {{ .Values.server.workers }}
{{- end }}
selector:
37 changes: 37 additions & 0 deletions charts/trino/templates/keda-scaledobject.yaml
@@ -0,0 +1,37 @@
{{- if .Values.server.keda.enabled }}
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{ template "trino.worker" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "trino.labels" . | nindent 4 }}
  {{- with .Values.server.keda.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ template "trino.worker" . }}
  pollingInterval: {{ .Values.server.keda.pollingInterval }}
  cooldownPeriod: {{ .Values.server.keda.cooldownPeriod }}
  initialCooldownPeriod: {{ .Values.server.keda.initialCooldownPeriod }}
  minReplicaCount: {{ .Values.server.keda.minReplicaCount }}
  maxReplicaCount: {{ .Values.server.keda.maxReplicaCount }}
  {{- with .Values.server.keda.fallback }}
  fallback:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  {{- with .Values.server.keda.advanced }}
  advanced:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  {{- with .Values.server.keda.triggers }}
  triggers:
    {{- tpl (toYaml .) $ | nindent 4 }}
  {{- else }}
  {{- fail "At least one element in `.Values.server.keda.triggers` is required!" }}
  {{- end }}
{{- end }}
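For illustration, with the documented defaults for the numeric options and a single Prometheus trigger like the README example, this template would render a ScaledObject roughly as in the sketch below; the names assume a release whose fullname resolves to `trino`, installation into the `default` namespace, and an abbreviated label set.
```yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: trino-worker
  namespace: default
  labels:
    app.kubernetes.io/name: trino      # abbreviated; trino.labels emits the full standard label set
    app.kubernetes.io/instance: trino
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: trino-worker                 # the worker Deployment managed by this chart
  pollingInterval: 30
  cooldownPeriod: 300
  initialCooldownPeriod: 0
  minReplicaCount: 0
  maxReplicaCount: 5
  triggers:
    - type: prometheus
      metricType: Value
      metadata:
        serverAddress: http://prometheus.example.com   # placeholder
        threshold: "1"
        metricName: running_queries
        query: sum by (service) (avg_over_time(trino_execution_resourcegroups_InternalResourceGroup_RunningQueries{service="trino"}[5s]))
```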
5 changes: 5 additions & 0 deletions charts/trino/templates/tests/test-connection.yaml
@@ -42,7 +42,12 @@ spec:
- --password
{{- end }}
- --debug
{{- if .Values.server.keda.enabled }}
{{/* When testing KEDA we need a query that requires workers to run. */}}
- --execute=SELECT COUNT(*) FROM tpch.tiny.nation
{{- else }}
- --execute=SELECT 1
{{- end }}
- --no-progress
{{- if eq .Values.server.config.authenticationType "PASSWORD" }}
env:
64 changes: 64 additions & 0 deletions charts/trino/values.yaml
@@ -114,6 +114,70 @@ server:
# selectPolicy: Max
# ```

  # -- Configure [KEDA](https://keda.sh/) for workers.
  keda:
    enabled: false
    pollingInterval: 30
    # -- Period to wait after the last trigger reported active before scaling the resource back to 0
    cooldownPeriod: 300
    # -- The delay before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
    initialCooldownPeriod: 0
    # -- Minimum number of replicas KEDA will scale the resource down to.
    # By default, it scales to zero, but any other value can be used as well.
    minReplicaCount: 0
    # -- This setting is passed to the HPA definition that KEDA will create for a given resource and
    # holds the maximum number of replicas of the target resource.
    maxReplicaCount: 5
    fallback: {}
      # server.keda.fallback -- Defines the number of replicas to fall back to if a scaler is in an error state.
      # @raw
      # Example:
      # ```yaml
      # fallback: # Optional. Section to specify fallback options
      #   failureThreshold: 3 # Mandatory if fallback section is included
      #   replicas: 6 # Mandatory if fallback section is included
      # ```
    advanced: {}
      # server.keda.advanced -- Specifies HPA-related options.
      # @raw
      # Example:
      # ```yaml
      # advanced:
      #   horizontalPodAutoscalerConfig:
      #     behavior:
      #       scaleDown:
      #         stabilizationWindowSeconds: 300
      #         policies:
      #           - type: Percent
      #             value: 100
      #             periodSeconds: 15
      # ```
    triggers: []
      # server.keda.triggers -- List of triggers to activate scaling of the target resource.
      # Trigger definitions are rendered through Helm's `tpl` function, so template expressions
      # such as `{{ include "trino.fullname" . }}` in the example below are resolved at install time.
      # @raw
      # Example:
      # ```yaml
      # triggers:
      #   - type: prometheus
      #     metricType: Value
      #     metadata:
      #       serverAddress: "http://prometheus.example.com"
      #       threshold: "1"
      #       metricName: running_queries
      #       query: >-
      #         sum by (service)
      #         (avg_over_time(trino_execution_resourcegroups_InternalResourceGroup_RunningQueries{service={{ include "trino.fullname" . | quote }}}[5s]))
      # ```
    annotations: {}
      # server.keda.annotations -- Annotations to apply to the ScaledObject.
      # @raw
      # Example:
      # ```yaml
      # annotations:
      #   autoscaling.keda.sh/paused-replicas: "0"
      #   autoscaling.keda.sh/paused: "true"
      # ```

accessControl: {}
# accessControl -- [System access
# control](https://trino.io/docs/current/security/built-in-system-access-control.html)
27 changes: 25 additions & 2 deletions tests/trino/test-values.yaml
@@ -3,7 +3,7 @@
# Declare variables to be passed into your templates.

server:
  workers: 2
  workers: 0
  config:
    https:
      enabled: true
@@ -15,6 +15,23 @@ server:
    query.execution-policy=phased
  autoscaling:
    enabled: true
  keda:
    enabled: true
    pollingInterval: 5
    minReplicaCount: 0
    maxReplicaCount: 2
    cooldownPeriod: 300
    triggers:
      - type: prometheus
        metricType: Value
        metadata:
          serverAddress: http://prometheus-operator-kube-p-prometheus.{{ .Release.Namespace }}:9090
          threshold: "1"
          metricName: running_queries
          query: >-
            sum by (service)
            (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
  additionalConfigProperties:
    - internal-communication.shared-secret=random-value-999
@@ -247,12 +264,13 @@ jmx:
rules:
- pattern: 'trino.memory*'
- pattern: 'trino.execution<name=QueryManager>*'
- pattern: 'trino.execution<name=ClusterSizeMonitor>*'
serviceMonitor:
enabled: true
labels:
prometheus: default
interval: "30s"
interval: "1s"

ingress:
enabled: true
@@ -271,3 +289,8 @@
- key: test
operator: NotIn
values: [network-policy]

catalogs:
  tpch: |
    connector.name=tpch
    tpch.splits-per-node=4
24 changes: 19 additions & 5 deletions tests/trino/test.sh
@@ -34,6 +34,7 @@ function join_by {
# default to randomly generated namespace, same as chart-testing would do, but we need to load secrets into the same namespace
NAMESPACE=trino-$(LC_ALL=C tr -dc 'a-z0-9' </dev/urandom | head -c 6 || true)
DB_NAMESPACE=postgresql
KEDA_NAMESPACE=keda
HELM_EXTRA_SET_ARGS=
CT_ARGS=(
--skip-clean-up
@@ -105,8 +106,9 @@ spec:
storage: 128Mi
YAML

# only install the Prometheus Helm chart when running the `complete_values` test
# only install the Prometheus and KEDA Helm charts when running the `complete_values` test
if printf '%s\0' "${TEST_NAMES[@]}" | grep -qwz complete_values; then
# prometheus
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade --install prometheus-operator prometheus-community/kube-prometheus-stack -n "$NAMESPACE" \
--version "60.0.2" \
@@ -129,6 +131,14 @@ if printf '%s\0' "${TEST_NAMES[@]}" | grep -qwz complete_values; then
--set prometheusOperator.serviceMonitor.selfMonitor=false \
--set prometheus.serviceMonitor.selfMonitor=false
kubectl rollout status --watch deployments -l release=prometheus-operator -n "$NAMESPACE"
# keda
helm repo add kedacore https://kedacore.github.io/charts
helm upgrade --install keda kedacore/keda -n "$KEDA_NAMESPACE" \
--create-namespace \
--version "2.16.0" \
--set webhooks.enabled=false \
--set asciiArt=false
kubectl rollout status --watch deployments -l app.kubernetes.io/instance=keda -n "$KEDA_NAMESPACE"
fi

# only install the PostgreSQL Helm chart when running the `resource_groups_properties` test
@@ -171,10 +181,14 @@ if [ "$CLEANUP_NAMESPACE" == "true" ]; then
kubectl delete namespace "$DB_NAMESPACE" --ignore-not-found
helm -n "$NAMESPACE" uninstall prometheus-operator --ignore-not-found
kubectl delete namespace "$NAMESPACE"
mapfile -t crds < <(kubectl api-resources --api-group=monitoring.coreos.com --output name)
if [ ${#crds[@]} -ne 0 ]; then
kubectl delete crd "${crds[@]}"
fi
helm -n "$KEDA_NAMESPACE" uninstall keda --ignore-not-found
kubectl delete namespace "$KEDA_NAMESPACE"
for api_group in monitoring.coreos.com eventing.keda.sh keda.sh; do
mapfile -t crds < <(kubectl api-resources --api-group="$api_group" --output name)
if [ ${#crds[@]} -ne 0 ]; then
kubectl delete crd "${crds[@]}"
fi
done
fi

exit $result
