cluster/config-defaults.yaml

# Autoscaling settings
autoscaling_scale_down_enabled: "true"
autoscaling_buffer_cpu: "1m"
autoscaling_buffer_memory: "10Mi"
autoscaling_buffer_pods: "1"
cluster_autoscaler_cpu: "100m"
cluster_autoscaler_memory: "300Mi"
autoscaling_utilization_threshold: "1.0"
autoscaling_max_empty_bulk_delete: "10"
autoscaling_scale_down_unneeded_time: "2m"
autoscaling_unremovable_node_recheck_timeout: "5m"
# configure the log level for the autoscaler. Setting this to 4 gives enough
# verbosity to understand why a certain node pool is not picked for scheduling a
# pod.
autoscaling_autoscaler_log_level: "1"

# How long to wait for pod eviction when scaling down.
{{if eq .Cluster.Environment "production"}}
cluster_autoscaler_max_pod_eviction_time: "1h"
{{else}}
cluster_autoscaler_max_pod_eviction_time: "3h"
{{end}}

# Override terminationGracePeriodSeconds when evicting pods for scale down, if the pods' value is higher than this one
cluster_autoscaler_max_graceful_termination_sec: "1209600" # 2 weeks

# Prevent CA lock-ups caused by large amounts of pending pods. Can be disabled completely by setting to 0.
cluster_autoscaler_max_usnchedulable_pods_considered: "1000"

# karpenter settings
# DO NOT SET TO FALSE IF THE CLUSTER HAS KARPENTER POOLS OR NODES. REFER TO TEAPOT DOCS  FOR HOW TO ROLLBACK KARPENTER
# https://teapot.docs.zalando.net/howtos/karpenter-operations/
karpenter_pools_enabled: "false"

karpenter_controller_cpu: "25m"
karpenter_controller_memory: "256Mi"
# set log level of karpenter: error|debug
karpenter_log_level: "error"
# restrict the maximum number of pods for karpenter nodes
karpenter_max_pods_per_node: "32"
#
# Karpenter version for controlling roll-out, can be "current" or "legacy"
# current => 0.37.0-main-26.patched
# legacy => 0.36.2-main-25.patched
karpenter_version: "current"

# ALB config created by kube-aws-ingress-controller
kube_aws_ingress_controller_ssl_policy: "ELBSecurityPolicy-TLS-1-2-2017-01"
kube_aws_ingress_controller_idle_timeout: "1m"
kube_aws_ingress_controller_deregistration_delay_timeout: "10s"
# allow using NLBs for ingress
# This opens skipper-ingress ports 9998 and 9999 on all worker nodes
kube_aws_ingress_controller_nlb_enabled: "true"
kube_aws_ingress_controller_nlb_cross_zone: "true"
kube_aws_ingress_controller_nlb_zone_affinity: "any_availability_zone"
kube_aws_ingress_controller_cert_polling_interval: "2m"
# sets the default LB type: "network" or "application" are valid choices (overwritten by nlb_switch)
kube_aws_ingress_default_lb_type: "application"
# cert filter
kube_aws_ingress_controller_cert_filter_tag: "kubernetes=enabled"

# ALB to NLB switch
# "pre":
# - kube-ingress-aws-controller will configure NLB to forward HTTPS requests
#   to skipper-ingress on port 9999 and HTTP requests to port 9998
# - skipper-ingress on port 9999 will add X-Forwarded-For=<client IP> and X-Forwarded-Proto=https headers to requests from NLB
# - skipper-ingress on port 9998 will reply with 308 Permanent Redirect to https
#
# "exec": same as "pre" and kube-ingress-aws-controller will default to NLB
# after removing it you need to set kube_aws_ingress_default_lb_type: "network" above and cleanup skipper and ingress-ctl deployment.yaml
nlb_switch: "exec"

# skipper ingress settings
skipper_ingress_target_average_utilization_cpu: "60"
skipper_ingress_target_average_utilization_memory: "80"
skipper_ingress_hpa_scale_down_wait: "600"
skipper_ingress_hpa_scale_up_max_perc: "100"
{{if eq .Cluster.Environment "production"}}
skipper_ingress_min_replicas: "3"
skipper_ingress_max_replicas: "300"
{{else}}
skipper_ingress_min_replicas: "2"
skipper_ingress_max_replicas: "50"
{{end}}
skipper_ingress_cpu: "1000m"
skipper_ingress_memory: "1500Mi"

skipper_topology_spread_enabled: "true"
skipper_topology_spread_timeout: "7m"

# PHC
skipper_ingress_health_check_options: "period=10s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.9"

# Enables deployment of canary version
skipper_ingress_canary_enabled: "true"
skipper_ingress_test_single_pod: "false"
skipper_canary_controller_enabled: "false"

# When set to true (and dedicated node pool for skipper is also true) the
# daemonset overhead will be subtracted from the cpu settings such
# that skipper will perfectly fit on the node.
{{if eq .Cluster.Environment "e2e"}}
skipper_ingress_binpack: "true"
{{else}}
skipper_ingress_binpack: "false"
{{end}}

# skipper node-pool
enable_dedicate_nodepool_skipper: "true"
{{if eq .Cluster.Environment "e2e"}}
skipper_attach_only_to_skipper_node_pool: "false"
{{else}}
skipper_attach_only_to_skipper_node_pool: "true"
{{end}}

skipper_suppress_route_update_logs: "true"
skipper_validate_query: "true"
skipper_validate_query_log: "false"

skipper_default_filters: 'disableAccessLog(2,3,404,429) -> fifo(2000,20,"1s")'
# skipper_default_filters_authentication defines filters that implement default request authentication
skipper_default_filters_authentication: ''
skipper_default_filters_append: 'stateBagToTag("auth-user", "client.uid")'
skipper_disabled_filters: "static,bearerinjector,setRequestHeaderFromSecret,basicAuth"
skipper_lua_sources: "file"
skipper_edit_route_placeholders: ""
skipper_ingress_inline_routes: ""
skipper_ingress_refuse_payload: ""
skipper_endpointslices_enabled: "true"
skipper_kubernetes_annotation_predicates: ''
skipper_kubernetes_east_west_range_annotation_predicates: ''

skipper_compress_encodings: "gzip,deflate,br"

# Adds "start" label to each prometheus counter with the value of counter creation timestamp as unix nanoseconds
skipper_prometheus_start_label_enabled: "true"

# skipper profiling settings, 0 keeps default, <0 disable, >0 enable with value
# https://pkg.go.dev/runtime@master#SetBlockProfileRate
# https://pkg.go.dev/runtime@master#SetMutexProfileFraction
skipper_block_profile_rate: 0
skipper_mutex_profile_fraction: 0
skipper_memory_profile_rate: 0

# skipper backend timeout defaults
skipper_expect_continue_timeout_backend: "30s"
skipper_keepalive_backend: "30s"
skipper_max_idle_connection_backend: "0"
skipper_response_header_timeout_backend: "1m"
skipper_timeout_backend: "1s"
skipper_tls_timeout_backend: "3s"
skipper_close_idle_conns_period: "20s"

# skipper server timeout defaults
skipper_read_timeout_server: "5m"
skipper_write_timeout_server: "0"

# skipper server keepalive defaults
skipper_keepalive_server: "0"
skipper_keepalive_requests_server: "0"

# skipper startup settings
{{if eq .Cluster.Environment "production"}}
skipper_readiness_init_delay_seconds: 60
skipper_liveness_init_delay_seconds: 30
{{else}}
skipper_readiness_init_delay_seconds: 1
skipper_liveness_init_delay_seconds: 30
{{end}}
# skipper termination settings
# (10s LB healthcheck interval) * (3 unhealthy threshold + margin of 1 interval)
skipper_wait_for_healthcheck_interval: "40s"
# (350s of fixed NLB connection idle timeout) + (margin of 2s)
skipper_idle_timeout_server: "352s"
# wait long enough for LB to detect unhealthy node and all connections become idle,
# i.e. skipper_wait_for_healthcheck_interval + skipper_idle_timeout_server
skipper_termination_grace_period: "392"

# skipper redis settings
enable_dedicate_nodepool_skipper_redis: "false"
skipper_redis_cpu: "100m"
skipper_redis_memory: "512Mi"
skipper_redis_dial_timeout: "25ms"
skipper_redis_pool_timeout: "250ms"
skipper_redis_read_timeout: "25ms"
skipper_redis_write_timeout: "25ms"
skipper_redis_min_conns: 25
skipper_redis_max_conns: 100

skipper_ingress_redis_swarm_enabled: "true"
skipper_ingress_redis_swim_enabled: "false"
skipper_ingress_redis_target_average_utilization_cpu: "30"
skipper_ingress_redis_target_average_utilization_memory: "60"
skipper_ingress_redis_min_replicas: "1"
skipper_ingress_redis_max_replicas: "100"
skipper_ingress_redis_cluster_scaling_schedules: ""
skipper_ingress_redis_hpa_scale_down_wait: "600"

skipper_cluster_ratelimit_max_group_shards: 1

# webhook protection poc
skipper_lua_scripts_enabled: ""
## datadome poc
datadome_api_key: ""
## kasada poc
kasada_api_key: ""

#
# skipper routesrv settings
#
# skipper_routesrv_enabled is a three state switch:
# - "false" - routesrv deployment is removed, skipper uses own k8s dataclient
# - "pre" - routesrv is deployed, skipper uses own k8s dataclient
# - "exec" - routesrv is deployed, skipper uses routesrv
skipper_routesrv_enabled: "exec"
skipper_routesrv_memory: "1Gi"
{{if eq .Cluster.Environment "production"}}
skipper_routesrv_cpu: "1000m"
skipper_routesrv_min_replicas: 2
{{else}}
skipper_routesrv_cpu: "100m"
skipper_routesrv_min_replicas: 1
{{end}}

skipper_routesrv_node_affinity_enabled: "false"
skipper_routesrv_max_replicas: 10
skipper_routesrv_target_average_utilization_cpu: "80"
skipper_routesrv_target_average_utilization_memory: "80"
skipper_ingress_routesrv_scaling_schedules: ""
skipper_routesrv_log_level: "INFO"

# skipper-ingress component pod-deletion-cost-controller
skipper_pod_deletion_cost_controller_memory: 200Mi
skipper_pod_deletion_cost_controller_cpu: 50m
# vpa min values
skipper_pod_deletion_cost_controller_memory_min: 128Mi
skipper_pod_deletion_cost_controller_cpu_min: 25m
# klog style -v=0
skipper_pod_deletion_cost_controller_log_v: "0"
# pod informer sync config
skipper_pod_deletion_cost_controller_poll_interval: "10s"
skipper_pod_deletion_cost_controller_poll_timeout: "60s"
# enable re-sync
skipper_pod_deletion_cost_controller_resync_enable: "true"
skipper_pod_deletion_cost_controller_resync_interval: "1h"

# polarsignals - only enabled for testing teapot
polarsignals_enabled: "false"

# Kube-Metrics-Adapter
## Scheduled scaling metrics: ramp up/down over this period of time
kube_metrics_adapter_default_scaling_window: "10m"
## Scheduled scaling metrics: number of steps to scale. 5 allows at
## least 20% of change between each schedule and enough change to
## trigger the HPA.
kube_metrics_adapter_scaling_schedule_ramp_steps: "5"
## ZMON KairosDB URL
zmon_kairosdb_url: "https://data-service.zmon.zalan.do/kairosdb-proxy"
## Nakadi URL (for the stats API)
nakadi_url: ""

# enable temporary logging of ingress.cluster.local names
# used to find services for which it's being used.
skipper_eastwest_dns_log_enabled: "false"

# if enabled adds port 8080 as svc port to eastwest svc
skipper_ingress_eastwest_additional_port: "false"

# skipper tcp lifo
# See: https://opensource.zalando.com/skipper/operation/operation/#tcp-lifo
skipper_enable_tcp_queue: "true"                    # TODO(sszuecs): cleanup candidate to reduce amount of branches in deployment
skipper_expected_bytes_per_request: "51200"
skipper_max_tcp_listener_concurrency: "-1"
skipper_max_tcp_listener_queue: "-1"

# opentracing
skipper_ingress_opentracing_excluded_proxy_tags: ""
skipper_ingress_opentracing_backend_name_tag: "true"
skipper_opentracing_disable_filter_spans: "true"
# lightstep
skipper_ingress_tracing_buffer: "32768"
skipper_ingress_lightstep_grpc_max_msg_size: 16384000
skipper_ingress_lightstep_min_period: "500ms"
skipper_ingress_lightstep_max_period: "2500ms"
skipper_ingress_lightstep_max_log_key_len: 20
skipper_ingress_lightstep_max_log_value_len: 128
skipper_ingress_lightstep_max_logs_per_span: 20
skipper_ingress_lightstep_propagators: "lightstep"
# set to "log-events" to enable
skipper_ingress_lightstep_log_events: ""
lightstep_token: ""
tracing_collector_host: "tracing.platform-infrastructure.zalan.do"

# skipper_serve_method_metric sets the flag -serve-method-metric. It
# defines if the http method is included in the dimension
# of the skipper_serve_host_duration_seconds_bucket metric.
skipper_serve_method_metric: "false"
# skipper_serve_status_code_metric sets the flag -serve-status-code-metric. It
# defines if the http response status code is included in the dimension
# of the skipper_serve_host_duration_seconds_bucket metric.
skipper_serve_status_code_metric: "false"

# disabled|provisioned|enabled routegroup validation via skipper webhook
# can be one of disabled|provisioned|enabled
routegroups_validation: "enabled"

# disabled|enabled ingress validation via skipper webhook
ingresses_validation: "enabled"

# tokeninfo
{{if eq .Cluster.Environment "production"}}
# production|bridge|disabled
skipper_local_tokeninfo: "production"
{{else}}
# production|bridge|disabled
skipper_local_tokeninfo: "bridge"
{{end}}

# tokeninfo cache
# integer, non-zero value enables tokeninfo cache and sets the maximum number of cached tokens
skipper_tokeninfo_cache_size: 1000
# duration, non-zero value limits the lifetime of a cached tokeninfo
skipper_tokeninfo_cache_ttl: "30s"

# oauth2 UI login - grant flow
{{if eq .Cluster.Environment "e2e"}}
skipper_oauth2_ui_login: "false"
skipper_ingress_encryption_key: ""
{{else}}
skipper_oauth2_ui_login: "true"
{{end}}

# Comma-separated list of tokeninfo keys to retain
skipper_oauth2_ui_login_tokeninfo_keys: ""

# ClusterScalingSchedules
# One or multiple cluster scaling schedules can be configured as a
# comma-separated list of <cluster schedule name>=<target value> pairs.
# E.g. to configure "schedule1" cluster schedule with a target value of 3 and "schedule2" with a target value of 5 set
# skipper_cluster_scaling_schedules: "schedule1=3,schedule2=5"
skipper_cluster_scaling_schedules: ""

# Skipper Ingress/RouteGroup backend traffic split algorithm: traffic-predicate or traffic-segment-predicate
skipper_ingress_backend_traffic_algorithm: "traffic-segment-predicate"

# TODO: after a while we can remove this and hardcode (2023-06-30)
skipper_ingress_default_lb_algorithm: "powerOfRandomNChoices"

skipper_ingress_disable_catchall_routes: "true"

# Original RouteGroup CRD does not require hosts but we explicitly want it specified in our infrastructure
skipper_ingress_routegroup_crd_require_hosts: "true"

# Set defaults values that would enable Open Policy Agent in a skipper filter
skipper_open_policy_agent_enabled: "false"
skipper_open_policy_agent_styra_token: ""

#
# FabricGateway controller config
#
# fabric_gateway_controller_enabled:
#   - false: scales controller to zero replicas
#   - true: runs the controller
#
fabric_gateway_controller_enabled: "true"
fabric_gateway_controller_cpu: "50m"
fabric_gateway_controller_memory: "150Mi"
fabric_gateway_controller_allow_all_filters: "false"
fabric_gateway_controller_snapshots_history_limit: "3"
fabric_gateway_controller_enable_versioning: "true"
fabric_gateway_controller_ssl_policy: ""
fabric_gateway_controller_log_level: "INFO"

# kube-api-server settings

# EventRateLimit admission plugin configuration
# Reference: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#eventratelimit
event_rate_limit_enable: "true"
event_rate_limit_config_qps: "500"
event_rate_limit_config_burst: "1000"

# cadvisor settings
cadvisor_cpu: "150m"
cadvisor_memory: "150Mi"
cadvisor_profiling_enabled: "false"
cadvisor_enabled: "false"

# settings for enabling the kubelet-summary-metrics proxy and prometheus metric
# collection.
kubelet_summary_metrics_enabled: "true"
kubelet_summary_metrics_cpu: "100m"
kubelet_summary_metrics_memory: "512Mi"
kubelet_summary_metrics_hpa_max_replicas: "10"
kubelet_summary_metrics_hpa_cpu_target: "80"
kubelet_summary_metrics_hpa_memory_target: "80"

# enable collection of cadvisor metrics from the kubelet cadvisor endpoint.
kubelet_cadvisor_enabled: "true"

# node exporter settings
node_exporter_cpu: "20m"
node_exporter_memory: "75Mi"
node_exporter_experimental_metrics: "false"

# kube-proxy settings
kube_proxy_cpu: "50m"
kube_proxy_memory_request: "200Mi"
kube_proxy_memory_limit: "200Mi"
kube_proxy_sync_period: "15m0s"
kube_proxy_verbose_level: "2"

# flannel settings
flannel_cpu: "25m"
flannel_memory_request: "100Mi"
flannel_memory_limit: "100Mi"

flannel_awaiter_cpu: "25m"
flannel_awaiter_memory_request: "50Mi"
flannel_awaiter_memory_limit: "50Mi"

# nvidia device plugin
nvidia_device_plugin_cpu: "10m"
nvidia_device_plugin_memory: "50Mi"

# nvidia dcgm exporter
nvidia_dcgm_exporter_enabled: "false"
nvidia_dcgm_exporter_cpu: "10m"
nvidia_dcgm_exporter_memory: "200Mi"

# AWS EFA device plugin
aws_efa_device_plugin_enabled: "false"
aws_efa_device_plugin_cpu: "10m"
aws_efa_device_plugin_memory: "20Mi"

# static egress controller settings
static_egress_controller_enabled: "true"

# journald reader settings
journald_reader_enabled: "true"
journald_reader_cpu: "1m"
journald_reader_memory: "30Mi"

# Logging settings
logging_s3_bucket: "zalando-logging-{{ .Cluster.InfrastructureAccount | getAWSAccountID}}-{{ .Cluster.Region }}"
scalyr_team_token: ""
log_destination_infra: "scalyr/stups"
log_destination_both: "scalyr/main+stups"
log_destination_local: "scalyr/main"

# Central bucket to keep logging infrastructure logs
logging_infrastructure_s3_bucket: ""

vpa_cpu: "200m"
vpa_mem: "500Mi"

prometheus_cpu: "1000m"
prometheus_mem: "4Gi"
prometheus_mem_min: "2Gi"
prometheus_cpu_min: "0"
prometheus_retention_size: "49GB" # one GB less than prometheus' PVC
prometheus_retention_time: "1d"
prometheus_storage_size: "50Gi"

# Upstream defaults are too aggressive:
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
prometheus_remote_write: "disabled"
# Maximum time a sample will wait in buffer.
prometheus_remote_batch_send_deadline: "30s"
# Initial retry delay. Gets doubled for every retry.
prometheus_remote_min_backoff: "3s"
# Maximum retry delay.
prometheus_remote_max_backoff: "10s"

# Comma-separated list of user ids allowed to access Prometheus UI
prometheus_ui_users: ""

metrics_service_cpu: "100m"
metrics_service_mem_max: "4Gi"
metrics_server_metric_resolution: "15s"

kube_aws_iam_controller_cpu: "5m"
kube_aws_iam_controller_mem: "50Mi"

kube_state_metrics_cpu: "100m"
kube_state_metrics_mem_max: "4Gi"
kube_state_metrics_mem_min: "120Mi"

kubernetes_lifecycle_metrics_mem_max: "4Gi"
kubernetes_lifecycle_metrics_mem_min: "120Mi"

kube_node_ready_controller_cpu: "50m"
kube_node_ready_controller_memory: "200Mi"

# Enable deployment of aws-cloud-controller-manager
aws_cloud_controller_manager_enabled: "true"
aws_cloud_controller_manager_cpu: "125m"
aws_cloud_controller_manager_memory: "512Mi"


# Kubernetes Downscaler (for non-production clusters)
{{if eq .Cluster.Environment "test"}}
downscaler_default_uptime: "Mon-Fri 07:30-20:30 Europe/Berlin"
downscaler_default_downtime: "never"
downscaler_enabled: "true"
{{else if eq .Cluster.Environment "e2e"}}
downscaler_default_uptime: "always"
downscaler_default_downtime: "never"
downscaler_enabled: "true"
{{else}}
downscaler_default_uptime: "always"
downscaler_default_downtime: "never"
downscaler_enabled: "false"
{{end}}

# HPA settings (defaults from https://kubernetes.io/docs/reference/command-line-tools-reference/kube-controller-manager/)
horizontal_pod_autoscaler_sync_period: "30s"
horizontal_pod_autoscaler_tolerance: "0.1"
horizontal_pod_downscale_stabilization: "5m0s"

# Vertical pod autoscaler version for controlling roll-out, can be "current" or "legacy"
# current => v1.0.0-internal.20
# legacy => v0.12.0-internal.19
vertical_pod_autoscaler_version: "current"

# Cluster update settings
{{if eq .Cluster.Environment "production"}}
drain_grace_period: "6h"
drain_min_pod_lifetime: "72h"
drain_min_healthy_sibling_lifetime: "1h"
drain_min_unhealthy_sibling_lifetime: "6h"
drain_force_evict_interval: "5m"
node_update_prepare_replacement_node: "true"
{{else}}
drain_grace_period: "2h"
drain_min_pod_lifetime: "8h"
drain_min_healthy_sibling_lifetime: "1h"
drain_min_unhealthy_sibling_lifetime: "1h"
drain_force_evict_interval: "5m"
node_update_prepare_replacement_node: "false" # don't wait for a replacement instance for on-demand pools in test clusters
{{end}}
# add NoSchedule taints to nodes being replaced
decommission_node_no_schedule_taint: "true"

# Teapot admission controller
teapot_admission_controller_default_cpu_request: "25m"
teapot_admission_controller_default_memory_request: "100Mi"
teapot_admission_controller_process_resources: "true"
teapot_admission_controller_process_resource_ephemeral_storage: "false"
teapot_admission_controller_default_ephemeral_storage_request: "25Mi"
teapot_admission_controller_max_ephemeral_storage_request: "5Gi"
teapot_admission_controller_application_min_creation_time: "2019-06-03T12:00:00Z"
teapot_admission_controller_ndots: "2"
teapot_admission_controller_inject_environment_variables: "true"
{{ if eq .Cluster.Environment "test" }}
teapot_admission_controller_inject_node_options_environment_variable: "true"
{{ else }}
teapot_admission_controller_inject_node_options_environment_variable: "false"
{{ end }}
teapot_admission_controller_deployment_default_max_surge: "5%"
teapot_admission_controller_deployment_default_max_unavailable: "1"
teapot_admission_controller_inject_aws_waiter: "true"
teapot_admission_controller_parent_resource_hash: "true"

## Defaults are set per-cluster
teapot_admission_controller_check_daemonset_resources: "true"
teapot_admission_controller_daemonset_reserved_cpu: "8"
teapot_admission_controller_daemonset_reserved_memory: "64Gi"

kubelet_system_reserved_cpu: "100m"
kubelet_system_reserved_memory: "164Mi"
kubelet_kube_reserved_cpu: "100m"
kubelet_kube_reserved_memory: "282Mi"

kubelet_image_gc_high_threshold: 50
kubelet_image_gc_low_threshold: 40

{{if eq .Cluster.Environment "production"}}
teapot_admission_controller_validate_application_label: "true"
teapot_admission_controller_validate_base_images: "true"

# Check container image compliance in production clusters. Be careful when thinking about changing this: Setting it to
# false will allow any container image to run in production clusters.
#
# If you are seeing issues with "docker-meta" check the next config field to allow-list certain namespaces.
teapot_admission_controller_validate_pod_images: "true"

# If you are seeing issues with the container image compliance checker dependency "docker-meta" you can designate
# a subset of namespaces to be allowed regardless with a regular expression on the namespace, e.g.:
#
# if docker-meta is down, do not reject container images running in `kube-system`
# teapot_admission_controller_validate_pod_images_soft_fail_namespaces: "^kube-system$"

teapot_admission_controller_validate_pod_template_resources: "true"
teapot_admission_controller_preemption_enabled: "true"
teapot_admission_controller_postgresql_delete_protection_enabled: "true"
teapot_admission_controller_namespace_delete_protection_enabled: "true"
{{else if eq .Cluster.Environment "e2e"}}
teapot_admission_controller_validate_application_label: "false"
teapot_admission_controller_validate_base_images: "false"

# Check container image compliance in e2e clusters. There are some exceptions to allow the e2e test suite to run.
teapot_admission_controller_validate_pod_images: "true"

teapot_admission_controller_validate_pod_template_resources: "false"
teapot_admission_controller_preemption_enabled: "true"
teapot_admission_controller_postgresql_delete_protection_enabled: "false"
teapot_admission_controller_namespace_delete_protection_enabled: "false"
{{else}}
teapot_admission_controller_validate_application_label: "false"
teapot_admission_controller_validate_base_images: "false"

# Do not check container image compliance in test clusters.
teapot_admission_controller_validate_pod_images: "false"

teapot_admission_controller_validate_pod_template_resources: "true"
teapot_admission_controller_preemption_enabled: "false"
teapot_admission_controller_postgresql_delete_protection_enabled: "false"
teapot_admission_controller_namespace_delete_protection_enabled: "false"
{{end}}

# Enable automatic replacement of vanity images with ECR images
teapot_admission_controller_resolve_vanity_images: "true"

{{if eq .Cluster.Environment "e2e"}}
teapot_admission_controller_ignore_namespaces: "^kube-system|((downward-api|kubectl|projected|statefulset|pod-network|scope-selectors|resourcequota|limitrange|sysctl|node-tests|e2e-kubelet-etc-hosts|csiinlinevolumes|job|dns)-.*)$"
teapot_admission_controller_crd_ensure_no_resources_on_delete: "false"
{{else}}
teapot_admission_controller_ignore_namespaces: "^kube-system$"
teapot_admission_controller_crd_ensure_no_resources_on_delete: "true"
{{end}}

# Enable kube-node-ready-controller and node-not-ready taint
teapot_admission_controller_node_not_ready_taint: "true"

# Some third-party controllers use API groups that look like they belong to Kubernetes resources. Explicitly allow them anyway.
teapot_admission_controller_crd_role_provisioning_allowed_api_groups: "flink.k8s.io"

teapot_admission_controller_topology_spread: optin
teapot_admission_controller_topology_spread_timeout: 7m


# Enable and configure runtime-policy annotation
{{if eq .Cluster.Environment "production"}}
teapot_admission_controller_runtime_policy_default: "require-on-demand"
{{else}}
teapot_admission_controller_runtime_policy_default: "allow-spot"
{{end}}
# Enforce a certain policy (<empty>|allow-spot|require-on-demand) for a cluster,
# leave empty for falling back to the default.
teapot_admission_controller_runtime_policy_enforced: ""
# Enable hard spot assignment. Only relevant when node_lifecycle_provider=zalando
teapot_admission_controller_runtime_policy_spot_hard_assignment: "false"
# Enable experimental rescheduling of spot nodes after spot decommission
spot_node_rescheduler: "false"
spot_node_rescheduler_memory: "348Mi"
spot_node_rescheduler_cpu: "50m"

# Enable and configure prevent scale down annotation
teapot_admission_controller_prevent_scale_down_enabled: "true"
{{if eq .Cluster.Environment "production"}}
teapot_admission_controller_prevent_scale_down_allowed: "true"
{{else}}
teapot_admission_controller_prevent_scale_down_allowed: "false"
{{end}}

teapot_admission_controller_log4j_format_msg_no_lookups: "true"

teapot_admission_controller_graceful_termination: "true"

# toggle that prevents active configmaps from being deleted
{{if eq .Cluster.Environment "e2e"}}
# allow deletion of active configmaps in e2e clusters because some tests rely on it
teapot_admission_controller_configmap_deletion_protection_enabled: "false"
teapot_admission_controller_configmap_deletion_protection_factories_enabled: "false"
{{else}}
# prevent deletion of active configmaps in all test and production clusters
teapot_admission_controller_configmap_deletion_protection_enabled: "true"
teapot_admission_controller_configmap_deletion_protection_factories_enabled: "true"
{{end}}

# enable the rolebinding admission-controller webhook which validates rolebindings and clusterrolebindings
teapot_admission_controller_enable_rolebinding_webhook: "true"

# enable the generic deny-all admission webhook which rejects all requests it receives
teapot_admission_controller_enable_write_protection_webhook: "false"
# configure the behaviour of the deny-all admission webhook, `true` blocks everything, `false` allows everything
teapot_admission_controller_prevent_write_operations: "false"

# Enable and configure Pod Security Policy rules implemented in admission-controller.
teapot_admission_controller_pod_security_policy_enabled: "true"

# comma separated list of service accounts that are allowed to use privileged
# pod security policy rules. Format: `<namespace>_<service-account-name>`
{{ if eq .Cluster.Environment "e2e" }}
teapot_admission_controller_pod_security_policy_privileged_service_accounts: "psp-privileged-zalando_privileged-sa,psp-privileged-deployment-zalando_privileged-sa"
{{ else }}
teapot_admission_controller_pod_security_policy_privileged_service_accounts: ""
{{ end }}
teapot_admission_controller_pod_security_policy_privileged_allow_privilege_escalation: "false"

# comma separated list of restricted or privileged capabilities that are allowed
# to be used by pods.
teapot_admission_controller_pod_security_policy_additional_restricted_capabilities: ""
teapot_admission_controller_pod_security_policy_privileged_capabilities: ""

# Prevent the use of a particular AZ as much as possible
blocked_availability_zone: ""

# etcd cluster
etcd_stack_name: "etcd-cluster-etcd"
# comma separated list of DNS record prefixes which will be prefixed to the
# hosted zone of the account. We allow multiple prefixes for the purpose of
# migration.
etcd_dns_record_prefixes: "etcd-server.{{.Cluster.Region}}"

{{if eq .Cluster.Environment "production"}}
etcd_instance_count: "5"
etcd_instance_type: "m5.large"
{{else}}
etcd_instance_count: "3"
etcd_instance_type: "t3.medium"
{{end}}

etcd_scalyr_key: ""

etcd_ami: {{ amiID "zalando-ubuntu-etcd-production-v3.5.13-amd64-main-34" "861068367966"}}

cluster_dns: "coredns"
coredns_log_svc_names: "true"
coredns_log_forward: "false"
# max concurrency for upstream (AWS VPC) DNS server
#
# AWS VPC DNS server has a limit of 1024 qps before packets are dropped.
# This setting is tuned to allow a buffer compared to the normal DNS QPS in our
# clusters and prevent CoreDNS from running out of memory in case of spikes.
coredns_max_upstream_concurrency: 2000 # 0 means there is no concurrency limits


tracing_coredns_route_traces_to_local_zone: "false"
tracing_coredns_global_traces_endpoint: ""
tracing_coredns_local_zone_traces_endpoint: ""

# Kubernetes on Ubuntu AMI to use
# note this configuration uses the [amiID][0] function. It returns the
# AMI id given the image name and the Image AWS account owner.
#
# [0]: https://github.com/zalando-incubator/cluster-lifecycle-manager/blob/8a9bd1cb2d094038a9e23e646421f8146b48886a/provisioner/template.go#L116
kuberuntu_image_v1_31_jammy_amd64: {{ amiID "zalando-ubuntu-jammy-22.04-kubernetes-production-v1.31.4-amd64-master-359" "861068367966" }}
kuberuntu_image_v1_31_jammy_arm64: {{ amiID "zalando-ubuntu-jammy-22.04-kubernetes-production-v1.31.4-arm64-master-359" "861068367966" }}

# Which distro from the previous config items should be used. Valid options are only `jammy` for now. Can be set for each node pool.
kuberuntu_distro_master: "jammy"
kuberuntu_distro_worker: "jammy"

# Feature toggle for auditing events
audit_pod_events: "true"
# Whether to log audit event for node changes by kube-controller-manager. Useful
# to see if kube-controller-manager is setting overlapping podCIDRs.
audit_kube_controller_manager_node_changes: "false"
{{if eq .Cluster.Environment "production"}}
audittrail_url: "https://audittrail.cloud.zalando.com"
{{else}}
audittrail_url: ""
{{end}}
audittrail_nakadi_url: ""
audittrail_root_account_role: ""

audittrail_adapter_cpu: "50m"
audittrail_adapter_memory: "200Mi"

audittrail_adapter_timeout: "2s"

# When enabled, any read-only events are added to the metrics, but are dropped
# before being sent to audittrail-api.
# When this is set to true, `auditlog_read_access` can safely be set true
# without overloading audittrail-api.
audittrail_adapter_drop_audittrail_api_read_only: "true"

audit_webhook_batch_max_size: "250"

kube2iam_cpu: "25m"
kube2iam_memory: "100Mi"

# configure whether kube2iam should only run on worker nodes.
kube2iam_worker_only: "true"

# CIDR configuration for nodes and pods
# Changing this will change the number of nodes and pods we can schedule in the
# cluster: https://cloud.google.com/kubernetes-engine/docs/how-to/flexible-pod-cidr
{{if eq .Cluster.Environment "production"}}
node_cidr_mask_size: "25"
{{else}}
node_cidr_mask_size: "24"
{{end}}
# How many nodes to keep reserved (e.g. to allow for increasing the node_cidr_mask_size).
# Note that this only affects CA settings, someone can still scale up the ASGs manually.
reserved_nodes: "5"

# How much extra capacity to add when calculating the maximum number of pods per node. This can be increased if some
# pods don't consume the IP space on the node, but it's fairly dangerous since it has to be absolutely correct. Use
# in emergencies only, and pay extra attention when adding, removing or updating daemonsets.
node_max_pods_extra_capacity: "0"

# maximum number of PIDs allowed to be allocated per pod
pod_max_pids: "4096"

# the cpu management policy which should be used by the kubelet
cpu_manager_policy: "none"

# sysctl names allowed to be used in security policies, comma-separated
allowed_unsafe_sysctls: "net.ipv4.tcp_keepalive_time,net.ipv4.tcp_keepalive_intvl,net.ipv4.tcp_keepalive_probes,net.ipv4.tcp_syn_retries,net.ipv4.tcp_retries2"

# the maximum amount of memory for EBS CSI controller's sidecars
ebs_csi_controller_sidecar_memory: "80Mi"
ebs_csi_controller_sidecar_cpu: "10m"

# pull images in parallel
serialize_image_pulls: "false"

# rate of image pull in the kubelet, see
# see https://github.com/kubernetes/kubernetes/blob/v1.31.0/staging/src/k8s.io/kubelet/config/v1beta1/types.go#L200-L212
#
# registryPullQPS is the limit of registry pulls per second.
# The value must not be a negative number.
# Setting it to 0 means no limit.
# Default: 5
kubelet_registry_pull_qps : "20"

# registryBurst is the maximum size of bursty pulls, temporarily allows
# pulls to burst to this number, while still not exceeding registryPullQPS.
# The value must not be a negative number.
# Only used if registryPullQPS is greater than 0.
# Default: 10
kubelet_registry_burst: "40"

# Version of the scheduler or controller-manager used by the master nodes.
# Supported values:
#  - upstream: official Kubernetes version
#  - zalando:  internal Zalando build with our custom patches
kubernetes_scheduler_image: "zalando"
kubernetes_controller_manager_image: "upstream"

# when set to true, service account tokens can be used from outside the cluster
allow_external_service_accounts: "false"

# issue tokens with a long expiration time in order to detect applications that don't refresh tokens.
rotate_service_account_tokens_extended_expiration: "true"

# Comma separated list of dimensions to include in the Prometheus metrics
# exposed by audittrail-adapter.
# Adding more dimensions can help detect which clients are calling the
# Kubernetes API. Examples:
# * Detect clients not rotating service tokens: `authentication_stale_token` metric.
# * Detect clients with high load on the API Server for certain calls.
# * Detect client calling old api_version/api_groups for resources.
#
# Adding more dimensions has the negative effect that it produces more
# Prometheus data, so it's not intended to be enabled ALL THE TIME, but can be
# enabled when needed to answer one of the above questions or similar.
#
# The possible dimensions are:
# authorization_decision,authentication_stale_token,user,user_agent,verb,code,resource,api_group,api_version
auditlog_metric_dimensions: "authorization_decision"

# enable auditlogging of read access to identify service accounts reading from
# the api.
auditlog_read_access: "false"

# allow ssh access for internal VPC IPs only
ssh_vpc_only: "true"

# configure custom dns zone
custom_dns_zone: "" # zone name e.g. example.org
custom_dns_zone_nameservers: "" # space separated list of nameserver IP addresses

# prefix prepended to ownership TXT records for external-dns
external_dns_ownership_prefix: "_external-dns."
# domains that should be included by ExternalDNS ("" includes all hosted zones in the account. Separate multiple domains with a comma)
external_dns_domain_filter: ""
# domains that should be ignored by ExternalDNS
external_dns_excluded_domains: cluster.local
# synchronization policy between Kubernetes and AWS Route53 (default: sync, options: sync, upsert-only, create-only)
external_dns_policy: sync
# the duration for how long to cache the list of hosted zones in memory
external_dns_zones_cache_duration: "1h"

# resource configuration
external_dns_mem: "4Gi"

# select which cache to use for Cluster DNS: unbound or dnsmasq.
dns_cache: "dnsmasq"

expirimental_dns_unbound_liveness_probe: "true"

# DNS container resources
dns_dnsmasq_cpu: "100m"
dns_dnsmasq_mem: "50Mi"
dns_dnsmasq_sidecar_cpu: "10m"
dns_dnsmasq_sidecar_mem: "45Mi"
dns_unbound_cpu: "100m"
dns_unbound_mem: "50Mi"
dns_unbound_telemetry_cpu: "10m"
dns_unbound_telemetry_mem: "45Mi"
dns_coredns_cpu: "50m"
dns_coredns_mem: "100Mi"

# special roles for test/pet clusters
{{if eq .Cluster.Environment "e2e"}}
collaborator_administrator_access: "true"
{{else}}
collaborator_administrator_access: "false"
{{end}}

node_auth_rate_limit: "5.0"

# enable legacy serviceaccounts for smooth RBAC migration
enable_operator_sa: "false"
enable_default_sa: "false"

# virtual memory configuration
vm_dirty_background_bytes: "67108864"
vm_dirty_bytes: "134217728"

# Option to Enable FeatureGate TopologyAwareHints
enable_topology_aware_hints: "false"

# Enable FeatureGate HPAScaleToZero
enable_hpa_scale_to_zero: "true"

# Enable FeatureGate MaxUnavailableStatefulSet
max_unavailable_statefulset_enabled: "false"

# enable encryption of secrets in etcd
# this flag can be switched between true and false
# to ensure all secrets are encrypted/decrypted all secrets need to be rewritten after masters have been rolled
enable_encryption: "true"

# enable non-default profiles for the default scheduler
enable_scheduler_profiles: "true"

# default ttl for kube janitor for resources build from PRs
kube_janitor_default_pr_ttl: "1w"  # 1 week
# enable cleanup of pr resources other than namespaces
kube_janitor_cleanup_pr_resources: "true"

# opt-in deletion of unused PVCs
kube_janitor_default_unused_pvc_ttl: "forever"

# deletes all resources in the cluster that rely on a vpc
# necessary to change the VPC subnet of a cluster
delete_vpc_resources: "false"
# replacement strategy used for default on-demand worker pool
on_demand_worker_replacement_strategy: none

# SpotAllocationStrategy for pools
spot_allocation_strategy: "capacity-optimized"

# Stackset controller
stackset_controller_sync_interval: "10s"
stackset_controller_mem_min: "120Mi"
stackset_controller_mem_max: "4Gi"

# EBS settings for the root volume
ebs_master_root_volume_size: "50"
ebs_root_volume_size: "100"
ebs_root_volume_delete_on_termination: "true"

# Priority class used for critical system pods
system_priority_class: "cluster-critical-nonpreempting"

# configuration for the PDB controller
{{if eq .Cluster.Environment "test" }}
pdb_controller_non_ready_ttl: "1h"
{{else}}
pdb_controller_non_ready_ttl: ""
{{end}}
pdb_controller_max_unavailable: "1%"

# Log Kubernetes events to Scalyr
kubernetes_event_logger_enabled: "true"
event_logger_mem_min: "100Mi"
event_logger_cpu_min: "10m"

# enable/disable routegroup support for stackset
stackset_routegroup_support_enabled: "true"

# enable/disable inline configmap support for stackset
stackset_inline_configmap_support_enabled: "false"

# enable/disable platformCredentialsSet support for stackset
stackset_pcs_support_enabled: "true"

# Enable/Disable profiling for Kubernetes components
enable_control_plane_profiling: "false"

okta_auth_issuer_url: ""
okta_auth_client_id: "kubernetes.cluster.{{.Cluster.Alias}}"

# Deploy
# This section contains config items to enable and disable the the
# permission for the Role {{.Cluster.LocalID}}-deployment. It allows
# CDP to deploy resources of the specified types.
deploy_allow_lakeformation: "false"
deploy_allow_ram: "false"

experimental_nlb_alpn_h2_enabled: "true"

# Enable/Disable ExecProbeTimeout (default in v1.20)
# Can be disabled in case some workloads depends on the old behavior.
exec_probe_timeout_enabled: "true"

# Settings for the deployment service
deployment_service_controller_cpu: "100m"
deployment_service_controller_memory: "1Gi"
deployment_service_api_role_arn: ""
deployment_service_tokeninfo_url: ""
deployment_service_lightstep_token: ""
deployment_service_ml_experiments_enabled: "true"
deployment_service_cf_auto_expand_enabled: "false"
deployment_service_cf_update_source_branch_changes: "true"
deployment_service_executor_cdp_permissions: "false"
deployment_service_skip_mustache_rendering: "true"
{{- if eq .Cluster.Environment "test" }}
# disable CF update of source branch changes in test to avoid updating CF stacks
# on any PR.
deployment_service_cf_update_source_branch_changes: "false"
{{- end }}


# Will be dropped after the migration
deployment_service_enabled: "true"

# opentelemetry config
observability_collector_endpoint: "tracing.platform-infrastructure.zalan.do"
observability_collector_port: "8443"
observability_collector_scheme: "https"
observability_metrics_endpoint: "ingest.lightstep.com"
observability_metrics_port: "443"

# labels whitelisted to kube-state-metrics
observability_metrics_pods_labels: "application,component,version,environment,stack_name,stack_version,application_id,application_version,team,job-name"

observability_metrics_ingresses_labels: ""

observability_metrics_pods_annotations: "zalando.org/zmon-job-metric-stored"

# opentelemetry collector
observability_otel_collector_enabled: "true"

# list of comma separated buckets which are accessible by zmon
zmon_accessible_s3_buckets: ""

# disable zmon-appliance worker tracking in Prometheus
disable_zmon_appliance_worker_tracking: "true"

# Add ClusterRole for clusters required by hyped-article-lifecycle-management controller
hyped_article_lifecycle_management: "false"

# Add ClusterRole for clusters required by business-partner and config-provider controller
business_partner_service: "false"
config_provider_service: "false"

# enable SizeMemoryBackedVolumes feature flag
enable_size_memory_backed_volumes: "true"

# enable StatefulSetAutoDeletePVC feature flag
# https://kubernetes.io/blog/2021/12/16/kubernetes-1-23-statefulset-pvc-auto-deletion/
enable_statefulset_autodelete_pvc: "true"

# Allow configuring ingress rules on the Worker security group. This is used for
# cases where a service needs a Service Type Load Balancer with an NLB and
# require the port and optional CIDR range to be added to the worker node
# Security groups.
# The format is [<protocol>:]<cidr>:<port>[-<port>],
# Example:
#
# "10.0.0.0/8:4180-4181,0.0.0.0/0:4190,udp:0.0.0.0/0:53" would result in the ingress ranges:
#
#
# [
#
#   {
#     CIDR: "10.0.0.0/8",
#     FromPort: 4180,
#     ToPort: 4181,
#     Protocol: "tcp",
#   },
#   {
#     CIDR: "0.0.0.0/0",
#     FromPort: 4190,
#     ToPort: 4190,
#     Protocol: "tcp",
#   },
#   {
#     CIDR: "0.0.0.0/0",
#     FromPort: 53,
#     ToPort: 53,
#     Protocol: "udp",
#   },
# ]
#
# Source for the template function: sgIngressRanges: https://github.com/zalando-incubator/cluster-lifecycle-manager/blob/42695865a251fef58e22ce612d6549e75fa5d103/provisioner/template.go#L336-L417
open_sg_ingress_ranges: ""

# Each subdomain can reach a max of 63 bytes on Route53
# This custom value sets the subdomain max allowed length taking into consideration the 'cname-' prefix added by external-dns
subdomain_max_length: "57"

# Network monitoring
network_monitoring_enabled: "false"
network_monitoring_daemonset_cpu: "40m"
network_monitoring_daemonset_memory: "100Mi"
network_monitoring_check_api_server_direct: "false"
network_monitoring_check_api_server_dns: "false"
network_monitoring_check_kubenurse_service: "false"
network_monitoring_check_kubenurse_ingress: "false"
network_monitoring_check_neighborhood: "true"
network_monitoring_check_unschedulable_nodes: "true"
network_monitoring_check_interval: "1m"
network_monitoring_separate_prometheus: "false"

# Enable/Disable the s3-csi-driver daemonset:
# https://github.com/awslabs/mountpoint-s3-csi-driver
s3_csi_driver: "false"
# When set to true this will add a node-selector `s3-csi-driver: enabled` to the
# s3-csi-driver daemonset such that only node/node-pools with that label will
# have the driver running. This allows enabling in a subset of a cluster for
# testing.
s3_csi_driver_node_selector_restricted: "true"


# Percent of master node instance memory to allocate to the kube-apiserver
# container. If this value is non-zero it will set the memory limit for the
# kube-apiserver container in the kube-apiserver pod.
# Must be a whole number between 0-100.
apiserver_memory_limit_percent: "80"

apiserver_max_requests_inflight: "400"

# enable graceful shutdown on the control_plane nodes
control_plane_graceful_shutdown: "true"

# Optionally enable an internal load balancer for the control plane nodes
# additionally to the public load balancer.
#
# Possible values:
# none - Load Balancer is not created
# pre - Load Balancer is created but not attached to control plane nodes
# serving - Load Balancer is created and attached to control plane nodes.
# active - Load Balancer is being actively used by worker nodes.
#
# For rolling back it needs to be done in multiple stages: active -> serving -> pre -> none
control_plane_load_balancer_internal: "none"

# This allows setting custom sysctl settings. The config-item is intended to be
# used on node-pools rather being set globally.
#
# The value is a comma seprated configuration of `sysctl_setting=value` as
# illustrated in the example below.
#
# Example:
#
#   sysctl_settings: "fs.aio-max-nr=8388608,fs.inotify.max_user_watches=100000"
#
# Which translates to a file on the node:
#
#  cat /etc/sysctl.d/99-custom-sysctl-settings.conf
#  fs.aio-max-nr = 8388608
#  fs.inotify.max_user_watches = 100000
sysctl_settings: ""

# kube-janitor configuration
{{if eq .Cluster.Environment "production"}}
# This makes kube-janitor opt-in for production clusters

# IMPORTANT:
# Please note that before enabling kube-janitor for a production cluster, you
# must ensure that no existing resources should be annotated with a TTL.
# This can happen in the case where a test deployment is deployed to production
# as is. Currently, it's a no-op since kube-janitor doesn't run in production.
#
# This is needed until we can implement namespace prefix matching to reduce
# the scope of kube-janitor to a set of namespace names that aren't known
# at the time of enaling kube-janitor. Once the feature is in place, it would
# be easier to limit the scope.
kube_janitor_enabled: "false"
{{else}}
kube_janitor_enabled: "true"
{{end}}

# scheduling_controls
teapot_admission_controller_scheduling_controls_enabled: "false"
teapot_admission_controller_scheduling_controls_default_architecture: "amd64"

# role-sync-controller configs
# Enabled by default only on Zalando EKS clusters
{{ if eq .Cluster.Provider "zalando-eks" }}
role_sync_controller_enabled: "true"
{{ else }}
role_sync_controller_enabled: "false"
{{ end }}