diff --git a/charts/linkerd2-cni/README.md b/charts/linkerd2-cni/README.md index 30c6e4187a4e2..d4a44b2bda79e 100644 --- a/charts/linkerd2-cni/README.md +++ b/charts/linkerd2-cni/README.md @@ -31,7 +31,7 @@ Kubernetes: `>=1.21.0-0` | ignoreOutboundPorts | string | `""` | Default set of outbound ports to skip via iptables | | image.name | string | `"cr.l5d.io/linkerd/cni-plugin"` | Docker image for the CNI plugin | | image.pullPolicy | string | `"IfNotPresent"` | Pull policy for the linkerd-cni container | -| image.version | string | `"v1.2.2"` | Tag for the CNI container Docker image | +| image.version | string | `"v1.3.0"` | Tag for the CNI container Docker image | | imagePullSecrets | list | `[]` | | | inboundProxyPort | int | `4143` | Inbound port for the proxy container | | logLevel | string | `"info"` | Log level for the CNI plugin | @@ -43,7 +43,17 @@ Kubernetes: `>=1.21.0-0` | proxyAdminPort | int | `4191` | Admin port for the proxy container | | proxyControlPort | int | `4190` | Control port for the proxy container | | proxyUID | int | `2102` | User id under which the proxy shall be ran | -| resources | object | `{"cpu":{"limit":"","request":""},"ephemeral-storage":{"limit":"","request":""},"memory":{"limit":"","request":""}}` | Resource requests and limits for linkerd-cni daemonset containers | +| repairController.enableSecurityContext | bool | `true` | Include a securityContext in the repair-controller container | +| repairController.enabled | bool | `false` | Enables the repair-controller container | +| repairController.logFormat | string | plain | Log format (`plain` or `json`) for the repair-controller container | +| repairController.logLevel | string | info | Log level for the repair-controller container | +| repairController.resources.cpu.limit | string | `""` | Maximum amount of CPU units that the repair-controller container can use | +| repairController.resources.cpu.request | string | `""` | Amount of CPU units that the repair-controller container 
{{- /* repair-controller container of the linkerd-cni DaemonSet; rendered only when repairController.enabled is true. */ -}}
      {{- if .Values.repairController.enabled }}
      # This container watches over pods whose linkerd-network-validator
      # container failed, probably because of a race condition while setting up
      # the CNI plugin chain, and deletes those pods so they can try acquiring a
      # proper network config again
      - name: repair-controller
        image: {{ .Values.image.name -}}:{{- .Values.image.version }}
        imagePullPolicy: {{ .Values.image.pullPolicy }}
        # env, command, args, probes and ports are always rendered: the
        # controller needs them regardless of enableSecurityContext
        env:
        - name: LINKERD_CNI_REPAIR_CONTROLLER_NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: LINKERD_CNI_REPAIR_CONTROLLER_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        command:
        - /usr/lib/linkerd/linkerd-cni-repair-controller
        args:
        - --admin-addr=0.0.0.0:9990
        - --log-format
        - {{ .Values.repairController.logFormat }}
        - --log-level
        - {{ .Values.repairController.logLevel }}
        livenessProbe:
          httpGet:
            path: /live
            port: admin-http
        readinessProbe:
          failureThreshold: 7
          httpGet:
            path: /ready
            port: admin-http
          initialDelaySeconds: 10
        ports:
        - containerPort: 9990
          name: admin-http
        # Only the securityContext stanza is optional
        {{- if .Values.repairController.enableSecurityContext }}
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          privileged: false
          readOnlyRootFilesystem: true
          seccompProfile:
            type: RuntimeDefault
        {{- end }}
        # Use the repair-controller's own resource settings, not the ones of
        # the install-cni container (.Values.resources)
        {{- if .Values.repairController.resources }}
        {{- include "partials.resources" .Values.repairController.resources | nindent 8 }}
        {{- end }}
      {{- end }}
the daemonset extraInitContainers: [] -# - name: wait-for-other-cni -# image: busybox:1.33 -# command: -# - /bin/sh -# - -xc -# - | -# for i in $(seq 1 180); do -# test -f /host/etc/cni/net.d/10-aws.conflist && exit 0 -# sleep 1 -# done -# exit 1 -# volumeMounts: -# - mountPath: /host/etc/cni/net.d -# name: cni-net-dir -# -- Resource requests and limits for linkerd-cni daemonset containers +# The cni-repair-controller scans pods in each node to find those that have +# been injected by linkerd, and whose linkerd-network-validator container has +# failed. This is usually caused by a race between linkerd-cni and the CNI +# plugin used in the cluster. This controller deletes those failed pods so they +# can restart and retry acquiring a proper network config. +repairController: + # -- Enables the repair-controller container + enabled: false + + # -- Log level for the repair-controller container + # @default -- info + logLevel: info + # -- Log format (`plain` or `json`) for the repair-controller container + # @default -- plain + logFormat: plain + + # -- Include a securityContext in the repair-controller container + enableSecurityContext: true + + resources: + cpu: + # -- Maximum amount of CPU units that the repair-controller container can use + limit: "" + # -- Amount of CPU units that the repair-controller container requests + request: "" + memory: + # -- Maximum amount of memory that the repair-controller container can use + limit: "" + # -- Amount of memory that the repair-controller container requests + request: "" + ephemeral-storage: + # -- Maximum amount of ephemeral storage that the repair-controller container can use + limit: "" + # -- Amount of ephemeral storage that the repair-controller container requests + request: "" + +# -- Resource requests and limits for linkerd-cni daemonset container resources: cpu: # -- Maximum amount of CPU units that the cni container can use diff --git a/cli/cmd/testdata/install_cni_helm_default_output.golden
b/cli/cmd/testdata/install_cni_helm_default_output.golden index 6bd305f0668af..566534f1efea5 100644 --- a/cli/cmd/testdata/install_cni_helm_default_output.golden +++ b/cli/cmd/testdata/install_cni_helm_default_output.golden @@ -111,7 +111,7 @@ spec: # script copies the files into place and then sleeps so # that Kubernetes doesn't keep trying to restart it. - name: install-cni - image: cr.l5d.io/linkerd/cni-plugin:v1.2.2 + image: cr.l5d.io/linkerd/cni-plugin:v1.3.0 imagePullPolicy: IfNotPresent env: - name: DEST_CNI_NET_DIR diff --git a/pkg/charts/cni/values.go b/pkg/charts/cni/values.go index 6c04d776404e3..46e29d23d567e 100644 --- a/pkg/charts/cni/values.go +++ b/pkg/charts/cni/values.go @@ -35,6 +35,15 @@ type Resources struct { EphemeralStorage Constraints `json:"ephemeral-storage"` } +// RepairController contains the config for the repair-controller container +type RepairController struct { + Image Image `json:"image"` + LogLevel string `json:"logLevel"` + LogFormat string `json:"logFormat"` + EnableSecurityContext bool `json:"enableSecurityContext"` + Resources Resources `json:"resources"` +} + // Values contains the top-level elements in the cni Helm chart type Values struct { InboundProxyPort uint `json:"inboundProxyPort"` @@ -60,6 +69,7 @@ type Values struct { EnablePSP bool `json:"enablePSP"` Privileged bool `json:"privileged"` Resources Resources `json:"resources"` + RepairController RepairController `json:"repairController"` } // NewValues returns a new instance of the Values type.