health-checker not working as expected for containerd #683
Comments
Chiming in here: this is definitely broken, even in the 0.8.12 release...
Adding crictl to the Docker image works for me, i.e.:
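A minimal Dockerfile sketch; the builder stage, registry path, and crictl version below are assumptions, not necessarily what I used:

# Fetch crictl in a throwaway stage so the NPD base image doesn't need tar or curl
FROM alpine:3.16 AS crictl
ARG CRICTL_VERSION=v1.24.2
ADD https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz /tmp/
RUN tar -C /usr/local/bin -xzf /tmp/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz

# Layer crictl onto the stock node-problem-detector image; the install path must
# match wherever the health-checker is configured to find crictl
FROM registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.12
COPY --from=crictl /usr/local/bin/crictl /usr/local/bin/crictl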
Then adding the volume mounts (we use the Helm chart). values.yaml:
settings:
# Custom monitor definitions to add to Node Problem Detector - to be
# mounted at /custom-config. These are in addition to pre-packaged monitor
# definitions provided within the default docker image available at /config:
# https://github.com/kubernetes/node-problem-detector/tree/master/config
# settings.custom_monitor_definitions -- Custom plugin monitor config files
custom_monitor_definitions:
health-checker-containerd.json: | # https://github.com/kubernetes/node-problem-detector/blob/1e8008bdedbeae39074c93cfe3fcdad7735f4db1/config/health-checker-containerd.json
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "ContainerRuntimeUnhealthy",
"reason": "ContainerRuntimeIsHealthy",
"message": "Container runtime on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "ContainerRuntimeUnhealthy",
"reason": "ContainerdUnhealthy",
"path": "/home/kubernetes/bin/health-checker",
"args": [
"--component=cri",
"--enable-repair=false",
"--cooldown-time=2m",
"--health-check-timeout=60s"
],
"timeout": "3m"
}
]
}
health-checker-kubelet.json: | # https://github.com/kubernetes/node-problem-detector/blob/1e8008bdedbeae39074c93cfe3fcdad7735f4db1/config/health-checker-kubelet.json
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "KubeletUnhealthy",
"reason": "KubeletIsHealthy",
"message": "kubelet on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "KubeletUnhealthy",
"reason": "KubeletUnhealthy",
"path": "/home/kubernetes/bin/health-checker",
"args": [
"--component=kubelet",
"--enable-repair=false",
"--cooldown-time=1m",
"--health-check-timeout=10s"
],
"timeout": "3m"
}
]
}
# docker-monitor-filelog.json: |
# {
# "plugin": "filelog",
# "pluginConfig": {
# "timestamp": "^time=\"(\\S*)\"",
# "message": "msg=\"([^\n]*)\"",
# "timestampFormat": "2006-01-02T15:04:05.999999999-07:00"
# },
# "logPath": "/var/log/docker.log",
# "lookback": "5m",
# "bufferSize": 10,
# "source": "docker-monitor",
# "conditions": [],
# "rules": [
# {
# "type": "temporary",
# "reason": "CorruptDockerImage",
# "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
# }
# ]
# }
# settings.log_monitors -- User-specified custom monitor definitions
log_monitors:
- /config/kernel-monitor.json
# An example of activating a custom log monitor definition in
# Node Problem Detector
# - /custom-config/docker-monitor-filelog.json
custom_plugin_monitors:
- /custom-config/health-checker-kubelet.json
- /custom-config/health-checker-containerd.json
# settings.prometheus_address -- Prometheus exporter address
prometheus_address: 0.0.0.0
# settings.prometheus_port -- Prometheus exporter port
prometheus_port: 20257 # update prometheus.io/port below
# The period at which k8s-exporter does forcibly sync with apiserver
# settings.heartBeatPeriod -- Syncing interval with API server
heartBeatPeriod: 5m0s
logDir:
# logDir.host -- log directory on k8s host
host: /var/log/
# logDir.pod -- log directory in pod (volume mount), use logDir.host if empty
pod: ""
image:
repository: <our_repo>/node-problem-detector/node-problem-detector
tag: v0.8.12-modified
pullPolicy: IfNotPresent
imagePullSecrets: []
nameOverride: "node-problem-detector"
fullnameOverride: "node-problem-detector"
rbac:
create: true
pspEnabled: false
# hostNetwork -- Run pod on host network
# Flag to run Node Problem Detector on the host's network. This is typically
# not recommended, but may be useful for certain use cases.
hostNetwork: true
hostPID: false
priorityClassName: system-node-critical
securityContext:
privileged: true
resources:
limits:
cpu: 10m
memory: 80Mi
requests:
cpu: 10m
memory: 80Mi
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "20257" # should match prometheus_port above
labels: {}
tolerations:
- effect: NoSchedule
operator: Exists
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true
# The name of the ServiceAccount to use.
# If not set and create is true, a name is generated using the fullname template
name:
affinity: {}
nodeSelector: {}
metrics:
enabled: true
annotations: {}
serviceMonitor:
enabled: false
additionalLabels: {}
prometheusRule:
enabled: false
defaultRules:
create: true
disabled: []
additionalLabels: {}
additionalRules: []
env:
# - name: FOO
# value: BAR
# - name: POD_NAME
# valueFrom:
# fieldRef:
# fieldPath: metadata.name
extraVolumes:
- name: kmsg
hostPath:
path: /dev/kmsg
- name: machine-id
hostPath:
path: /etc/machine-id
type: "File"
- name: systemd
hostPath:
path: /run/systemd/system/
type: ""
- name: dbus
hostPath:
path: /var/run/dbus/
type: ""
- name: containerd
hostPath:
path: /var/run/containerd
type: ""
extraVolumeMounts:
- name: kmsg
mountPath: /dev/kmsg
readOnly: true
- mountPath: /etc/machine-id
name: machine-id
readOnly: true
- mountPath: /run/systemd/system
name: systemd
- mountPath: /var/run/dbus/
name: dbus
mountPropagation: Bidirectional
- mountPath: /var/run/containerd
name: containerd
readOnly: true
extraContainers: []
# updateStrategy -- Manage the daemonset update strategy
updateStrategy: RollingUpdate
# maxUnavailable -- The max pods unavailable during an update
maxUnavailable: 1
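With those values saved (say, as npd-values.yaml), the chart can be upgraded or rendered as usual; a sketch assuming the community deliveryhero chart and default release naming (the repo URL, file name, and namespace are assumptions):

helm repo add deliveryhero https://charts.deliveryhero.io/
helm upgrade --install node-problem-detector deliveryhero/node-problem-detector \
  --namespace node-problem-detector --create-namespace \
  -f npd-values.yaml
# or render the manifests locally, which is roughly what produced the daemonset.yaml below
helm template deliveryhero/node-problem-detector -f npd-values.yaml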
daemonset.yaml:
---
# Source: node-problem-detector/templates/daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-problem-detector
labels:
app.kubernetes.io/name: node-problem-detector
helm.sh/chart: node-problem-detector-2.2.4
app.kubernetes.io/instance: release-name
app.kubernetes.io/managed-by: Helm
namespace: node-problem-detector
spec:
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
selector:
matchLabels:
app.kubernetes.io/name: node-problem-detector
app.kubernetes.io/instance: release-name
app: node-problem-detector
template:
metadata:
labels:
app.kubernetes.io/name: node-problem-detector
app.kubernetes.io/instance: release-name
app: node-problem-detector
annotations:
checksum/config: 802cfbb98adbbb0754fbc87e3ca04ca46623ed173c8ea4b33bc7b3148611a04a
prometheus.io/port: "20257"
prometheus.io/scrape: "true"
spec:
serviceAccountName: node-problem-detector
hostNetwork: true
hostPID: false
terminationGracePeriodSeconds: 30
priorityClassName: "system-node-critical"
containers:
- name: node-problem-detector
image: "<our_repo>/node-problem-detector/node-problem-detector:v0.8.12-modified"
imagePullPolicy: "IfNotPresent"
command:
- "/bin/sh"
- "-c"
- "exec /node-problem-detector --logtostderr --config.system-log-monitor=/config/kernel-monitor.json --config.custom-plugin-monitor=/custom-config/health-checker-kubelet.json,/custom-config/health-checker-containerd.json --prometheus-address=0.0.0.0 --prometheus-port=20257 --k8s-exporter-heartbeat-period=5m0s"
securityContext:
privileged: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: log
mountPath: /var/log/
readOnly: true
- name: localtime
mountPath: /etc/localtime
readOnly: true
- name: custom-config
mountPath: /custom-config
readOnly: true
- mountPath: /dev/kmsg
name: kmsg
readOnly: true
- mountPath: /etc/machine-id
name: machine-id
readOnly: true
- mountPath: /run/systemd/system
name: systemd
- mountPath: /var/run/dbus/
mountPropagation: Bidirectional
name: dbus
- mountPath: /var/run/containerd
name: containerd
readOnly: true
ports:
- containerPort: 20257
name: exporter
resources:
limits:
cpu: 10m
memory: 80Mi
requests:
cpu: 10m
memory: 80Mi
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: log
hostPath:
path: /var/log/
- name: localtime
hostPath:
path: /etc/localtime
type: "FileOrCreate"
- name: custom-config
configMap:
name: node-problem-detector-custom-config
- hostPath:
path: /dev/kmsg
name: kmsg
- hostPath:
path: /etc/machine-id
type: File
name: machine-id
- hostPath:
path: /run/systemd/system/
type: ""
name: systemd
- hostPath:
path: /var/run/dbus/
type: ""
name: dbus
- hostPath:
path: /var/run/containerd
type: ""
name: containerd
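Once the DaemonSet is running, a quick sanity check is to look at the detector's logs and at the node conditions it sets (node name is a placeholder):

kubectl -n node-problem-detector logs ds/node-problem-detector | grep -i health-checker
kubectl get node <node-name> -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.reason}{"\n"}{end}'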
@karlhungus I'm still seeing a similar error again
This is completely a guess, but it sounds like your mounts aren't right. It's hard to tell because crictl isn't outputting stderr (I've submitted a PR for that, #702, but it hasn't gotten much attention). One thing I found helpful when debugging these issues is to just shell into the container and see what the command does, i.e.:
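(A sketch; the namespace matches the daemonset above, the pod name is a placeholder, and the health-checker flags just mirror the monitor config.)

kubectl -n node-problem-detector exec -it node-problem-detector-xxxxx -- /bin/sh
# inside the container, run the same check the custom plugin monitor runs
/home/kubernetes/bin/health-checker --component=cri --enable-repair=false --health-check-timeout=60s
echo $?
# and confirm crictl can reach containerd over the mounted socket
crictl --runtime-endpoint=unix:///var/run/containerd/containerd.sock pods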
Thank you @karlhungus.
Hi,
we are seeing errors while trying to enable monitoring for containerd.
Here's the daemonset:
DOCKER_HOST/node-problem-detector:v0.8.9-test is created from this comment.
Node AMI: ami-0fd6126f25df4ba20 (bottlerocket-aws-k8s-1.21-x86_64-v1.5.3-f37bd7cb), on AWS EKS.
kubectl version output: