Skip to content

Commit

Permalink
Merge pull request cri-o#8791 from saschagrunert/watchdog
Browse files Browse the repository at this point in the history
Add systemd watchdog support
  • Loading branch information
openshift-merge-bot[bot] authored Dec 11, 2024
2 parents 413d2f6 + 33dbcc1 commit 8d332ee
Show file tree
Hide file tree
Showing 119 changed files with 14,799 additions and 159 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,8 @@ mockgen: \
mock-image-types \
mock-ocicni-types \
mock-seccompociartifact-types \
mock-ociartifact-types
mock-ociartifact-types \
mock-systemd

.PHONY: mock-containereventserver
mock-containereventserver: ${MOCKGEN}
Expand Down Expand Up @@ -526,6 +527,13 @@ mock-ociartifact-types: ${MOCKGEN}
-destination ${MOCK_PATH}/ociartifact/ociartifact.go \
github.com/cri-o/cri-o/internal/config/ociartifact Impl

.PHONY: mock-systemd
mock-systemd: ${MOCKGEN}
${MOCKGEN} \
-package systemdmock \
-destination ${MOCK_PATH}/systemd/systemd.go \
github.com/cri-o/cri-o/internal/watchdog Systemd

MANPAGES_MD := $(wildcard docs/*.md)
MANPAGES := $(MANPAGES_MD:%.md=%)

Expand Down
6 changes: 4 additions & 2 deletions cmd/crio/daemon_linux.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package main

import (
systemdDaemon "github.com/coreos/go-systemd/v22/daemon"
"github.com/coreos/go-systemd/v22/daemon"
"github.com/sirupsen/logrus"

"github.com/cri-o/cri-o/internal/watchdog"
)

func sdNotify() {
if _, err := systemdDaemon.SdNotify(false, "READY=1"); err != nil {
if _, err := watchdog.DefaultSystemd().Notify(daemon.SdNotifyReady); err != nil {
logrus.Warnf("Failed to sd_notify systemd: %v", err)
}
}
Expand Down
1 change: 1 addition & 0 deletions contrib/systemd/crio.service
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ OOMScoreAdjust=-999
TimeoutStartSec=0
Restart=on-failure
RestartSec=10
WatchdogSec=60s

[Install]
WantedBy=multi-user.target
Expand Down
15 changes: 9 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ require (
golang.org/x/sys v0.24.0
google.golang.org/grpc v1.66.0
google.golang.org/protobuf v1.34.2
k8s.io/api v0.31.0
k8s.io/apimachinery v0.31.0
k8s.io/client-go v0.31.0
k8s.io/cri-api v0.31.0
k8s.io/api v0.31.3
k8s.io/apimachinery v0.31.3
k8s.io/client-go v0.31.3
k8s.io/cri-api v0.31.3
k8s.io/cri-client v0.31.3
k8s.io/klog/v2 v2.130.1
k8s.io/kubelet v0.31.0
k8s.io/kubelet v0.31.3
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/release-sdk v0.12.1
sigs.k8s.io/release-utils v0.8.4
Expand Down Expand Up @@ -190,6 +191,7 @@ require (
github.com/sigstore/rekor v1.3.6 // indirect
github.com/sigstore/sigstore v1.8.4 // indirect
github.com/skeema/knownhosts v1.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect
github.com/sylabs/sif/v2 v2.18.0 // indirect
github.com/tchap/go-patricia/v2 v2.3.1 // indirect
Expand Down Expand Up @@ -227,7 +229,8 @@ require (
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiserver v0.31.0 // indirect
k8s.io/apiserver v0.31.3 // indirect
k8s.io/component-base v0.31.3 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
Expand Down
28 changes: 16 additions & 12 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2046,22 +2046,26 @@ honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.1.3/go.mod h1:NgwopIslSNH47DimFoV78dnkksY2EFtX0ajyb3K/las=
k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo=
k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE=
k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc=
k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/apiserver v0.31.0 h1:p+2dgJjy+bk+B1Csz+mc2wl5gHwvNkC9QJV+w55LVrY=
k8s.io/apiserver v0.31.0/go.mod h1:KI9ox5Yu902iBnnyMmy7ajonhKnkeZYJhTZ/YI+WEMk=
k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8=
k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU=
k8s.io/cri-api v0.31.0 h1:6o0XrhWlc1/zseGCh+aMScdXCg5nT6KCGdyx7HQkSKo=
k8s.io/cri-api v0.31.0/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8=
k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE=
k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4=
k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/apiserver v0.31.3 h1:+1oHTtCB+OheqFEz375D0IlzHZ5VeQKX1KGXnx+TTuY=
k8s.io/apiserver v0.31.3/go.mod h1:PrxVbebxrxQPFhJk4powDISIROkNMKHibTg9lTRQ0Qg=
k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ=
k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU=
k8s.io/cri-api v0.31.3 h1:dsZXzrGrCEwHjsTDlAV7rutEplpMLY8bfNRMIqrtXjo=
k8s.io/cri-api v0.31.3/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/cri-client v0.31.3 h1:9ZwddaNJomqkTBYQqSmB+Ccns3beY4HyYDwmRtWTCJM=
k8s.io/cri-client v0.31.3/go.mod h1:klbWiYkOatOQOkXOYZMZMGSTM8q9eC/efsYGuXcgPes=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
k8s.io/kubelet v0.31.0 h1:IlfkBy7QTojGEm97GuVGhtli0HL/Pgu4AdayiF76yWo=
k8s.io/kubelet v0.31.0/go.mod h1:s+OnqnfdIh14PFpUb7NgzM53WSYXcczA3w/1qSzsRc8=
k8s.io/kubelet v0.31.3 h1:DIXRAmvVGp42mV2vpA1GCLU6oO8who0/vp3Oq6kSpbI=
k8s.io/kubelet v0.31.3/go.mod h1:KSdbEfNy5VzqUlAHlytA/fH12s+sE1u8fb/8JY9sL/8=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
lukechampine.com/uint128 v1.1.1/go.mod h1:c4eWIwlEGaxC/+H1VguhU4PHXNWDCDMUlWdIWl2j1gk=
Expand Down
27 changes: 27 additions & 0 deletions internal/watchdog/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package watchdog_test

import (
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

. "github.com/cri-o/cri-o/test/framework"
)

// TestWatchdog runs the created specs.
// It is the `go test` entry point of this suite: it registers Fail as the
// handler for failed assertions and then runs the specs under the "Watchdog"
// description.
// NOTE(review): RunFrameworkSpecs presumably comes from the dot-imported
// test/framework package — confirm.
func TestWatchdog(t *testing.T) {
RegisterFailHandler(Fail)
RunFrameworkSpecs(t, "Watchdog")
}

// t is the test framework instance shared by the specs of this suite. It is
// assigned in BeforeSuite and torn down in AfterSuite.
var t *TestFramework

// Create and set up the shared test framework before the suite runs.
// NOTE(review): NilFunc is presumably a no-op setup/teardown hook provided by
// the test framework package — confirm.
var _ = BeforeSuite(func() {
t = NewTestFramework(NilFunc, NilFunc)
t.Setup()
})

// Tear the shared test framework down again after the suite has finished.
var _ = AfterSuite(func() {
t.Teardown()
})
43 changes: 43 additions & 0 deletions internal/watchdog/systemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package watchdog

import (
"time"

"github.com/coreos/go-systemd/v22/daemon"
)

// Systemd describes the subset of systemd functionality which is used by the
// watchdog. Having it behind an interface allows swapping the real
// implementation for a generated mock.
type Systemd interface {
	// WatchdogEnabled returns the watchdog timeout configured for the
	// service, or a zero duration if no watchdog applies.
	WatchdogEnabled() (time.Duration, error)

	// Notify sends the provided state string to the init daemon and reports
	// whether the notification has been sent.
	Notify(state string) (bool, error)
}

// defaultSystemd is the stateless, production implementation of Systemd.
type defaultSystemd struct{}

// DefaultSystemd creates the default systemd implementation and hands it out
// behind the Systemd interface.
func DefaultSystemd() Systemd {
	return new(defaultSystemd)
}

// WatchdogEnabled reports the watchdog configuration of the running service.
// A process is expected to call Notify(daemon.SdNotifyWatchdog) every
// time / 2, where time is the returned duration.
//
// The possible results are:
//   - (0, nil): the watchdog isn't enabled or we aren't the watched PID.
//   - (0, err): an error happened (e.g. error converting time).
//   - (time, nil): the watchdog is enabled and pings can be sent. time is the
//     delay before an inactive service will be killed.
func (*defaultSystemd) WatchdogEnabled() (time.Duration, error) {
	// Keep the watchdog related environment variables in place.
	const unsetEnvironment = false

	return daemon.SdWatchdogEnabled(unsetEnvironment)
}

// Notify delivers the state message to the init daemon. Callers commonly
// ignore the returned error.
//
// The possible results are:
//   - (false, nil): notification not supported (i.e. NOTIFY_SOCKET is unset).
//   - (false, err): notification supported, but a failure happened (e.g. error
//     connecting to NOTIFY_SOCKET or while sending data).
//   - (true, nil): notification supported, data has been sent.
func (*defaultSystemd) Notify(state string) (bool, error) {
	// Keep NOTIFY_SOCKET set so that subsequent notifications still work.
	const unsetEnvironment = false

	return daemon.SdNotify(unsetEnvironment, state)
}
101 changes: 101 additions & 0 deletions internal/watchdog/watchdog.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package watchdog

import (
"context"
"errors"
"fmt"
"sync/atomic"
"time"

"github.com/coreos/go-systemd/v22/daemon"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/cri-o/cri-o/internal/log"
)

// Watchdog is the main structure for this package.
// It periodically runs the registered health checkers and, if all of them
// succeed, notifies the systemd watchdog.
type Watchdog struct {
// systemd is used to query the watchdog configuration and to send the
// notifications.
systemd Systemd
// backoff is the retry policy applied to a single watchdog notification.
backoff wait.Backoff
// healthCheckers are executed in order before every notification.
healthCheckers []HealthCheckFn
// notifications counts the calls to systemd.Notify made by the watchdog,
// regardless of their outcome.
notifications atomic.Uint64
}

// minInterval is the lower bound for the systemd watchdog timeout. Start
// rejects every timeout which is not strictly greater than this value.
const minInterval = time.Second

// HealthCheckFn is the health checker function type definition.
// The watchdog calls every registered function before notifying systemd and
// passes the notification interval as the duration argument. A non-nil error
// marks CRI-O as unhealthy, which skips the notification for that cycle.
type HealthCheckFn func(context.Context, time.Duration) error

// New builds a systemd Watchdog which talks to the default systemd
// implementation and runs the provided health checkers before each
// notification.
func New(healthCheckers ...HealthCheckFn) *Watchdog {
	// Retry policy for a single notification: start with a one second delay,
	// double it per step, add 10% jitter and stop after two steps.
	retry := wait.Backoff{
		Duration: time.Second,
		Factor:   2.0,
		Jitter:   0.1,
		Steps:    2,
	}

	wd := new(Watchdog)
	wd.systemd = DefaultSystemd()
	wd.backoff = retry
	wd.healthCheckers = healthCheckers

	return wd
}

// Start runs the watchdog.
//
// It asks systemd for the configured watchdog timeout and, if one is set,
// spawns a background goroutine which notifies systemd every half of that
// timeout, as long as all registered health checkers succeed.
//
// It returns:
//   - nil, without starting anything, if no systemd watchdog is enabled,
//   - an error if the watchdog configuration cannot be retrieved,
//   - an error if the configured timeout is not greater than minInterval,
//   - nil after the background goroutine has been started.
func (w *Watchdog) Start(ctx context.Context) error {
	interval, err := w.systemd.WatchdogEnabled()
	if err != nil {
		return fmt.Errorf("configure watchdog: %w", err)
	}

	if interval == 0 {
		log.Infof(ctx, "No systemd watchdog enabled")
		return nil
	}

	// A timeout equal to minInterval is rejected as well, so the message has
	// to ask for a strictly greater value instead of "at least".
	if interval <= minInterval {
		return fmt.Errorf("watchdog timeout of %v should be greater than %v", interval, minInterval)
	}
	// Notify twice per timeout window to leave room for a missed or delayed
	// notification.
	interval /= 2

	log.Infof(ctx, "Starting systemd watchdog using interval: %v", interval)

	// NOTE(review): wait.Forever never returns, so this goroutine is not
	// stopped when ctx gets cancelled. ctx is only handed to the health
	// checkers and the logger.
	go wait.Forever(func() {
		// Skipping the notification on purpose lets systemd act on the
		// unhealthy service once the watchdog timeout expires.
		if err := w.runHealthCheckers(ctx, interval); err != nil {
			log.Errorf(ctx, "Will not notify watchdog because CRI-O is unhealthy: %v", err)
			return
		}

		if err := wait.ExponentialBackoff(w.backoff, func() (bool, error) {
			gotAck, err := w.systemd.Notify(daemon.SdNotifyWatchdog)
			// Every attempt is counted, independent of its result.
			w.notifications.Add(1)
			if err != nil {
				// Returning a nil error keeps the backoff retrying.
				log.Warnf(ctx, "Failed to notify systemd watchdog, retrying: %v", err)
				return false, nil
			}
			if !gotAck {
				// Retrying cannot help without a notify socket, abort the
				// backoff right away.
				return false, errors.New("notification not supported (NOTIFY_SOCKET is unset)")
			}

			log.Debugf(ctx, "Systemd watchdog successfully notified")
			return true, nil
		}); err != nil {
			log.Errorf(ctx, "Failed to notify watchdog: %v", err)
		}
	}, interval)

	return nil
}

// Notifications returns the number of systemd notification attempts made by
// the watchdog so far. Failed attempts are part of the count, because the
// counter is incremented after every call to the systemd notify function.
func (w *Watchdog) Notifications() uint64 {
	count := w.notifications.Load()

	return count
}

// runHealthCheckers executes the registered health checkers in order, passing
// ctx and timeout to each of them. It stops at the first failing checker and
// returns its wrapped error, or nil if all checkers succeed.
func (w *Watchdog) runHealthCheckers(ctx context.Context, timeout time.Duration) error {
	for i := range w.healthCheckers {
		check := w.healthCheckers[i]

		err := check(ctx, timeout)
		if err == nil {
			continue
		}

		return fmt.Errorf("health checker failed: %w", err)
	}

	return nil
}
Loading

0 comments on commit 8d332ee

Please sign in to comment.