forked from cri-o/cri-o
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request cri-o#8791 from saschagrunert/watchdog
Add systemd watchdog support
- Loading branch information
Showing
119 changed files
with
14,799 additions
and
159 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package watchdog_test | ||
|
||
import ( | ||
"testing" | ||
|
||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
|
||
. "github.com/cri-o/cri-o/test/framework" | ||
) | ||
|
||
// TestWatchdog runs the created specs. | ||
func TestWatchdog(t *testing.T) { | ||
RegisterFailHandler(Fail) | ||
RunFrameworkSpecs(t, "Watchdog") | ||
} | ||
|
||
var t *TestFramework | ||
|
||
var _ = BeforeSuite(func() { | ||
t = NewTestFramework(NilFunc, NilFunc) | ||
t.Setup() | ||
}) | ||
|
||
var _ = AfterSuite(func() { | ||
t.Teardown() | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package watchdog | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/coreos/go-systemd/v22/daemon" | ||
) | ||
|
||
// Systemd abstracts the systemd functionality this package relies on, so
// that the real implementation can be swapped out.
type Systemd interface {
	// WatchdogEnabled reports the watchdog timeout configured for the
	// service, or an error if it could not be determined.
	WatchdogEnabled() (time.Duration, error)

	// Notify sends the given state string to the init daemon and reports
	// whether the notification has been sent.
	Notify(state string) (bool, error)
}
|
||
type defaultSystemd struct{} | ||
|
||
// DefaultSystemd returns the default systemd implementation. | ||
func DefaultSystemd() Systemd { | ||
return &defaultSystemd{} | ||
} | ||
|
||
// WatchdogEnabled returns watchdog information for a service. | ||
// Processes should call Notify(daemon.SdNotifyWatchdog) every | ||
// time / 2. | ||
// | ||
// It returns one of the following: | ||
// (0, nil) - watchdog isn't enabled or we aren't the watched PID. | ||
// (0, err) - an error happened (e.g. error converting time). | ||
// (time, nil) - watchdog is enabled and we can send ping. time is delay | ||
// before inactive service will be killed. | ||
func (*defaultSystemd) WatchdogEnabled() (time.Duration, error) { | ||
return daemon.SdWatchdogEnabled(false) | ||
} | ||
|
||
// Notify sends a message to the init daemon. It is common to ignore the error. | ||
// | ||
// It returns one of the following: | ||
// (false, nil) - notification not supported (i.e. NOTIFY_SOCKET is unset). | ||
// (false, err) - notification supported, but failure happened (e.g. error connecting to NOTIFY_SOCKET or while sending data). | ||
// (true, nil) - notification supported, data has been sent. | ||
func (d *defaultSystemd) Notify(state string) (bool, error) { | ||
return daemon.SdNotify(false, state) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package watchdog | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"sync/atomic" | ||
"time" | ||
|
||
"github.com/coreos/go-systemd/v22/daemon" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
|
||
"github.com/cri-o/cri-o/internal/log" | ||
) | ||
|
||
// Watchdog is the main structure for this package. | ||
type Watchdog struct { | ||
systemd Systemd | ||
backoff wait.Backoff | ||
healthCheckers []HealthCheckFn | ||
notifications atomic.Uint64 | ||
} | ||
|
||
// minInterval is the lower bound for the systemd watchdog timeout: Start
// rejects any configured timeout that does not exceed it.
const minInterval = 1 * time.Second
|
||
// HealthCheckFn is the signature of a health checker. The watchdog calls it
// with its context and a timeout and treats a non-nil error as "unhealthy".
type HealthCheckFn func(ctx context.Context, timeout time.Duration) error
|
||
// New creates a new systemd Watchdog instance. | ||
func New(healthCheckers ...HealthCheckFn) *Watchdog { | ||
return &Watchdog{ | ||
systemd: DefaultSystemd(), | ||
backoff: wait.Backoff{ | ||
Duration: time.Second, | ||
Factor: 2.0, | ||
Jitter: 0.1, | ||
Steps: 2, | ||
}, | ||
healthCheckers: healthCheckers, | ||
} | ||
} | ||
|
||
// Start runs the watchdog. | ||
func (w *Watchdog) Start(ctx context.Context) error { | ||
interval, err := w.systemd.WatchdogEnabled() | ||
if err != nil { | ||
return fmt.Errorf("configure watchdog: %w", err) | ||
} | ||
|
||
if interval == 0 { | ||
log.Infof(ctx, "No systemd watchdog enabled") | ||
return nil | ||
} | ||
|
||
if interval <= minInterval { | ||
return fmt.Errorf("watchdog timeout of %v should be at least %v", interval, minInterval) | ||
} | ||
interval /= 2 | ||
|
||
log.Infof(ctx, "Starting systemd watchdog using interval: %v", interval) | ||
|
||
go wait.Forever(func() { | ||
if err := w.runHealthCheckers(ctx, interval); err != nil { | ||
log.Errorf(ctx, "Will not notify watchdog because CRI-O is unhealthy: %v", err) | ||
return | ||
} | ||
|
||
if err := wait.ExponentialBackoff(w.backoff, func() (bool, error) { | ||
gotAck, err := w.systemd.Notify(daemon.SdNotifyWatchdog) | ||
w.notifications.Add(1) | ||
if err != nil { | ||
log.Warnf(ctx, "Failed to notify systemd watchdog, retrying: %v", err) | ||
return false, nil | ||
} | ||
if !gotAck { | ||
return false, errors.New("notification not supported (NOTIFY_SOCKET is unset)") | ||
} | ||
|
||
log.Debugf(ctx, "Systemd watchdog successfully notified") | ||
return true, nil | ||
}); err != nil { | ||
log.Errorf(ctx, "Failed to notify watchdog: %v", err) | ||
} | ||
}, interval) | ||
|
||
return nil | ||
} | ||
|
||
// Notifications returns the amount of done systemd notifications. | ||
func (w *Watchdog) Notifications() uint64 { | ||
return w.notifications.Load() | ||
} | ||
|
||
func (w *Watchdog) runHealthCheckers(ctx context.Context, timeout time.Duration) error { | ||
for _, hc := range w.healthCheckers { | ||
if err := hc(ctx, timeout); err != nil { | ||
return fmt.Errorf("health checker failed: %w", err) | ||
} | ||
} | ||
return nil | ||
} |
Oops, something went wrong.