Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce OOM watcher to allow graceful shutdown #628

Merged
merged 3 commits into from
Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions internal/features/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ const (
//
// Ref: https://github.com/helm/helm/security/advisories/GHSA-pwcw-6f5g-gxf8
AllowDNSLookups = "AllowDNSLookups"

// OOMWatch enables the OOM watcher, which will gracefully shut down the controller
// when the memory usage exceeds the configured limit. This is disabled by default.
OOMWatch = "OOMWatch"
)

var features = map[string]bool{
Expand All @@ -50,6 +54,9 @@ var features = map[string]bool{
// AllowDNSLookups
// opt-in from v0.31
AllowDNSLookups: false,
// OOMWatch
// opt-in from v0.31
OOMWatch: false,
}

// FeatureGates contains a list of all supported feature gates and
Expand Down
172 changes: 172 additions & 0 deletions internal/oomwatch/watch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
Copyright 2023 The Flux authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package oomwatch provides a way to detect near OOM conditions.
package oomwatch

import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"

"github.com/go-logr/logr"
)

const (
// DefaultCgroupPath is the default path to the cgroup directory.
DefaultCgroupPath = "/sys/fs/cgroup/"
// MemoryMaxFile is the cgroup memory.max filename.
MemoryMaxFile = "memory.max"
// MemoryCurrentFile is the cgroup memory.current filename.
MemoryCurrentFile = "memory.current"
)

// Watcher can be used to detect near OOM conditions.
type Watcher struct {
// memoryMax is the maximum amount of memory that can be used by the system.
memoryMax uint64
// memoryCurrentPath is the cgroup memory.current filepath.
memoryCurrentPath string
// memoryUsagePercentThreshold is the threshold at which the system is
// considered to be near OOM.
memoryUsagePercentThreshold uint8
// interval is the interval at which to check for OOM.
interval time.Duration
// logger is the logger to use.
logger logr.Logger

// ctx is the context that is canceled when OOM is detected.
ctx context.Context
// cancel is the function that cancels the context.
cancel context.CancelFunc
// once is used to ensure that Watch is only called once.
once sync.Once
}

// New returns a new Watcher.
func New(memoryMaxPath, memoryCurrentPath string, memoryUsagePercentThreshold uint8, interval time.Duration, logger logr.Logger) (*Watcher, error) {
if memoryUsagePercentThreshold < 1 || memoryUsagePercentThreshold > 100 {
return nil, fmt.Errorf("memory usage percent threshold must be between 1 and 100, got %d", memoryUsagePercentThreshold)
}

if minInterval := 50 * time.Millisecond; interval < minInterval {
return nil, fmt.Errorf("interval must be at least %s, got %s", minInterval, interval)
}

if _, err := os.Lstat(memoryCurrentPath); err != nil {
return nil, fmt.Errorf("failed to stat memory.current %q: %w", memoryCurrentPath, err)
}

memoryMax, err := readUintFromFile(memoryMaxPath)
if err != nil {
return nil, fmt.Errorf("failed to read memory.max %q: %w", memoryMaxPath, err)
}

return &Watcher{
memoryMax: memoryMax,
memoryCurrentPath: memoryCurrentPath,
memoryUsagePercentThreshold: memoryUsagePercentThreshold,
interval: interval,
hiddeco marked this conversation as resolved.
Show resolved Hide resolved
logger: logger,
}, nil
}

// NewDefault returns a new Watcher with default path values.
func NewDefault(memoryUsagePercentThreshold uint8, interval time.Duration, logger logr.Logger) (*Watcher, error) {
return New(
filepath.Join(DefaultCgroupPath, MemoryMaxFile),
filepath.Join(DefaultCgroupPath, MemoryCurrentFile),
memoryUsagePercentThreshold,
interval,
hiddeco marked this conversation as resolved.
Show resolved Hide resolved
logger,
)
}

// Watch returns a context that is canceled when the system reaches the
// configured memory usage threshold. Calling Watch multiple times will return
// the same context.
func (w *Watcher) Watch(ctx context.Context) context.Context {
w.once.Do(func() {
w.ctx, w.cancel = context.WithCancel(ctx)
go w.watchForNearOOM(ctx)
})
return w.ctx
}

// watchForNearOOM polls the memory.current file on the configured interval
// and cancels the context within Watcher when the system is near OOM.
// It is expected that this function is called in a goroutine. Canceling
// provided context will cause the goroutine to exit.
func (w *Watcher) watchForNearOOM(ctx context.Context) {
t := time.NewTicker(w.interval)
defer t.Stop()

for {
select {
case <-ctx.Done():
w.logger.Info("Shutdown signal received, stopping watch for near OOM")
return
case <-t.C:
current, err := readUintFromFile(w.memoryCurrentPath)
if err != nil {
w.logger.Error(err, "Failed to read current memory usage, skipping check")
continue
}

currentPercentage := float64(current) / float64(w.memoryMax) * 100
if currentPercentage >= float64(w.memoryUsagePercentThreshold) {
w.logger.Info(fmt.Sprintf("Memory usage is near OOM (%s/%s), shutting down",
formatSize(current), formatSize(w.memoryMax)))
w.cancel()
return
}
w.logger.V(2).Info(fmt.Sprintf("Current memory usage %s/%s (%.2f%% out of %d%%)",
formatSize(current), formatSize(w.memoryMax), currentPercentage, w.memoryUsagePercentThreshold))
}
}
}

// readUintFromFile reads an uint64 from the file at the given path.
func readUintFromFile(path string) (uint64, error) {
b, err := os.ReadFile(path)
if err != nil {
return 0, err
}
return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
}

// formatSize formats the given size in bytes to a human-readable format.
func formatSize(b uint64) string {
if b == 0 {
return "-"
}
const unit = 1024
if b < unit {
return fmt.Sprintf("%d B", b)
}
div, exp := uint64(unit), 0
for n := b / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %ciB",
float64(b)/float64(div), "KMGTPE"[exp])
}
Loading