Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[4.6] kludges for the freeze race #40

Merged
merged 2 commits into from
Mar 2, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 45 additions & 14 deletions libcontainer/cgroups/fs/freezer.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

Expand All @@ -28,33 +29,63 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error {

func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
case configs.Frozen:
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are chosen to have a decent chance to
// succeed even in the worst case scenario (runc pause/unpause
// with parallel runc exec).
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze.
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Briefly thawing the cgroup also helps.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}

if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}

state, err := s.GetState(path)
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
}
if state == cgroup.Resources.Freezer {
break
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}

time.Sleep(1 * time.Millisecond)
}
// Despite our best efforts, it got stuck in FREEZING.
// Leaving it in this state is bad and dangerous, so
// let's (try to) thaw it back and error out.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
return errors.New("unable to freeze")
case configs.Thawed:
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}

return nil
}

func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
Expand Down