diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 11cb1646f..1193ec271 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -28,33 +29,63 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error { func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { switch cgroup.Resources.Freezer { - case configs.Frozen, configs.Thawed: - for { - // In case this loop does not exit because it doesn't get the expected - // state, let's write again this state, hoping it's going to be properly - // set this time. Otherwise, this loop could run infinitely, waiting for - // a state change that would never happen. - if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + case configs.Frozen: + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. While current + // kernel cgroup v1 docs no longer mention a need to retry, + // the kernel (tested on v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are chosen to have a decent chance to + // succeed even in the worst case scenario (runc pause/unpause + // with parallel runc exec). + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze. + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Briefly thawing the cgroup also helps. + _ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { return err } - state, err := s.GetState(path) + state, err := fscommon.ReadFile(path, "freezer.state") if err != nil { return err } - if state == cgroup.Resources.Freezer { - break + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(configs.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) } - - time.Sleep(1 * time.Millisecond) } + // Despite our best efforts, it got stuck in FREEZING. + // Leaving it in this state is bad and dangerous, so + // let's (try to) thaw it back and error out. + _ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed)) + return errors.New("unable to freeze") + case configs.Thawed: + return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed)) case configs.Undefined: return nil default: return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) } - - return nil } func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {