diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index 921cd5a71e..9ef11ad0c1 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -56,7 +56,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet { createFlags.StringVar( &cf.CGroupsMode, "cgroups", containerConfig.Cgroups(), - `control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`, + `control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"conmon-delegated")`, ) createFlags.StringVar( &cf.CGroupParent, diff --git a/docs/source/markdown/podman-create.1.md b/docs/source/markdown/podman-create.1.md index 3ec91a3ad7..289c24d13b 100644 --- a/docs/source/markdown/podman-create.1.md +++ b/docs/source/markdown/podman-create.1.md @@ -80,7 +80,7 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the Determines whether the container will create CGroups. Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*. The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). -The *no-conmon* option disables a new CGroup only for the conmon process. +The *no-conmon* option disables a new CGroup only for the conmon process. The *conmon-delegated* option reuses the current cgroup for both the conmon and the container payload, it works only on cgroup v2. **--cgroup-parent**=*path* diff --git a/docs/source/markdown/podman-run.1.md b/docs/source/markdown/podman-run.1.md index 7e91a06a3e..320a1455b9 100644 --- a/docs/source/markdown/podman-run.1.md +++ b/docs/source/markdown/podman-run.1.md @@ -89,7 +89,7 @@ Set the cgroup namespace mode for the container. If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**. 
-**--cgroups**=**enabled**|**disabled**|**no-conmon** +**--cgroups**=**enabled**|**disabled**|**no-conmon**|**conmon-delegated** Determines whether the container will create CGroups. @@ -97,6 +97,7 @@ Default is **enabled**. The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). The **no-conmon** option disables a new CGroup only for the **conmon** process. +The **conmon-delegated** option reuses the current cgroup for both the conmon and the container payload, it works only on cgroup v2. **--cgroup-parent**=*path* diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 12c1abf1c5..1c5ff59889 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -31,6 +31,7 @@ import ( "github.com/containers/libpod/pkg/resolvconf" "github.com/containers/libpod/pkg/rootless" "github.com/containers/libpod/pkg/util" + "github.com/containers/libpod/utils" "github.com/containers/storage/pkg/archive" securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/user" @@ -1505,7 +1506,18 @@ func (c *Container) getOCICgroupPath() (string, error) { case (rootless.IsRootless() && !unified) || c.config.NoCgroups: return "", nil case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager: - // When runc is set to use Systemd as a cgroup manager, it + if c.config.CgroupsMode == conmonDelegated { + if c.config.CgroupParent != "" { + return c.config.CgroupParent, nil + } + selfCgroup, err := utils.GetPidCgroupv2(0) + if err != nil { + return "", err + } + return filepath.Join(selfCgroup, "container"), nil + } + + // When the OCI runtime is set to use Systemd as a cgroup manager, it // expects cgroups to be passed as follows: // slice:prefix:name systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) diff --git a/libpod/oci_conmon.go 
b/libpod/oci_conmon.go new file mode 100644 index 0000000000..592852c87e --- /dev/null +++ b/libpod/oci_conmon.go @@ -0,0 +1,7 @@ +package libpod + +const ( + // conmonDelegated is the cgroup mode for reusing the current cgroup both + // for conmon and for the container payload. + conmonDelegated = "conmon-delegated" +) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 625a5bf706..9808f5589f 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co return err } + if ctr.config.CgroupsMode == conmonDelegated { + if err := utils.MoveUnderCgroup2Subtree("supervisor"); err != nil { + return err + } + } + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) if ctr.config.Spec.Process.Terminal { @@ -1151,7 +1157,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p "--socket-dir-path", r.socketsDir, } - if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups { + if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != conmonDelegated { args = append(args, "-s") } @@ -1253,7 +1259,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec // If cgroup creation is disabled - just signal. 
switch ctr.config.CgroupsMode { - case "disabled", "no-conmon": + case "disabled", "no-conmon", conmonDelegated: mustCreateCgroup = false } diff --git a/libpod/options.go b/libpod/options.go index ffc9c10181..cb764024a0 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption { case "disabled": ctr.config.NoCgroups = true ctr.config.CgroupsMode = mode - case "enabled", "no-conmon": + case "enabled", "no-conmon", conmonDelegated: ctr.config.CgroupsMode = mode default: return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode) diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 0431861b53..30c8ca8a5e 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID()) } ctr.config.CgroupParent = podCgroup - case rootless.IsRootless(): + case rootless.IsRootless() && ctr.config.CgroupsMode != conmonDelegated: ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent - default: + case ctr.config.CgroupsMode != conmonDelegated: ctr.config.CgroupParent = SystemdDefaultCgroupParent } } else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") { diff --git a/utils/utils_supported.go b/utils/utils_supported.go index ce9fd5604c..f8e49b7527 100644 --- a/utils/utils_supported.go +++ b/utils/utils_supported.go @@ -3,10 +3,20 @@ package utils import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "github.com/containers/libpod/pkg/cgroups" "github.com/containers/libpod/pkg/rootless" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) // RunUnderSystemdScope adds the specified pid to a systemd scope @@ -43,6 +53,77 @@ func 
RunUnderSystemdScope(pid int, slice string, unitName string) error {
 	return nil
 }
 
+// GetPidCgroupv2 returns the unified (cgroup v2) cgroup of the specified pid,
+// i.e. the path that follows the "0::" prefix in /proc/<pid>/cgroup.
+// A pid of 0 selects the current process.
+//
+// An error is returned when the host is not in cgroup v2 unified mode, when
+// the proc file cannot be opened or read, or when it has no "0::" entry.
+func GetPidCgroupv2(pid int) (string, error) {
+	if pid == 0 {
+		pid = os.Getpid()
+	}
+
+	unified, err := cgroups.IsCgroup2UnifiedMode()
+	if err != nil {
+		return "", err
+	}
+	if !unified {
+		return "", errors.New("move under subtree supported only on cgroup v2")
+	}
+
+	procFile := fmt.Sprintf("/proc/%d/cgroup", pid)
+	f, err := os.Open(procFile)
+	if err != nil {
+		return "", errors.Wrapf(err, "open file %q", procFile)
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	cgroup := ""
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.HasPrefix(line, "0::") {
+			cgroup = strings.TrimPrefix(line, "0::")
+			break
+		}
+	}
+	// Report a read failure as such instead of letting it surface as a
+	// misleading "could not find cgroup v2 mount" error below.
+	if err := scanner.Err(); err != nil {
+		return "", errors.Wrapf(err, "read file %q", procFile)
+	}
+	if cgroup == "" {
+		return "", errors.Errorf("could not find cgroup v2 mount in %q", procFile)
+	}
+	return cgroup, nil
+}
+
+// MoveUnderCgroup2Subtree moves every process listed in the current process's
+// cgroup v2 cgroup.procs into the child cgroup named subtree, creating that
+// child under /sys/fs/cgroup if it does not exist yet.
+//
+// Failing to move an individual process is logged as a warning and does not
+// abort the operation; every other failure is returned to the caller.
+func MoveUnderCgroup2Subtree(subtree string) error {
+	cgroup, err := GetPidCgroupv2(0)
+	if err != nil {
+		return err
+	}
+
+	cgroupRoot := "/sys/fs/cgroup"
+
+	processes, err := ioutil.ReadFile(filepath.Join(cgroupRoot, cgroup, "cgroup.procs"))
+	if err != nil {
+		return err
+	}
+
+	newCgroup := filepath.Join(cgroupRoot, cgroup, subtree)
+	// An already existing subtree is fine: the processes are simply moved
+	// into it.
+	if err := os.Mkdir(newCgroup, 0755); err != nil && !os.IsExist(err) {
+		return err
+	}
+
+	f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	for _, pid := range bytes.Split(processes, []byte("\n")) {
+		// The file content ends with a newline, so Split yields an empty
+		// trailing element that must not be written to cgroup.procs.
+		if len(pid) == 0 {
+			continue
+		}
+		if _, err := f.Write(pid); err != nil {
+			logrus.Warnf("Cannot move process %s to cgroup %q: %v", pid, newCgroup, err)
+		}
+	}
+	return nil
+}
+
 func newProp(name string, units interface{}) systemdDbus.Property {
 	return systemdDbus.Property{
 		Name: name,
diff --git a/utils/utils_windows.go b/utils/utils_windows.go
index db27877d96..f92425b894 100644
--- a/utils/utils_windows.go
+++ b/utils/utils_windows.go
@@ -7,3 +7,11
@@ import "github.com/pkg/errors"
 func RunUnderSystemdScope(pid int, slice string, unitName string) error {
 	return errors.New("not implemented for windows")
 }
+
+// MoveUnderCgroup2Subtree is not implemented for Windows; it always returns
+// an error.
+func MoveUnderCgroup2Subtree(subtree string) error {
+	return errors.New("not implemented for windows")
+}
+
+// GetPidCgroupv2 is not implemented for Windows; it always returns an empty
+// string and an error.
+func GetPidCgroupv2(pid int) (string, error) {
+	return "", errors.New("not implemented for windows")
+}