diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index fbb7f449eb..ec3251ae18 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -57,7 +57,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet { createFlags.StringVar( &cf.CGroupsMode, "cgroups", containerConfig.Cgroups(), - `control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`, + `control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"split")`, ) createFlags.StringVar( &cf.CGroupParent, diff --git a/cmd/podman/containers/create.go b/cmd/podman/containers/create.go index c48a739ff9..804a232684 100644 --- a/cmd/podman/containers/create.go +++ b/cmd/podman/containers/create.go @@ -209,6 +209,10 @@ func createInit(c *cobra.Command) error { cliVals.Env = env } + if c.Flag("cgroups").Changed && cliVals.CGroupsMode == "split" && registry.IsRemote() { + return errors.Errorf("the option --cgroups=%q is not supported in remote mode", cliVals.CGroupsMode) + } + // Docker-compatibility: the "-h" flag for run/create is reserved for // the hostname (see https://github.com/containers/libpod/issues/1367). diff --git a/docs/source/markdown/podman-create.1.md b/docs/source/markdown/podman-create.1.md index 3ec91a3ad7..ded668f34a 100644 --- a/docs/source/markdown/podman-create.1.md +++ b/docs/source/markdown/podman-create.1.md @@ -78,9 +78,12 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the **--cgroups**=*mode* Determines whether the container will create CGroups. -Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*. +Valid values are *enabled*, *disabled*, *no-conmon*, *split*, which the default being *enabled*. + +The *enabled* option will create a new cgroup under the cgroup-parent. The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). The *no-conmon* option disables a new CGroup only for the conmon process. +The *split* option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set *--cgroup-parent* with *split*. **--cgroup-parent**=*path* diff --git a/docs/source/markdown/podman-run.1.md b/docs/source/markdown/podman-run.1.md index 88666d595d..83971107f9 100644 --- a/docs/source/markdown/podman-run.1.md +++ b/docs/source/markdown/podman-run.1.md @@ -89,14 +89,16 @@ Set the cgroup namespace mode for the container. If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**. -**--cgroups**=**enabled**|**disabled**|**no-conmon** +**--cgroups**=**enabled**|**disabled**|**no-conmon**|**split** Determines whether the container will create CGroups. -Default is **enabled**. The **disabled** option will force the container -to not create CGroups, and thus conflicts with CGroup options -(**--cgroupns** and **--cgroup-parent**). +Default is **enabled**. + +The **enabled** option will create a new cgroup under the cgroup-parent. +The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). The **no-conmon** option disables a new CGroup only for the **conmon** process. +The **split** option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with **split**. **--cgroup-parent**=*path* diff --git a/libpod/container.go b/libpod/container.go index c852496760..20688e3ee2 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -17,6 +17,7 @@ import ( "github.com/containers/libpod/libpod/lock" "github.com/containers/libpod/pkg/namespaces" "github.com/containers/libpod/pkg/rootless" + "github.com/containers/libpod/utils" "github.com/containers/storage" "github.com/cri-o/ocicni/pkg/ocicni" spec "github.com/opencontainers/runtime-spec/specs-go" @@ -1089,10 +1090,25 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in // CGroupPath returns a cgroups "path" for a given container. func (c *Container) CGroupPath() (string, error) { - switch c.runtime.config.Engine.CgroupManager { - case config.CgroupfsCgroupsManager: + switch { + case c.config.CgroupsMode == cgroupSplit: + if c.config.CgroupParent != "" { + return "", errors.Errorf("cannot specify cgroup-parent with cgroup-mode %q", cgroupSplit) + } + cg, err := utils.GetCgroupProcess(c.state.ConmonPID) + if err != nil { + return "", err + } + // Use the conmon cgroup for two reasons: we validate the container + // delegation was correct, and the conmon cgroup doesn't change at runtime + // while we are not sure about the container that can create sub cgroups. + if !strings.HasSuffix(cg, "supervisor") { + return "", errors.Errorf("invalid cgroup for conmon %q", cg) + } + return strings.TrimSuffix(cg, "/supervisor") + "/container", nil + case c.runtime.config.Engine.CgroupManager == config.CgroupfsCgroupsManager: return filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())), nil - case config.SystemdCgroupsManager: + case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager: if rootless.IsRootless() { uid := rootless.GetRootlessUID() parts := strings.SplitN(c.config.CgroupParent, "/", 2) diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 5ee6726e0e..2c78f6bd28 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -31,6 +31,7 @@ import ( "github.com/containers/libpod/pkg/resolvconf" "github.com/containers/libpod/pkg/rootless" "github.com/containers/libpod/pkg/util" + "github.com/containers/libpod/utils" "github.com/containers/storage/pkg/archive" securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/user" @@ -1505,8 +1506,17 @@ func (c *Container) getOCICgroupPath() (string, error) { switch { case (rootless.IsRootless() && !unified) || c.config.NoCgroups: return "", nil + case c.config.CgroupsMode == cgroupSplit: + if c.config.CgroupParent != "" { + return c.config.CgroupParent, nil + } + selfCgroup, err := utils.GetOwnCgroup() + if err != nil { + return "", err + } + return filepath.Join(selfCgroup, "container"), nil case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager: - // When runc is set to use Systemd as a cgroup manager, it + // When the OCI runtime is set to use Systemd as a cgroup manager, it // expects cgroups to be passed as follows: // slice:prefix:name systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) diff --git a/libpod/container_validate.go b/libpod/container_validate.go index b7f0aadffe..a53a1839d9 100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -34,6 +34,10 @@ func (c *Container) validate() error { return errors.Wrapf(define.ErrInvalidArg, "cannot both create a network namespace and join another container's network namespace") } + if c.config.CgroupsMode == cgroupSplit && c.config.CgroupParent != "" { + return errors.Wrapf(define.ErrInvalidArg, "cannot specify --cgroup-mode=split with a cgroup-parent") + } + // Not creating cgroups has a number of requirements, mostly related to // the PID namespace. if c.config.NoCgroups || c.config.CgroupsMode == "disabled" { diff --git a/libpod/oci_conmon.go b/libpod/oci_conmon.go new file mode 100644 index 0000000000..74060b3579 --- /dev/null +++ b/libpod/oci_conmon.go @@ -0,0 +1,7 @@ +package libpod + +const ( + // cgroupSplit is the cgroup mode for reusing the current cgroup both + // for conmon and for the container payload. + cgroupSplit = "split" +) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index d8a89047e3..26e5d70b0a 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co return err } + if ctr.config.CgroupsMode == cgroupSplit { + if err := utils.MoveUnderCgroupSubtree("supervisor"); err != nil { + return err + } + } + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) if ctr.config.Spec.Process.Terminal { @@ -1173,7 +1179,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p "--socket-dir-path", r.socketsDir, } - if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups { + if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { args = append(args, "-s") } @@ -1275,7 +1281,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec // If cgroup creation is disabled - just signal. switch ctr.config.CgroupsMode { - case "disabled", "no-conmon": + case "disabled", "no-conmon", cgroupSplit: mustCreateCgroup = false } diff --git a/libpod/options.go b/libpod/options.go index 7a60870a01..28be1bc034 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption { case "disabled": ctr.config.NoCgroups = true ctr.config.CgroupsMode = mode - case "enabled", "no-conmon": + case "enabled", "no-conmon", cgroupSplit: ctr.config.CgroupsMode = mode default: return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode) diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index f1752cbeb6..dd6602acb3 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID()) } ctr.config.CgroupParent = podCgroup - case rootless.IsRootless(): + case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit: ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent - default: + case ctr.config.CgroupsMode != cgroupSplit: ctr.config.CgroupParent = SystemdDefaultCgroupParent } } else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") { diff --git a/utils/utils_supported.go b/utils/utils_supported.go index ce9fd5604c..201ddb57be 100644 --- a/utils/utils_supported.go +++ b/utils/utils_supported.go @@ -3,10 +3,20 @@ package utils import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "github.com/containers/libpod/pkg/cgroups" "github.com/containers/libpod/pkg/rootless" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) // RunUnderSystemdScope adds the specified pid to a systemd scope @@ -43,6 +53,118 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error { return nil } +func getCgroupProcess(procFile string) (string, error) { + f, err := os.Open(procFile) + if err != nil { + return "", errors.Wrapf(err, "open file %q", procFile) + } + defer f.Close() + + scanner := bufio.NewScanner(f) + cgroup := "/" + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, ":") + if len(parts) != 3 { + return "", errors.Errorf("cannot parse cgroup line %q", line) + } + if strings.HasPrefix(line, "0::") { + cgroup = line[3:] + break + } + // root cgroup, skip it + if parts[2] == "/" { + continue + } + // The process must have the same cgroup path for all controllers + // The OCI runtime spec file allow us to specify only one path. + if cgroup != "/" && cgroup != parts[2] { + return "", errors.Errorf("cgroup configuration not supported, the process is in two different cgroups") + } + cgroup = parts[2] + } + if cgroup == "/" { + return "", errors.Errorf("could not find cgroup mount in %q", procFile) + } + return cgroup, nil +} + +// GetOwnCgroup returns the cgroup for the current process. +func GetOwnCgroup() (string, error) { + return getCgroupProcess("/proc/self/cgroup") +} + +// GetCgroupProcess returns the cgroup for the specified process process. +func GetCgroupProcess(pid int) (string, error) { + return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid)) +} + +// MoveUnderCgroupSubtree moves the PID under a cgroup subtree. +func MoveUnderCgroupSubtree(subtree string) error { + procFile := "/proc/self/cgroup" + f, err := os.Open(procFile) + if err != nil { + return errors.Wrapf(err, "open file %q", procFile) + } + defer f.Close() + + unifiedMode, err := cgroups.IsCgroup2UnifiedMode() + if err != nil { + return err + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, ":") + if len(parts) != 3 { + return errors.Errorf("cannot parse cgroup line %q", line) + } + + // root cgroup, skip it + if parts[2] == "/" { + continue + } + + cgroupRoot := "/sys/fs/cgroup" + // Special case the unified mount on hybrid cgroup and named hierarchies. + // This works on Fedora 31, but we should really parse the mounts to see + // where the cgroup hierarchy is mounted. + if parts[1] == "" && !unifiedMode { + // If it is not using unified mode, the cgroup v2 hierarchy is + // usually mounted under /sys/fs/cgroup/unified + cgroupRoot = filepath.Join(cgroupRoot, "unified") + } else if parts[1] != "" { + // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. + controller := strings.TrimPrefix(parts[1], "name=") + cgroupRoot = filepath.Join(cgroupRoot, controller) + } + + processes, err := ioutil.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs")) + if err != nil { + return err + } + + newCgroup := filepath.Join(cgroupRoot, parts[2], subtree) + if err := os.Mkdir(newCgroup, 0755); err != nil { + return err + } + + f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755) + if err != nil { + return err + } + defer f.Close() + + for _, pid := range bytes.Split(processes, []byte("\n")) { + if _, err := f.Write(pid); err != nil { + logrus.Warnf("Cannot move process %s to cgroup %q", pid, newCgroup) + } + } + } + return nil +} + func newProp(name string, units interface{}) systemdDbus.Property { return systemdDbus.Property{ Name: name, diff --git a/utils/utils_windows.go b/utils/utils_windows.go index db27877d96..1a2196029b 100644 --- a/utils/utils_windows.go +++ b/utils/utils_windows.go @@ -7,3 +7,15 @@ import "github.com/pkg/errors" func RunUnderSystemdScope(pid int, slice string, unitName string) error { return errors.New("not implemented for windows") } + +func MoveUnderCgroupSubtree(subtree string) error { + return errors.New("not implemented for windows") +} + +func GetOwnCgroup() (string, error) { + return "", errors.New("not implemented for windows") +} + +func GetCgroupProcess(pid int) (string, error) { + return "", errors.New("not implemented for windows") +}