Skip to content

Commit

Permalink
podman: add new cgroup mode split
Browse files Browse the repository at this point in the history
When running under systemd there is no need to create yet another
cgroup for the container.

With the new "split" cgroup mode the current cgroup will be split into two sub
cgroups:

- supervisor
- container

The supervisor cgroup will hold conmon and the podman process, while
the container cgroup is used by the OCI runtime (using the cgroupfs
backend).

Closes: #6400

Signed-off-by: Giuseppe Scrivano <[email protected]>
  • Loading branch information
giuseppe committed Jun 25, 2020
1 parent 35cca19 commit 6ee5f74
Show file tree
Hide file tree
Showing 13 changed files with 201 additions and 15 deletions.
2 changes: 1 addition & 1 deletion cmd/podman/common/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet {
createFlags.StringVar(
&cf.CGroupsMode,
"cgroups", containerConfig.Cgroups(),
`control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`,
`control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"split")`,
)
createFlags.StringVar(
&cf.CGroupParent,
Expand Down
4 changes: 4 additions & 0 deletions cmd/podman/containers/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ func createInit(c *cobra.Command) error {
cliVals.Env = env
}

if c.Flag("cgroups").Changed && cliVals.CGroupsMode == "split" && registry.IsRemote() {
return errors.Errorf("the option --cgroups=%q is not supported in remote mode", cliVals.CGroupsMode)
}

// Docker-compatibility: the "-h" flag for run/create is reserved for
// the hostname (see https://github.com/containers/libpod/issues/1367).

Expand Down
5 changes: 4 additions & 1 deletion docs/source/markdown/podman-create.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,12 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the
**--cgroups**=*mode*

Determines whether the container will create CGroups.
Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*.
Valid values are *enabled*, *disabled*, *no-conmon*, *split*, with the default being *enabled*.

The *enabled* option will create a new cgroup under the cgroup-parent.
The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The *no-conmon* option disables a new CGroup only for the conmon process.
The *split* option splits the current cgroup into two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with *split*.

**--cgroup-parent**=*path*

Expand Down
10 changes: 6 additions & 4 deletions docs/source/markdown/podman-run.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,16 @@ Set the cgroup namespace mode for the container.

If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**.

**--cgroups**=**enabled**|**disabled**|**no-conmon**
**--cgroups**=**enabled**|**disabled**|**no-conmon**|**split**

Determines whether the container will create CGroups.

Default is **enabled**. The **disabled** option will force the container
to not create CGroups, and thus conflicts with CGroup options
(**--cgroupns** and **--cgroup-parent**).
Default is **enabled**.

The **enabled** option will create a new cgroup under the cgroup-parent.
The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The **no-conmon** option disables a new CGroup only for the **conmon** process.
The **split** option splits the current cgroup into two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with **split**.

**--cgroup-parent**=*path*

Expand Down
22 changes: 19 additions & 3 deletions libpod/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/containers/libpod/libpod/lock"
"github.com/containers/libpod/pkg/namespaces"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/utils"
"github.com/containers/storage"
"github.com/cri-o/ocicni/pkg/ocicni"
spec "github.com/opencontainers/runtime-spec/specs-go"
Expand Down Expand Up @@ -1089,10 +1090,25 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in

// CGroupPath returns a cgroups "path" for a given container.
func (c *Container) CGroupPath() (string, error) {
switch c.runtime.config.Engine.CgroupManager {
case config.CgroupfsCgroupsManager:
switch {
case c.config.CgroupsMode == cgroupSplit:
if c.config.CgroupParent != "" {
return "", errors.Errorf("cannot specify cgroup-parent with cgroup-mode %q", cgroupSplit)
}
cg, err := utils.GetCgroupProcess(c.state.ConmonPID)
if err != nil {
return "", err
}
// Use the conmon cgroup for two reasons: we validate the container
// delegation was correct, and the conmon cgroup doesn't change at runtime
// while we are not sure about the container that can create sub cgroups.
if !strings.HasSuffix(cg, "supervisor") {
return "", errors.Errorf("invalid cgroup for conmon %q", cg)
}
return strings.TrimSuffix(cg, "/supervisor") + "/container", nil
case c.runtime.config.Engine.CgroupManager == config.CgroupfsCgroupsManager:
return filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())), nil
case config.SystemdCgroupsManager:
case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
if rootless.IsRootless() {
uid := rootless.GetRootlessUID()
parts := strings.SplitN(c.config.CgroupParent, "/", 2)
Expand Down
12 changes: 11 additions & 1 deletion libpod/container_internal_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/containers/libpod/pkg/resolvconf"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/pkg/util"
"github.com/containers/libpod/utils"
"github.com/containers/storage/pkg/archive"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/user"
Expand Down Expand Up @@ -1505,8 +1506,17 @@ func (c *Container) getOCICgroupPath() (string, error) {
switch {
case (rootless.IsRootless() && !unified) || c.config.NoCgroups:
return "", nil
case c.config.CgroupsMode == cgroupSplit:
if c.config.CgroupParent != "" {
return c.config.CgroupParent, nil
}
selfCgroup, err := utils.GetOwnCgroup()
if err != nil {
return "", err
}
return filepath.Join(selfCgroup, "container"), nil
case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
// When runc is set to use Systemd as a cgroup manager, it
// When the OCI runtime is set to use Systemd as a cgroup manager, it
// expects cgroups to be passed as follows:
// slice:prefix:name
systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
Expand Down
4 changes: 4 additions & 0 deletions libpod/container_validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ func (c *Container) validate() error {
return errors.Wrapf(define.ErrInvalidArg, "cannot both create a network namespace and join another container's network namespace")
}

if c.config.CgroupsMode == cgroupSplit && c.config.CgroupParent != "" {
return errors.Wrapf(define.ErrInvalidArg, "cannot specify --cgroup-mode=split with a cgroup-parent")
}

// Not creating cgroups has a number of requirements, mostly related to
// the PID namespace.
if c.config.NoCgroups || c.config.CgroupsMode == "disabled" {
Expand Down
7 changes: 7 additions & 0 deletions libpod/oci_conmon.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package libpod

const (
// cgroupSplit is the cgroup mode in which the current cgroup is split
// into two sub-cgroups, one for conmon and one for the container
// payload, instead of creating a new cgroup for the container.
cgroupSplit = "split"
)
10 changes: 8 additions & 2 deletions libpod/oci_conmon_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return err
}

if ctr.config.CgroupsMode == cgroupSplit {
if err := utils.MoveUnderCgroupSubtree("supervisor"); err != nil {
return err
}
}

args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)

if ctr.config.Spec.Process.Terminal {
Expand Down Expand Up @@ -1173,7 +1179,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
"--socket-dir-path", r.socketsDir,
}

if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups {
if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit {
args = append(args, "-s")
}

Expand Down Expand Up @@ -1275,7 +1281,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec

// If cgroup creation is disabled - just signal.
switch ctr.config.CgroupsMode {
case "disabled", "no-conmon":
case "disabled", "no-conmon", cgroupSplit:
mustCreateCgroup = false
}

Expand Down
2 changes: 1 addition & 1 deletion libpod/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption {
case "disabled":
ctr.config.NoCgroups = true
ctr.config.CgroupsMode = mode
case "enabled", "no-conmon":
case "enabled", "no-conmon", cgroupSplit:
ctr.config.CgroupsMode = mode
default:
return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode)
Expand Down
4 changes: 2 additions & 2 deletions libpod/runtime_ctr.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai
return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID())
}
ctr.config.CgroupParent = podCgroup
case rootless.IsRootless():
case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent
default:
case ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultCgroupParent
}
} else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") {
Expand Down
122 changes: 122 additions & 0 deletions utils/utils_supported.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,20 @@
package utils

import (
"bufio"
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"

"github.com/containers/libpod/pkg/cgroups"
"github.com/containers/libpod/pkg/rootless"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

// RunUnderSystemdScope adds the specified pid to a systemd scope
Expand Down Expand Up @@ -43,6 +53,118 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return nil
}

// getCgroupProcess parses a /proc/<pid>/cgroup style file and returns the
// single cgroup path the process belongs to.
//
// Every line is expected to have the form "ID:controllers:path".  When the
// cgroup v2 entry ("0::path") is found its path is used and parsing stops.
// Otherwise all entries that are not in the root cgroup must report the same
// path, since the OCI runtime spec file allows only one path to be specified.
//
// An error is returned when the file cannot be opened or read, when a line
// cannot be parsed, when two entries report different non-root paths, or
// when no non-root cgroup is found.
func getCgroupProcess(procFile string) (string, error) {
	f, err := os.Open(procFile)
	if err != nil {
		return "", errors.Wrapf(err, "open file %q", procFile)
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	cgroup := "/"
	for scanner.Scan() {
		line := scanner.Text()
		// Split into at most three fields so that a ':' inside the
		// cgroup path (the last field) does not break the parsing.
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return "", errors.Errorf("cannot parse cgroup line %q", line)
		}
		// cgroup v2 entry: it takes precedence over any v1 hierarchy.
		if parts[0] == "0" && parts[1] == "" {
			cgroup = parts[2]
			break
		}
		// root cgroup, skip it
		if parts[2] == "/" {
			continue
		}
		// The process must have the same cgroup path for all controllers
		// The OCI runtime spec file allow us to specify only one path.
		if cgroup != "/" && cgroup != parts[2] {
			return "", errors.Errorf("cgroup configuration not supported, the process is in two different cgroups")
		}
		cgroup = parts[2]
	}
	// Scan() returns false both on EOF and on failure; do not mistake a
	// read error for the end of the file.
	if err := scanner.Err(); err != nil {
		return "", errors.Wrapf(err, "read file %q", procFile)
	}
	if cgroup == "/" {
		return "", errors.Errorf("could not find cgroup mount in %q", procFile)
	}
	return cgroup, nil
}

// GetOwnCgroup reports the cgroup path of the calling process, as read from
// /proc/self/cgroup.
func GetOwnCgroup() (string, error) {
	const selfCgroupFile = "/proc/self/cgroup"
	return getCgroupProcess(selfCgroupFile)
}

// GetCgroupProcess reports the cgroup path of the process identified by pid,
// as read from /proc/<pid>/cgroup.
func GetCgroupProcess(pid int) (string, error) {
	procFile := filepath.Join("/proc", fmt.Sprint(pid), "cgroup")
	return getCgroupProcess(procFile)
}

// MoveUnderCgroupSubtree moves every process of the current cgroup into a
// sub-cgroup named subtree.
//
// For each entry of /proc/self/cgroup that is not the root cgroup, the
// directory <cgroup>/<subtree> is created (an already existing directory is
// reused) and every PID listed in the cgroup.procs file of the current cgroup
// is written to the cgroup.procs file of the sub-cgroup.
//
// Failing to move a single PID is only logged as a warning; any other
// failure is returned to the caller.
func MoveUnderCgroupSubtree(subtree string) error {
	procFile := "/proc/self/cgroup"
	f, err := os.Open(procFile)
	if err != nil {
		return errors.Wrapf(err, "open file %q", procFile)
	}
	defer f.Close()

	unifiedMode, err := cgroups.IsCgroup2UnifiedMode()
	if err != nil {
		return err
	}

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		// Split into at most three fields so that a ':' inside the
		// cgroup path (the last field) does not break the parsing.
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return errors.Errorf("cannot parse cgroup line %q", line)
		}

		// root cgroup, skip it
		if parts[2] == "/" {
			continue
		}

		cgroupRoot := "/sys/fs/cgroup"
		// Special case the unified mount on hybrid cgroup and named hierarchies.
		// This works on Fedora 31, but we should really parse the mounts to see
		// where the cgroup hierarchy is mounted.
		if parts[1] == "" && !unifiedMode {
			// If it is not using unified mode, the cgroup v2 hierarchy is
			// usually mounted under /sys/fs/cgroup/unified
			cgroupRoot = filepath.Join(cgroupRoot, "unified")
		} else if parts[1] != "" {
			// Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER.
			controller := strings.TrimPrefix(parts[1], "name=")
			cgroupRoot = filepath.Join(cgroupRoot, controller)
		}

		if err := moveCgroupProcs(filepath.Join(cgroupRoot, parts[2]), subtree); err != nil {
			return err
		}
	}
	// Scan() returns false both on EOF and on failure; do not mistake a
	// read error for the end of the file.
	if err := scanner.Err(); err != nil {
		return errors.Wrapf(err, "read file %q", procFile)
	}
	return nil
}

// moveCgroupProcs moves the processes listed in cgroupDir/cgroup.procs to the
// sub-cgroup cgroupDir/subtree, creating the sub-cgroup when it is missing.
func moveCgroupProcs(cgroupDir, subtree string) error {
	processes, err := ioutil.ReadFile(filepath.Join(cgroupDir, "cgroup.procs"))
	if err != nil {
		return err
	}

	newCgroup := filepath.Join(cgroupDir, subtree)
	// Reuse the sub-cgroup if it is already there.
	if err := os.Mkdir(newCgroup, 0755); err != nil && !os.IsExist(err) {
		return err
	}

	procs, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755)
	if err != nil {
		return err
	}
	// Closed when this helper returns, i.e. once per hierarchy instead of
	// piling up open descriptors until the whole move is complete.
	defer procs.Close()

	for _, pid := range bytes.Split(processes, []byte("\n")) {
		// The file ends with a newline, so the last element is empty.
		if len(pid) == 0 {
			continue
		}
		// Each PID is written with its own write call.
		if _, err := procs.Write(pid); err != nil {
			logrus.Warnf("Cannot move process %s to cgroup %q: %v", pid, newCgroup, err)
		}
	}
	return nil
}

func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Expand Down
12 changes: 12 additions & 0 deletions utils/utils_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,15 @@ import "github.com/pkg/errors"
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return errors.New("not implemented for windows")
}

// MoveUnderCgroupSubtree is not implemented on Windows; it always returns
// an error.
func MoveUnderCgroupSubtree(subtree string) error {
return errors.New("not implemented for windows")
}

// GetOwnCgroup is not implemented on Windows; it always returns an empty
// path and an error.
func GetOwnCgroup() (string, error) {
return "", errors.New("not implemented for windows")
}

// GetCgroupProcess is not implemented on Windows; it always returns an
// empty path and an error.
func GetCgroupProcess(pid int) (string, error) {
return "", errors.New("not implemented for windows")
}

0 comments on commit 6ee5f74

Please sign in to comment.