Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

podman: add new cgroup mode split #6666

Merged
merged 1 commit into from
Jun 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/podman/common/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet {
createFlags.StringVar(
&cf.CGroupsMode,
"cgroups", containerConfig.Cgroups(),
`control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`,
`control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"split")`,
)
createFlags.StringVar(
&cf.CGroupParent,
Expand Down
4 changes: 4 additions & 0 deletions cmd/podman/containers/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ func createInit(c *cobra.Command) error {
cliVals.Env = env
}

if c.Flag("cgroups").Changed && cliVals.CGroupsMode == "split" && registry.IsRemote() {
return errors.Errorf("the option --cgroups=%q is not supported in remote mode", cliVals.CGroupsMode)
}

// Docker-compatibility: the "-h" flag for run/create is reserved for
// the hostname (see https://github.com/containers/libpod/issues/1367).

Expand Down
5 changes: 4 additions & 1 deletion docs/source/markdown/podman-create.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,12 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the
**--cgroups**=*mode*

Determines whether the container will create CGroups.
Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*.
Valid values are *enabled*, *disabled*, *no-conmon*, *split*, which the default being *enabled*.

The *enabled* option will create a new cgroup under the cgroup-parent.
The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The *no-conmon* option disables a new CGroup only for the conmon process.
The *split* option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set *--cgroup-parent* with *split*.

**--cgroup-parent**=*path*

Expand Down
10 changes: 6 additions & 4 deletions docs/source/markdown/podman-run.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,16 @@ Set the cgroup namespace mode for the container.

If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**.

**--cgroups**=**enabled**|**disabled**|**no-conmon**
**--cgroups**=**enabled**|**disabled**|**no-conmon**|**split**

Determines whether the container will create CGroups.

Default is **enabled**. The **disabled** option will force the container
to not create CGroups, and thus conflicts with CGroup options
(**--cgroupns** and **--cgroup-parent**).
Default is **enabled**.

The **enabled** option will create a new cgroup under the cgroup-parent.
The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The **no-conmon** option disables a new CGroup only for the **conmon** process.
The **split** option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with **split**.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really, really, really need to create a conmon man page that we can add a link to for pages like this. @haircommander any chance of making this happen, even if the page is terribly short and sweet?

This is not a holdback for this review at this time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


**--cgroup-parent**=*path*

Expand Down
22 changes: 19 additions & 3 deletions libpod/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/containers/libpod/libpod/lock"
"github.com/containers/libpod/pkg/namespaces"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/utils"
"github.com/containers/storage"
"github.com/cri-o/ocicni/pkg/ocicni"
spec "github.com/opencontainers/runtime-spec/specs-go"
Expand Down Expand Up @@ -1089,10 +1090,25 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in

// CGroupPath returns a cgroups "path" for a given container.
func (c *Container) CGroupPath() (string, error) {
switch c.runtime.config.Engine.CgroupManager {
case config.CgroupfsCgroupsManager:
switch {
case c.config.CgroupsMode == cgroupSplit:
if c.config.CgroupParent != "" {
return "", errors.Errorf("cannot specify cgroup-parent with cgroup-mode %q", cgroupSplit)
}
cg, err := utils.GetCgroupProcess(c.state.ConmonPID)
if err != nil {
return "", err
}
// Use the conmon cgroup for two reasons: we validate the container
// delegation was correct, and the conmon cgroup doesn't change at runtime
// while we are not sure about the container that can create sub cgroups.
if !strings.HasSuffix(cg, "supervisor") {
return "", errors.Errorf("invalid cgroup for conmon %q", cg)
}
return strings.TrimSuffix(cg, "/supervisor") + "/container", nil
case c.runtime.config.Engine.CgroupManager == config.CgroupfsCgroupsManager:
return filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())), nil
case config.SystemdCgroupsManager:
case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
if rootless.IsRootless() {
uid := rootless.GetRootlessUID()
parts := strings.SplitN(c.config.CgroupParent, "/", 2)
Expand Down
12 changes: 11 additions & 1 deletion libpod/container_internal_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/containers/libpod/pkg/resolvconf"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/pkg/util"
"github.com/containers/libpod/utils"
"github.com/containers/storage/pkg/archive"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/user"
Expand Down Expand Up @@ -1505,8 +1506,17 @@ func (c *Container) getOCICgroupPath() (string, error) {
switch {
case (rootless.IsRootless() && !unified) || c.config.NoCgroups:
return "", nil
case c.config.CgroupsMode == cgroupSplit:
if c.config.CgroupParent != "" {
return c.config.CgroupParent, nil
}
selfCgroup, err := utils.GetOwnCgroup()
if err != nil {
return "", err
}
return filepath.Join(selfCgroup, "container"), nil
case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
// When runc is set to use Systemd as a cgroup manager, it
// When the OCI runtime is set to use Systemd as a cgroup manager, it
// expects cgroups to be passed as follows:
// slice:prefix:name
systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
Expand Down
4 changes: 4 additions & 0 deletions libpod/container_validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ func (c *Container) validate() error {
return errors.Wrapf(define.ErrInvalidArg, "cannot both create a network namespace and join another container's network namespace")
}

if c.config.CgroupsMode == cgroupSplit && c.config.CgroupParent != "" {
return errors.Wrapf(define.ErrInvalidArg, "cannot specify --cgroup-mode=split with a cgroup-parent")
}

// Not creating cgroups has a number of requirements, mostly related to
// the PID namespace.
if c.config.NoCgroups || c.config.CgroupsMode == "disabled" {
Expand Down
7 changes: 7 additions & 0 deletions libpod/oci_conmon.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package libpod

const (
// cgroupSplit is the cgroup mode for reusing the current cgroup both
// for conmon and for the container payload.
cgroupSplit = "split"
)
10 changes: 8 additions & 2 deletions libpod/oci_conmon_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return err
}

if ctr.config.CgroupsMode == cgroupSplit {
if err := utils.MoveUnderCgroupSubtree("supervisor"); err != nil {
return err
}
}

args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)

if ctr.config.Spec.Process.Terminal {
Expand Down Expand Up @@ -1173,7 +1179,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
"--socket-dir-path", r.socketsDir,
}

if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups {
if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit {
args = append(args, "-s")
}

Expand Down Expand Up @@ -1275,7 +1281,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec

// If cgroup creation is disabled - just signal.
switch ctr.config.CgroupsMode {
case "disabled", "no-conmon":
case "disabled", "no-conmon", cgroupSplit:
mustCreateCgroup = false
}

Expand Down
2 changes: 1 addition & 1 deletion libpod/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption {
case "disabled":
ctr.config.NoCgroups = true
ctr.config.CgroupsMode = mode
case "enabled", "no-conmon":
case "enabled", "no-conmon", cgroupSplit:
ctr.config.CgroupsMode = mode
default:
return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode)
Expand Down
4 changes: 2 additions & 2 deletions libpod/runtime_ctr.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai
return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID())
}
ctr.config.CgroupParent = podCgroup
case rootless.IsRootless():
case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent
default:
case ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultCgroupParent
}
} else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") {
Expand Down
122 changes: 122 additions & 0 deletions utils/utils_supported.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,20 @@
package utils

import (
"bufio"
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"

"github.com/containers/libpod/pkg/cgroups"
"github.com/containers/libpod/pkg/rootless"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

// RunUnderSystemdScope adds the specified pid to a systemd scope
Expand Down Expand Up @@ -43,6 +53,118 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return nil
}

func getCgroupProcess(procFile string) (string, error) {
f, err := os.Open(procFile)
if err != nil {
return "", errors.Wrapf(err, "open file %q", procFile)
}
defer f.Close()

scanner := bufio.NewScanner(f)
cgroup := "/"
for scanner.Scan() {
line := scanner.Text()
parts := strings.Split(line, ":")
if len(parts) != 3 {
return "", errors.Errorf("cannot parse cgroup line %q", line)
}
if strings.HasPrefix(line, "0::") {
cgroup = line[3:]
break
}
// root cgroup, skip it
if parts[2] == "/" {
continue
}
// The process must have the same cgroup path for all controllers
// The OCI runtime spec file allow us to specify only one path.
if cgroup != "/" && cgroup != parts[2] {
return "", errors.Errorf("cgroup configuration not supported, the process is in two different cgroups")
}
cgroup = parts[2]
}
if cgroup == "/" {
return "", errors.Errorf("could not find cgroup mount in %q", procFile)
}
return cgroup, nil
}

// GetOwnCgroup returns the cgroup for the current process.
func GetOwnCgroup() (string, error) {
return getCgroupProcess("/proc/self/cgroup")
}

// GetCgroupProcess returns the cgroup for the specified process process.
func GetCgroupProcess(pid int) (string, error) {
return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid))
}

// MoveUnderCgroupSubtree moves the PID under a cgroup subtree.
func MoveUnderCgroupSubtree(subtree string) error {
procFile := "/proc/self/cgroup"
f, err := os.Open(procFile)
if err != nil {
return errors.Wrapf(err, "open file %q", procFile)
}
defer f.Close()

unifiedMode, err := cgroups.IsCgroup2UnifiedMode()
if err != nil {
return err
}

scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Split(line, ":")
if len(parts) != 3 {
return errors.Errorf("cannot parse cgroup line %q", line)
}

// root cgroup, skip it
if parts[2] == "/" {
continue
}

cgroupRoot := "/sys/fs/cgroup"
// Special case the unified mount on hybrid cgroup and named hierarchies.
// This works on Fedora 31, but we should really parse the mounts to see
// where the cgroup hierarchy is mounted.
if parts[1] == "" && !unifiedMode {
// If it is not using unified mode, the cgroup v2 hierarchy is
// usually mounted under /sys/fs/cgroup/unified
cgroupRoot = filepath.Join(cgroupRoot, "unified")
} else if parts[1] != "" {
// Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER.
controller := strings.TrimPrefix(parts[1], "name=")
cgroupRoot = filepath.Join(cgroupRoot, controller)
}

processes, err := ioutil.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs"))
if err != nil {
return err
}

newCgroup := filepath.Join(cgroupRoot, parts[2], subtree)
if err := os.Mkdir(newCgroup, 0755); err != nil {
return err
}

f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755)
if err != nil {
return err
}
defer f.Close()

for _, pid := range bytes.Split(processes, []byte("\n")) {
if _, err := f.Write(pid); err != nil {
logrus.Warnf("Cannot move process %s to cgroup %q", pid, newCgroup)
}
}
}
return nil
}

func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Expand Down
12 changes: 12 additions & 0 deletions utils/utils_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,15 @@ import "github.com/pkg/errors"
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return errors.New("not implemented for windows")
}

func MoveUnderCgroupSubtree(subtree string) error {
return errors.New("not implemented for windows")
}

func GetOwnCgroup() (string, error) {
return "", errors.New("not implemented for windows")
}

func GetCgroupProcess(pid int) (string, error) {
return "", errors.New("not implemented for windows")
}