From 556db46a68003047dbb18bab79af7dda008335f3 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 12:17:03 +0200 Subject: [PATCH 01/10] libpod: refactor code to new function move the code to remove the pod cgroup to a separate function. It is a preparation for the next patch. Signed-off-by: Giuseppe Scrivano --- libpod/runtime_pod_common.go | 113 ++++++++++++++++------------------- 1 file changed, 51 insertions(+), 62 deletions(-) diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go index 56c8a6153f..74100c4071 100644 --- a/libpod/runtime_pod_common.go +++ b/libpod/runtime_pod_common.go @@ -192,6 +192,51 @@ func (r *Runtime) removeMalformedPod(ctx context.Context, p *Pod, ctrs []*Contai return removedCtrs, nil } +func (p *Pod) removePodCgroup() error { + // Remove pod cgroup, if present + if p.state.CgroupPath == "" { + return nil + } + logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath) + + switch p.runtime.config.Engine.CgroupManager { + case config.SystemdCgroupsManager: + if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { + return fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) + } + case config.CgroupfsCgroupsManager: + // Delete the cgroupfs cgroup + // Make sure the conmon cgroup is deleted first + // Since the pod is almost gone, don't bother failing + // hard - instead, just log errors. + conmonCgroupPath := filepath.Join(p.state.CgroupPath, "conmon") + conmonCgroup, err := cgroups.Load(conmonCgroupPath) + if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { + return fmt.Errorf("retrieving pod %s conmon cgroup: %w", p.ID(), err) + } + if err == nil { + if err = conmonCgroup.Delete(); err != nil { + return fmt.Errorf("removing pod %s conmon cgroup: %w", p.ID(), err) + } + } + cgroup, err := cgroups.Load(p.state.CgroupPath) + if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { + return fmt.Errorf("retrieving pod %s cgroup: %w", p.ID(), err) + } + if err == nil { + if err := cgroup.Delete(); err != nil { + return fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) + } + } + default: + // This should be caught much earlier, but let's still + // keep going so we make sure to evict the pod before + // ending up with an inconsistent state. + return fmt.Errorf("unrecognized cgroup manager %s when removing pod %s cgroups: %w", p.runtime.config.Engine.CgroupManager, p.ID(), define.ErrInternal) + } + return nil +} + func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) (map[string]error, error) { removedCtrs := make(map[string]error) @@ -269,68 +314,12 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, } } - // Remove pod cgroup, if present - if p.state.CgroupPath != "" { - logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath) - - switch p.runtime.config.Engine.CgroupManager { - case config.SystemdCgroupsManager: - if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { - if removalErr == nil { - removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) - } else { - logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) - } - } - case config.CgroupfsCgroupsManager: - // Delete the cgroupfs cgroup - // Make sure the conmon cgroup is deleted first - // Since the pod is almost gone, don't bother failing - // hard - instead, just log errors. - conmonCgroupPath := filepath.Join(p.state.CgroupPath, "conmon") - conmonCgroup, err := cgroups.Load(conmonCgroupPath) - if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { - if removalErr == nil { - removalErr = fmt.Errorf("retrieving pod %s conmon cgroup: %w", p.ID(), err) - } else { - logrus.Debugf("Error retrieving pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) - } - } - if err == nil { - if err = conmonCgroup.Delete(); err != nil { - if removalErr == nil { - removalErr = fmt.Errorf("removing pod %s conmon cgroup: %w", p.ID(), err) - } else { - logrus.Errorf("Deleting pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) - } - } - } - cgroup, err := cgroups.Load(p.state.CgroupPath) - if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { - if removalErr == nil { - removalErr = fmt.Errorf("retrieving pod %s cgroup: %w", p.ID(), err) - } else { - logrus.Errorf("Retrieving pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) - } - } - if err == nil { - if err := cgroup.Delete(); err != nil { - if removalErr == nil { - removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) - } else { - logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) - } - } - } - default: - // This should be caught much earlier, but let's still - // keep going so we make sure to evict the pod before - // ending up with an inconsistent state. - if removalErr == nil { - removalErr = fmt.Errorf("unrecognized cgroup manager %s when removing pod %s cgroups: %w", p.runtime.config.Engine.CgroupManager, p.ID(), define.ErrInternal) - } else { - logrus.Errorf("Unknown cgroups manager %s specified - cannot remove pod %s cgroup", p.runtime.config.Engine.CgroupManager, p.ID()) - } + // Remove pod cgroup + if err := p.removePodCgroup(); err != nil { + if removalErr == nil { + removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) + } else { + logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) } } From 9a347619d8940e7df640584814195bd57f94adcf Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 8 Sep 2023 11:13:50 +0200 Subject: [PATCH 02/10] utils: export MoveUnderCgroup Signed-off-by: Giuseppe Scrivano --- utils/utils_supported.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/utils_supported.go b/utils/utils_supported.go index ecd0c4c024..fe6472066e 100644 --- a/utils/utils_supported.go +++ b/utils/utils_supported.go @@ -47,7 +47,7 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error { // On errors check if the cgroup already exists, if it does move the process there if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil { if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" { - if err := moveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil { + if err := MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil { return nil } // On errors return the original error message we got from StartTransientUnit. @@ -107,13 +107,13 @@ func GetCgroupProcess(pid int) (string, error) { // MoveUnderCgroupSubtree moves the PID under a cgroup subtree. func MoveUnderCgroupSubtree(subtree string) error { - return moveUnderCgroup("", subtree, nil) + return MoveUnderCgroup("", subtree, nil) } -// moveUnderCgroup moves a group of processes to a new cgroup. +// MoveUnderCgroup moves a group of processes to a new cgroup. // If cgroup is the empty string, then the current calling process cgroup is used. // If processes is empty, then the processes from the current cgroup are moved. -func moveUnderCgroup(cgroup, subtree string, processes []uint32) error { +func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { procFile := "/proc/self/cgroup" f, err := os.Open(procFile) if err != nil { From 627ac1c96bde5123596b5c3faa0d6e52fabc4b79 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 13:28:30 +0200 Subject: [PATCH 03/10] libpod: destroy pod cgroup on pod stop When the pod is stopped, we need to destroy the pod cgroup, otherwise it is leaked. Signed-off-by: Giuseppe Scrivano --- libpod/pod_api.go | 7 +++++++ libpod/runtime_pod_common.go | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/libpod/pod_api.go b/libpod/pod_api.go index 87c5c80386..8c6e4cb6b4 100644 --- a/libpod/pod_api.go +++ b/libpod/pod_api.go @@ -209,6 +209,13 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m return nil, err } + if err := p.updatePod(); err != nil { + return nil, err + } + if err := p.removePodCgroup(); err != nil { + return nil, err + } + return nil, nil } diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go index 74100c4071..d9f1f5276b 100644 --- a/libpod/runtime_pod_common.go +++ b/libpod/runtime_pod_common.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "path" "path/filepath" "github.com/containers/common/pkg/cgroups" @@ -14,6 +15,7 @@ import ( "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/specgen" + "github.com/containers/podman/v4/utils" "github.com/hashicorp/go-multierror" "github.com/sirupsen/logrus" ) @@ -199,6 +201,20 @@ func (p *Pod) removePodCgroup() error { } logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath) + cgroup, err := utils.GetOwnCgroup() + if err != nil { + return err + } + + // if we are trying to delete a cgroup that is our ancestor, we need to move the + // current process out of it before the cgroup is destroyed. + if isSubDir(cgroup, string(filepath.Separator)+p.state.CgroupPath) { + parent := path.Dir(p.state.CgroupPath) + if err := utils.MoveUnderCgroup(parent, "cleanup", nil); err != nil { + return err + } + } + switch p.runtime.config.Engine.CgroupManager { case config.SystemdCgroupsManager: if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { From 38209ef49d2f334ce37bf8179789e540e0024c85 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 15:41:20 +0200 Subject: [PATCH 04/10] libpod: refactor platformMakePod signature accept only the resources to be used by the pod, so that the function can more easily be used by a successive patch. Signed-off-by: Giuseppe Scrivano --- libpod/runtime_pod_common.go | 6 +++- libpod/runtime_pod_freebsd.go | 6 ++-- libpod/runtime_pod_linux.go | 59 +++++++++++++++++------------------ 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go index d9f1f5276b..4e1b1b7e71 100644 --- a/libpod/runtime_pod_common.go +++ b/libpod/runtime_pod_common.go @@ -58,9 +58,13 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option pod.valid = true - if err := r.platformMakePod(pod, p); err != nil { + parentCgroup, err := r.platformMakePod(pod, p.ResourceLimits) + if err != nil { return nil, err } + if p.InfraContainerSpec != nil { + p.InfraContainerSpec.CgroupParent = parentCgroup + } if !pod.HasInfraContainer() && pod.SharesNamespaces() { return nil, errors.New("Pods must have an infra container to share namespaces") diff --git a/libpod/runtime_pod_freebsd.go b/libpod/runtime_pod_freebsd.go index eb5315fc1a..7ec9bf026a 100644 --- a/libpod/runtime_pod_freebsd.go +++ b/libpod/runtime_pod_freebsd.go @@ -1,9 +1,9 @@ package libpod import ( - "github.com/containers/podman/v4/pkg/specgen" + spec "github.com/opencontainers/runtime-spec/specs-go" ) -func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error { - return nil +func (r *Runtime) platformMakePod(pod *Pod, resourceLimits *spec.LinuxResources) (string, error) { + return "", nil } diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index 830d9e4ef4..9d4a8641a3 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -10,11 +10,12 @@ import ( "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/specgen" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) -func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error { +func (r *Runtime) platformMakePod(pod *Pod, resourceLimits *spec.LinuxResources) (string, error) { + cgroupParent := "" // Check Cgroup parent sanity, and set it if it was not set if r.config.Cgroups() != "disabled" { switch r.config.Engine.CgroupManager { @@ -25,32 +26,30 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error { if pod.config.CgroupParent == "" { pod.config.CgroupParent = CgroupfsDefaultCgroupParent } else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") { - return fmt.Errorf("systemd slice received as cgroup parent when using cgroupfs: %w", define.ErrInvalidArg) + return "", fmt.Errorf("systemd slice received as cgroup parent when using cgroupfs: %w", define.ErrInvalidArg) } // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share if pod.config.UsePodCgroup { pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID()) - if p.InfraContainerSpec != nil { - p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath - // cgroupfs + rootless = permission denied when creating the cgroup. - if !rootless.IsRootless() { - res, err := GetLimits(p.ResourceLimits) - if err != nil { - return err - } - // Need to both create and update the cgroup - // rather than create a new path in c/common for pod cgroup creation - // just create as if it is a ctr and then update figures out that we need to - // populate the resource limits on the pod level - cgc, err := cgroups.New(pod.state.CgroupPath, &res) - if err != nil { - return err - } - err = cgc.Update(&res) - if err != nil { - return err - } + cgroupParent = pod.state.CgroupPath + // cgroupfs + rootless = permission denied when creating the cgroup. + if !rootless.IsRootless() { + res, err := GetLimits(resourceLimits) + if err != nil { + return "", err + } + // Need to both create and update the cgroup + // rather than create a new path in c/common for pod cgroup creation + // just create as if it is a ctr and then update figures out that we need to + // populate the resource limits on the pod level + cgc, err := cgroups.New(pod.state.CgroupPath, &res) + if err != nil { + return "", err + } + err = cgc.Update(&res) + if err != nil { + return "", err } } } @@ -63,22 +62,20 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error { pod.config.CgroupParent = SystemdDefaultCgroupParent } } else if len(pod.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") { - return fmt.Errorf("did not receive systemd slice as cgroup parent when using systemd to manage cgroups: %w", define.ErrInvalidArg) + return "", fmt.Errorf("did not receive systemd slice as cgroup parent when using systemd to manage cgroups: %w", define.ErrInvalidArg) } // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share if pod.config.UsePodCgroup { - cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.ResourceLimits) + cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), resourceLimits) if err != nil { - return fmt.Errorf("unable to create pod cgroup for pod %s: %w", pod.ID(), err) + return "", fmt.Errorf("unable to create pod cgroup for pod %s: %w", pod.ID(), err) } pod.state.CgroupPath = cgroupPath - if p.InfraContainerSpec != nil { - p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath - } + cgroupParent = pod.state.CgroupPath } default: - return fmt.Errorf("unsupported Cgroup manager: %s - cannot validate cgroup parent: %w", r.config.Engine.CgroupManager, define.ErrInvalidArg) + return "", fmt.Errorf("unsupported Cgroup manager: %s - cannot validate cgroup parent: %w", r.config.Engine.CgroupManager, define.ErrInvalidArg) } } @@ -86,5 +83,5 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error { logrus.Debugf("Got pod cgroup as %s", pod.state.CgroupPath) } - return nil + return cgroupParent, nil } From 5121c9eb0e0f461530f9a28fbfc0c35ce197b4b2 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 17:31:33 +0200 Subject: [PATCH 05/10] libpod: check if cgroup exists before creating it do not create the pod cgroup if it already exists. Signed-off-by: Giuseppe Scrivano --- libpod/util_linux.go | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/libpod/util_linux.go b/libpod/util_linux.go index 155bd91857..7f6e1472c6 100644 --- a/libpod/util_linux.go +++ b/libpod/util_linux.go @@ -5,6 +5,8 @@ package libpod import ( "fmt" + "os" + "path/filepath" "strings" "syscall" @@ -17,22 +19,36 @@ import ( "golang.org/x/sys/unix" ) +func cgroupExist(path string) bool { + cgroupv2, _ := cgroups.IsCgroup2UnifiedMode() + var fullPath string + if cgroupv2 { + fullPath = filepath.Join("/sys/fs/cgroup", path) + } else { + fullPath = filepath.Join("/sys/fs/cgroup/memory", path) + } + _, err := os.Stat(fullPath) + return err == nil +} + // systemdSliceFromPath makes a new systemd slice under the given parent with // the given name. // The parent must be a slice. The name must NOT include ".slice" func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) { - cgroupPath, err := assembleSystemdCgroupName(parent, name) + cgroupPath, systemdPath, err := assembleSystemdCgroupName(parent, name) if err != nil { return "", err } - logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name) + logrus.Debugf("Created cgroup path %s for parent %s and name %s", systemdPath, parent, name) - if err := makeSystemdCgroup(cgroupPath, resources); err != nil { - return "", fmt.Errorf("creating cgroup %s: %w", cgroupPath, err) + if !cgroupExist(cgroupPath) { + if err := makeSystemdCgroup(systemdPath, resources); err != nil { + return "", fmt.Errorf("creating cgroup %s: %w", cgroupPath, err) + } } - logrus.Debugf("Created cgroup %s", cgroupPath) + logrus.Debugf("Created cgroup %s", systemdPath) return cgroupPath, nil } @@ -88,19 +104,27 @@ func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error { } // assembleSystemdCgroupName creates a systemd cgroup path given a base and -// a new component to add. +// a new component to add. It also returns the path to the cgroup as it accessible +// below the cgroup mounts. // The base MUST be systemd slice (end in .slice) -func assembleSystemdCgroupName(baseSlice, newSlice string) (string, error) { +func assembleSystemdCgroupName(baseSlice, newSlice string) (string, string, error) { const sliceSuffix = ".slice" if !strings.HasSuffix(baseSlice, sliceSuffix) { - return "", fmt.Errorf("cannot assemble cgroup path with base %q - must end in .slice: %w", baseSlice, define.ErrInvalidArg) + return "", "", fmt.Errorf("cannot assemble cgroup path with base %q - must end in .slice: %w", baseSlice, define.ErrInvalidArg) } noSlice := strings.TrimSuffix(baseSlice, sliceSuffix) - final := fmt.Sprintf("%s/%s-%s%s", baseSlice, noSlice, newSlice, sliceSuffix) + systemdPath := fmt.Sprintf("%s/%s-%s%s", baseSlice, noSlice, newSlice, sliceSuffix) - return final, nil + if rootless.IsRootless() { + // When we run as rootless, the cgroup has a path like the following: + ///sys/fs/cgroup/user.slice/user-@$UID.slice/user@$UID.service/user.slice/user-libpod_pod_$POD_ID.slice + uid := rootless.GetRootlessUID() + raw := fmt.Sprintf("user.slice/%s-%d.slice/user@%d.service/%s/%s-%s%s", noSlice, uid, uid, baseSlice, noSlice, newSlice, sliceSuffix) + return raw, systemdPath, nil + } + return systemdPath, systemdPath, nil } var lvpRelabel = label.Relabel From 5de8f4aba003938f509fb81235d670fffa1e362e Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 21:11:34 +0200 Subject: [PATCH 06/10] libpod: allow cgroup path without infra container a pod can use cgroups without an infra container. Signed-off-by: Giuseppe Scrivano --- libpod/pod.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/libpod/pod.go b/libpod/pod.go index cdd0f3d715..0d4a591c2f 100644 --- a/libpod/pod.go +++ b/libpod/pod.go @@ -361,9 +361,6 @@ func (p *Pod) CgroupPath() (string, error) { if err := p.updatePod(); err != nil { return "", err } - if p.state.InfraContainerID == "" { - return "", fmt.Errorf("pod has no infra container: %w", define.ErrNoSuchCtr) - } return p.state.CgroupPath, nil } From 83334fb4e71f08b8aea8e8f709461a295aba27c8 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 21:24:06 +0200 Subject: [PATCH 07/10] specgen: allow --share-parent with --infra=false This allows to use --share-parent with --infra=false, so that the containers in the pod can share the parent cgroup. Signed-off-by: Giuseppe Scrivano --- pkg/specgen/generate/pod_create.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/specgen/generate/pod_create.go b/pkg/specgen/generate/pod_create.go index d8759a9314..a8136ba490 100644 --- a/pkg/specgen/generate/pod_create.go +++ b/pkg/specgen/generate/pod_create.go @@ -123,11 +123,12 @@ func createPodOptions(p *specgen.PodSpecGenerator) ([]libpod.PodCreateOption, er var ( options []libpod.PodCreateOption ) + + if p.ShareParent == nil || (p.ShareParent != nil && *p.ShareParent) { + options = append(options, libpod.WithPodParent()) + } if !p.NoInfra { options = append(options, libpod.WithInfraContainer()) - if p.ShareParent == nil || (p.ShareParent != nil && *p.ShareParent) { - options = append(options, libpod.WithPodParent()) - } nsOptions, err := GetNamespaceOptions(p.SharedNamespaces, p.InfraContainerSpec.NetNS.IsHost()) if err != nil { return nil, err From 331b3c216d68337ba9285542e0663f742dd5af5e Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 21:35:57 +0200 Subject: [PATCH 08/10] cmd, specgen: allow cgroup resources without --infra When the infra container is not created, we can still set limits on the pod cgroup. Signed-off-by: Giuseppe Scrivano --- cmd/podman/pods/create.go | 24 +++++++ pkg/specgen/generate/pod_create.go | 102 ++++++++++++++--------------- 2 files changed, 73 insertions(+), 53 deletions(-) diff --git a/cmd/podman/pods/create.go b/cmd/podman/pods/create.go index 5cc4505ff2..79c3524a6c 100644 --- a/cmd/podman/pods/create.go +++ b/cmd/podman/pods/create.go @@ -154,6 +154,15 @@ func create(cmd *cobra.Command, args []string) error { return fmt.Errorf("cannot set share(%s) namespaces without an infra container", cmd.Flag("share").Value) } createOptions.Share = nil + + infraOptions, err = containers.CreateInit(cmd, infraOptions, true) + if err != nil { + return err + } + err = common.ContainerToPodOptions(&infraOptions, &createOptions) + if err != nil { + return err + } } else { // reassign certain options for lbpod api, these need to be populated in spec flags := cmd.Flags() @@ -280,6 +289,21 @@ func create(cmd *cobra.Command, args []string) error { return err } podSpec.Name = podName + } else { + ctrSpec := specgen.NewSpecGenerator("", false) + err = specgenutil.FillOutSpecGen(ctrSpec, &infraOptions, []string{}) + if err != nil { + return err + } + // Marshall and Unmarshal the spec in order to map similar entities + wrapped, err := json.Marshal(ctrSpec) + if err != nil { + return err + } + err = json.Unmarshal(wrapped, podSpec) + if err != nil { + return err + } } PodSpec := entities.PodSpec{PodSpecGen: *podSpec} response, err := registry.ContainerEngine().PodCreate(context.Background(), PodSpec) diff --git a/pkg/specgen/generate/pod_create.go b/pkg/specgen/generate/pod_create.go index a8136ba490..e14ed00d66 100644 --- a/pkg/specgen/generate/pod_create.go +++ b/pkg/specgen/generate/pod_create.go @@ -44,28 +44,18 @@ func MakePod(p *entities.PodSpec, rt *libpod.Runtime) (_ *libpod.Pod, finalErr e p.PodSpecGen.InfraContainerSpec.RawImageName = imageName } - if !p.PodSpecGen.NoInfra && p.PodSpecGen.InfraContainerSpec != nil { - var err error - p.PodSpecGen.InfraContainerSpec, err = MapSpec(&p.PodSpecGen) - if err != nil { - return nil, err - } + spec, err := MapSpec(&p.PodSpecGen) + if err != nil { + return nil, err } - - if !p.PodSpecGen.NoInfra { - err := specgen.FinishThrottleDevices(p.PodSpecGen.InfraContainerSpec) - if err != nil { - return nil, err - } - if p.PodSpecGen.InfraContainerSpec.ResourceLimits != nil && - p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO != nil { - p.PodSpecGen.ResourceLimits.BlockIO = p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO - } - err = specgen.WeightDevices(p.PodSpecGen.InfraContainerSpec) - if err != nil { - return nil, err - } - p.PodSpecGen.ResourceLimits = p.PodSpecGen.InfraContainerSpec.ResourceLimits + if err := specgen.FinishThrottleDevices(spec); err != nil { + return nil, err + } + if err := specgen.WeightDevices(spec); err != nil { + return nil, err + } + if spec.ResourceLimits != nil && spec.ResourceLimits.BlockIO != nil { + p.PodSpecGen.ResourceLimits.BlockIO = spec.ResourceLimits.BlockIO } options, err := createPodOptions(&p.PodSpecGen) @@ -177,12 +167,18 @@ func createPodOptions(p *specgen.PodSpecGenerator) ([]libpod.PodCreateOption, er // MapSpec modifies the already filled Infra specgenerator, // replacing necessary values with those specified in pod creation func MapSpec(p *specgen.PodSpecGenerator) (*specgen.SpecGenerator, error) { + var spec *specgen.SpecGenerator + if p.InfraContainerSpec != nil { + spec = p.InfraContainerSpec + } else { + spec = &specgen.SpecGenerator{} + } if len(p.PortMappings) > 0 { ports, err := ParsePortMapping(p.PortMappings, nil) if err != nil { return nil, err } - p.InfraContainerSpec.PortMappings = ports + spec.PortMappings = ports } switch p.NetNS.NSMode { case specgen.Default, "": @@ -191,90 +187,90 @@ func MapSpec(p *specgen.PodSpecGenerator) (*specgen.SpecGenerator, error) { break } case specgen.Bridge: - p.InfraContainerSpec.NetNS.NSMode = specgen.Bridge + spec.NetNS.NSMode = specgen.Bridge logrus.Debugf("Pod using bridge network mode") case specgen.Private: - p.InfraContainerSpec.NetNS.NSMode = specgen.Private + spec.NetNS.NSMode = specgen.Private logrus.Debugf("Pod will use default network mode") case specgen.Host: logrus.Debugf("Pod will use host networking") - if len(p.InfraContainerSpec.PortMappings) > 0 || - len(p.InfraContainerSpec.Networks) > 0 || - p.InfraContainerSpec.NetNS.NSMode == specgen.NoNetwork { + if len(spec.PortMappings) > 0 || + len(spec.Networks) > 0 || + spec.NetNS.NSMode == specgen.NoNetwork { return nil, fmt.Errorf("cannot set host network if network-related configuration is specified: %w", define.ErrInvalidArg) } - p.InfraContainerSpec.NetNS.NSMode = specgen.Host + spec.NetNS.NSMode = specgen.Host case specgen.Slirp: logrus.Debugf("Pod will use slirp4netns") - if p.InfraContainerSpec.NetNS.NSMode != specgen.Host { - p.InfraContainerSpec.NetworkOptions = p.NetworkOptions - p.InfraContainerSpec.NetNS.NSMode = specgen.Slirp + if spec.NetNS.NSMode != specgen.Host { + spec.NetworkOptions = p.NetworkOptions + spec.NetNS.NSMode = specgen.Slirp } case specgen.Pasta: logrus.Debugf("Pod will use pasta") - if p.InfraContainerSpec.NetNS.NSMode != specgen.Host { - p.InfraContainerSpec.NetworkOptions = p.NetworkOptions - p.InfraContainerSpec.NetNS.NSMode = specgen.Pasta + if spec.NetNS.NSMode != specgen.Host { + spec.NetworkOptions = p.NetworkOptions + spec.NetNS.NSMode = specgen.Pasta } case specgen.Path: logrus.Debugf("Pod will use namespace path networking") - p.InfraContainerSpec.NetNS.NSMode = specgen.Path - p.InfraContainerSpec.NetNS.Value = p.PodNetworkConfig.NetNS.Value + spec.NetNS.NSMode = specgen.Path + spec.NetNS.Value = p.PodNetworkConfig.NetNS.Value case specgen.NoNetwork: logrus.Debugf("Pod will not use networking") - if len(p.InfraContainerSpec.PortMappings) > 0 || - len(p.InfraContainerSpec.Networks) > 0 || - p.InfraContainerSpec.NetNS.NSMode == specgen.Host { + if len(spec.PortMappings) > 0 || + len(spec.Networks) > 0 || + spec.NetNS.NSMode == specgen.Host { return nil, fmt.Errorf("cannot disable pod network if network-related configuration is specified: %w", define.ErrInvalidArg) } - p.InfraContainerSpec.NetNS.NSMode = specgen.NoNetwork + spec.NetNS.NSMode = specgen.NoNetwork default: return nil, fmt.Errorf("pods presently do not support network mode %s", p.NetNS.NSMode) } if len(p.InfraCommand) > 0 { - p.InfraContainerSpec.Entrypoint = p.InfraCommand + spec.Entrypoint = p.InfraCommand } if len(p.HostAdd) > 0 { - p.InfraContainerSpec.HostAdd = p.HostAdd + spec.HostAdd = p.HostAdd } if len(p.DNSServer) > 0 { var dnsServers []net.IP dnsServers = append(dnsServers, p.DNSServer...) - p.InfraContainerSpec.DNSServers = dnsServers + spec.DNSServers = dnsServers } if len(p.DNSOption) > 0 { - p.InfraContainerSpec.DNSOptions = p.DNSOption + spec.DNSOptions = p.DNSOption } if len(p.DNSSearch) > 0 { - p.InfraContainerSpec.DNSSearch = p.DNSSearch + spec.DNSSearch = p.DNSSearch } if p.NoManageResolvConf { - p.InfraContainerSpec.UseImageResolvConf = true + spec.UseImageResolvConf = true } if len(p.Networks) > 0 { - p.InfraContainerSpec.Networks = p.Networks + spec.Networks = p.Networks } // deprecated cni networks for api users if len(p.CNINetworks) > 0 { - p.InfraContainerSpec.CNINetworks = p.CNINetworks + spec.CNINetworks = p.CNINetworks } if p.NoManageHosts { - p.InfraContainerSpec.UseImageHosts = p.NoManageHosts + spec.UseImageHosts = p.NoManageHosts } if len(p.InfraConmonPidFile) > 0 { - p.InfraContainerSpec.ConmonPidFile = p.InfraConmonPidFile + spec.ConmonPidFile = p.InfraConmonPidFile } if p.Sysctl != nil && len(p.Sysctl) > 0 { - p.InfraContainerSpec.Sysctl = p.Sysctl + spec.Sysctl = p.Sysctl } - p.InfraContainerSpec.Image = p.InfraImage - return p.InfraContainerSpec, nil + spec.Image = p.InfraImage + return spec, nil } func PodConfigToSpec(rt *libpod.Runtime, spec *specgen.PodSpecGenerator, infraOptions *entities.ContainerCreateOptions, id string) (p *libpod.Pod, err error) { From b8f6a12d01e753fed4a1377ad912ae6191817d18 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 6 Sep 2023 17:29:23 +0200 Subject: [PATCH 09/10] libpod: create the cgroup pod before containers When a container is created and it is part of a pod, we ensure the pod cgroup exists so limits can be applied on the pod cgroup. Closes: https://github.com/containers/podman/issues/19175 Signed-off-by: Giuseppe Scrivano --- libpod/container_internal.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 1e665d27e5..4098296b5f 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -1028,6 +1028,19 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { shutdown.Inhibit() defer shutdown.Uninhibit() + // If the container is part of a pod, make sure the pod cgroup is created before the container + // so the limits can be applied. + if c.PodID() != "" { + pod, err := c.runtime.LookupPod(c.PodID()) + if err != nil { + return err + } + + if _, err := c.runtime.platformMakePod(pod, &pod.config.ResourceLimits); err != nil { + return err + } + } + // With the spec complete, do an OCI create if _, err = c.ociRuntime.CreateContainer(c, nil); err != nil { return err From 0c75eac63715d67cff08cc2f4df629460777f732 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 7 Sep 2023 13:39:29 +0200 Subject: [PATCH 10/10] tests: add test for pod cgroups This test checks that the pod cgroups are created and that the limits set for a pod cgroup are enforced also after a reboot. Signed-off-by: Giuseppe Scrivano --- test/system/200-pod.bats | 50 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/system/200-pod.bats b/test/system/200-pod.bats index c8ef3a80c4..ea2eeec5ef 100644 --- a/test/system/200-pod.bats +++ b/test/system/200-pod.bats @@ -689,4 +689,54 @@ function thingy_with_unique_id() { run_podman rm -f -a } + +@test "podman pod cleans cgroup and keeps limits" { + skip_if_remote "we cannot check cgroup settings" + skip_if_rootless_cgroupsv1 "rootless cannot use cgroups on v1" + + for infra in true false; do + run_podman pod create --infra=$infra --memory=256M + podid="$output" + run_podman run -d --pod $podid $IMAGE top -d 2 + + run_podman pod inspect $podid + result=$(jq -r .CgroupPath <<< $output) + assert "$result" =~ "/" ".CgroupPath is a valid path" + + if is_cgroupsv2; then + cgroup_path=/sys/fs/cgroup/$result + else + cgroup_path=/sys/fs/cgroup/memory/$result + fi + + if test ! -e $cgroup_path; then + die "the cgroup $cgroup_path does not exist" + fi + + run_podman pod stop -t 0 $podid + if test -e $cgroup_path; then + die "the cgroup $cgroup_path should not exist after pod stop" + fi + + run_podman pod start $podid + if test ! -e $cgroup_path; then + die "the cgroup $cgroup_path does not exist" + fi + + # validate that cgroup limits are in place after a restart + # issue #19175 + if is_cgroupsv2; then + memory_limit_file=$cgroup_path/memory.max + else + memory_limit_file=$cgroup_path/memory.limit_in_bytes + fi + assert "$(< $memory_limit_file)" = "268435456" "Contents of $memory_limit_file" + + run_podman pod rm -t 0 -f $podid + if test -e $cgroup_path; then + die "the cgroup $cgroup_path should not exist after pod rm" + fi + done +} + # vim: filetype=sh