From 556db46a68003047dbb18bab79af7dda008335f3 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 12:17:03 +0200
Subject: [PATCH 01/10] libpod: refactor code to new function

move the code to remove the pod cgroup to a separate function.  It is
a preparation for the next patch.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/runtime_pod_common.go | 113 ++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 62 deletions(-)

diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go
index 56c8a6153f..74100c4071 100644
--- a/libpod/runtime_pod_common.go
+++ b/libpod/runtime_pod_common.go
@@ -192,6 +192,51 @@ func (r *Runtime) removeMalformedPod(ctx context.Context, p *Pod, ctrs []*Contai
 	return removedCtrs, nil
 }
 
+func (p *Pod) removePodCgroup() error {
+	// Remove pod cgroup, if present
+	if p.state.CgroupPath == "" {
+		return nil
+	}
+	logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath)
+
+	switch p.runtime.config.Engine.CgroupManager {
+	case config.SystemdCgroupsManager:
+		if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil {
+			return fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err)
+		}
+	case config.CgroupfsCgroupsManager:
+		// Delete the cgroupfs cgroup
+		// Make sure the conmon cgroup is deleted first
+		// Since the pod is almost gone, don't bother failing
+		// hard - instead, just log errors.
+		conmonCgroupPath := filepath.Join(p.state.CgroupPath, "conmon")
+		conmonCgroup, err := cgroups.Load(conmonCgroupPath)
+		if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless {
+			return fmt.Errorf("retrieving pod %s conmon cgroup: %w", p.ID(), err)
+		}
+		if err == nil {
+			if err = conmonCgroup.Delete(); err != nil {
+				return fmt.Errorf("removing pod %s conmon cgroup: %w", p.ID(), err)
+			}
+		}
+		cgroup, err := cgroups.Load(p.state.CgroupPath)
+		if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless {
+			return fmt.Errorf("retrieving pod %s cgroup: %w", p.ID(), err)
+		}
+		if err == nil {
+			if err := cgroup.Delete(); err != nil {
+				return fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err)
+			}
+		}
+	default:
+		// This should be caught much earlier, but let's still
+		// keep going so we make sure to evict the pod before
+		// ending up with an inconsistent state.
+		return fmt.Errorf("unrecognized cgroup manager %s when removing pod %s cgroups: %w", p.runtime.config.Engine.CgroupManager, p.ID(), define.ErrInternal)
+	}
+	return nil
+}
+
 func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) (map[string]error, error) {
 	removedCtrs := make(map[string]error)
 
@@ -269,68 +314,12 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
 		}
 	}
 
-	// Remove pod cgroup, if present
-	if p.state.CgroupPath != "" {
-		logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath)
-
-		switch p.runtime.config.Engine.CgroupManager {
-		case config.SystemdCgroupsManager:
-			if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil {
-				if removalErr == nil {
-					removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err)
-				} else {
-					logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err)
-				}
-			}
-		case config.CgroupfsCgroupsManager:
-			// Delete the cgroupfs cgroup
-			// Make sure the conmon cgroup is deleted first
-			// Since the pod is almost gone, don't bother failing
-			// hard - instead, just log errors.
-			conmonCgroupPath := filepath.Join(p.state.CgroupPath, "conmon")
-			conmonCgroup, err := cgroups.Load(conmonCgroupPath)
-			if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless {
-				if removalErr == nil {
-					removalErr = fmt.Errorf("retrieving pod %s conmon cgroup: %w", p.ID(), err)
-				} else {
-					logrus.Debugf("Error retrieving pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err)
-				}
-			}
-			if err == nil {
-				if err = conmonCgroup.Delete(); err != nil {
-					if removalErr == nil {
-						removalErr = fmt.Errorf("removing pod %s conmon cgroup: %w", p.ID(), err)
-					} else {
-						logrus.Errorf("Deleting pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err)
-					}
-				}
-			}
-			cgroup, err := cgroups.Load(p.state.CgroupPath)
-			if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless {
-				if removalErr == nil {
-					removalErr = fmt.Errorf("retrieving pod %s cgroup: %w", p.ID(), err)
-				} else {
-					logrus.Errorf("Retrieving pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err)
-				}
-			}
-			if err == nil {
-				if err := cgroup.Delete(); err != nil {
-					if removalErr == nil {
-						removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err)
-					} else {
-						logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err)
-					}
-				}
-			}
-		default:
-			// This should be caught much earlier, but let's still
-			// keep going so we make sure to evict the pod before
-			// ending up with an inconsistent state.
-			if removalErr == nil {
-				removalErr = fmt.Errorf("unrecognized cgroup manager %s when removing pod %s cgroups: %w", p.runtime.config.Engine.CgroupManager, p.ID(), define.ErrInternal)
-			} else {
-				logrus.Errorf("Unknown cgroups manager %s specified - cannot remove pod %s cgroup", p.runtime.config.Engine.CgroupManager, p.ID())
-			}
+	// Remove pod cgroup
+	if err := p.removePodCgroup(); err != nil {
+		if removalErr == nil {
+			removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err)
+		} else {
+			logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err)
 		}
 	}
 

From 9a347619d8940e7df640584814195bd57f94adcf Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Fri, 8 Sep 2023 11:13:50 +0200
Subject: [PATCH 02/10] utils: export MoveUnderCgroup

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 utils/utils_supported.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/utils_supported.go b/utils/utils_supported.go
index ecd0c4c024..fe6472066e 100644
--- a/utils/utils_supported.go
+++ b/utils/utils_supported.go
@@ -47,7 +47,7 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error {
 		// On errors check if the cgroup already exists, if it does move the process there
 		if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil {
 			if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" {
-				if err := moveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil {
+				if err := MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil {
 					return nil
 				}
 				// On errors return the original error message we got from StartTransientUnit.
@@ -107,13 +107,13 @@ func GetCgroupProcess(pid int) (string, error) {
 
 // MoveUnderCgroupSubtree moves the PID under a cgroup subtree.
 func MoveUnderCgroupSubtree(subtree string) error {
-	return moveUnderCgroup("", subtree, nil)
+	return MoveUnderCgroup("", subtree, nil)
 }
 
-// moveUnderCgroup moves a group of processes to a new cgroup.
+// MoveUnderCgroup moves a group of processes to a new cgroup.
 // If cgroup is the empty string, then the current calling process cgroup is used.
 // If processes is empty, then the processes from the current cgroup are moved.
-func moveUnderCgroup(cgroup, subtree string, processes []uint32) error {
+func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error {
 	procFile := "/proc/self/cgroup"
 	f, err := os.Open(procFile)
 	if err != nil {

From 627ac1c96bde5123596b5c3faa0d6e52fabc4b79 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 13:28:30 +0200
Subject: [PATCH 03/10] libpod: destroy pod cgroup on pod stop

When the pod is stopped, we need to destroy the pod cgroup, otherwise
it is leaked.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/pod_api.go            |  7 +++++++
 libpod/runtime_pod_common.go | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/libpod/pod_api.go b/libpod/pod_api.go
index 87c5c80386..8c6e4cb6b4 100644
--- a/libpod/pod_api.go
+++ b/libpod/pod_api.go
@@ -209,6 +209,13 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
 		return nil, err
 	}
 
+	if err := p.updatePod(); err != nil {
+		return nil, err
+	}
+	if err := p.removePodCgroup(); err != nil {
+		return nil, err
+	}
+
 	return nil, nil
 }
 
diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go
index 74100c4071..d9f1f5276b 100644
--- a/libpod/runtime_pod_common.go
+++ b/libpod/runtime_pod_common.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"path"
 	"path/filepath"
 
 	"github.com/containers/common/pkg/cgroups"
@@ -14,6 +15,7 @@ import (
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/libpod/events"
 	"github.com/containers/podman/v4/pkg/specgen"
+	"github.com/containers/podman/v4/utils"
 	"github.com/hashicorp/go-multierror"
 	"github.com/sirupsen/logrus"
 )
@@ -199,6 +201,20 @@ func (p *Pod) removePodCgroup() error {
 	}
 	logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath)
 
+	cgroup, err := utils.GetOwnCgroup()
+	if err != nil {
+		return err
+	}
+
+	// if we are trying to delete a cgroup that is our ancestor, we need to move the
+	// current process out of it before the cgroup is destroyed.
+	if isSubDir(cgroup, string(filepath.Separator)+p.state.CgroupPath) {
+		parent := path.Dir(p.state.CgroupPath)
+		if err := utils.MoveUnderCgroup(parent, "cleanup", nil); err != nil {
+			return err
+		}
+	}
+
 	switch p.runtime.config.Engine.CgroupManager {
 	case config.SystemdCgroupsManager:
 		if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil {

From 38209ef49d2f334ce37bf8179789e540e0024c85 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 15:41:20 +0200
Subject: [PATCH 04/10] libpod: refactor platformMakePod signature

accept only the resources to be used by the pod, so that the function
can more easily be used by a successive patch.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/runtime_pod_common.go  |  6 +++-
 libpod/runtime_pod_freebsd.go |  6 ++--
 libpod/runtime_pod_linux.go   | 59 +++++++++++++++++------------------
 3 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/libpod/runtime_pod_common.go b/libpod/runtime_pod_common.go
index d9f1f5276b..4e1b1b7e71 100644
--- a/libpod/runtime_pod_common.go
+++ b/libpod/runtime_pod_common.go
@@ -58,9 +58,13 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
 
 	pod.valid = true
 
-	if err := r.platformMakePod(pod, p); err != nil {
+	parentCgroup, err := r.platformMakePod(pod, p.ResourceLimits)
+	if err != nil {
 		return nil, err
 	}
+	if p.InfraContainerSpec != nil {
+		p.InfraContainerSpec.CgroupParent = parentCgroup
+	}
 
 	if !pod.HasInfraContainer() && pod.SharesNamespaces() {
 		return nil, errors.New("Pods must have an infra container to share namespaces")
diff --git a/libpod/runtime_pod_freebsd.go b/libpod/runtime_pod_freebsd.go
index eb5315fc1a..7ec9bf026a 100644
--- a/libpod/runtime_pod_freebsd.go
+++ b/libpod/runtime_pod_freebsd.go
@@ -1,9 +1,9 @@
 package libpod
 
 import (
-	"github.com/containers/podman/v4/pkg/specgen"
+	spec "github.com/opencontainers/runtime-spec/specs-go"
 )
 
-func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error {
-	return nil
+func (r *Runtime) platformMakePod(pod *Pod, resourceLimits *spec.LinuxResources) (string, error) {
+	return "", nil
 }
diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go
index 830d9e4ef4..9d4a8641a3 100644
--- a/libpod/runtime_pod_linux.go
+++ b/libpod/runtime_pod_linux.go
@@ -10,11 +10,12 @@ import (
 	"github.com/containers/common/pkg/config"
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/pkg/rootless"
-	"github.com/containers/podman/v4/pkg/specgen"
+	spec "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 )
 
-func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error {
+func (r *Runtime) platformMakePod(pod *Pod, resourceLimits *spec.LinuxResources) (string, error) {
+	cgroupParent := ""
 	// Check Cgroup parent sanity, and set it if it was not set
 	if r.config.Cgroups() != "disabled" {
 		switch r.config.Engine.CgroupManager {
@@ -25,32 +26,30 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error {
 				if pod.config.CgroupParent == "" {
 					pod.config.CgroupParent = CgroupfsDefaultCgroupParent
 				} else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") {
-					return fmt.Errorf("systemd slice received as cgroup parent when using cgroupfs: %w", define.ErrInvalidArg)
+					return "", fmt.Errorf("systemd slice received as cgroup parent when using cgroupfs: %w", define.ErrInvalidArg)
 				}
 				// If we are set to use pod cgroups, set the cgroup parent that
 				// all containers in the pod will share
 				if pod.config.UsePodCgroup {
 					pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID())
-					if p.InfraContainerSpec != nil {
-						p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath
-						// cgroupfs + rootless = permission denied when creating the cgroup.
-						if !rootless.IsRootless() {
-							res, err := GetLimits(p.ResourceLimits)
-							if err != nil {
-								return err
-							}
-							// Need to both create and update the cgroup
-							// rather than create a new path in c/common for pod cgroup creation
-							// just create as if it is a ctr and then update figures out that we need to
-							// populate the resource limits on the pod level
-							cgc, err := cgroups.New(pod.state.CgroupPath, &res)
-							if err != nil {
-								return err
-							}
-							err = cgc.Update(&res)
-							if err != nil {
-								return err
-							}
+					cgroupParent = pod.state.CgroupPath
+					// cgroupfs + rootless = permission denied when creating the cgroup.
+					if !rootless.IsRootless() {
+						res, err := GetLimits(resourceLimits)
+						if err != nil {
+							return "", err
+						}
+						// Need to both create and update the cgroup
+						// rather than create a new path in c/common for pod cgroup creation
+						// just create as if it is a ctr and then update figures out that we need to
+						// populate the resource limits on the pod level
+						cgc, err := cgroups.New(pod.state.CgroupPath, &res)
+						if err != nil {
+							return "", err
+						}
+						err = cgc.Update(&res)
+						if err != nil {
+							return "", err
 						}
 					}
 				}
@@ -63,22 +62,20 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error {
 					pod.config.CgroupParent = SystemdDefaultCgroupParent
 				}
 			} else if len(pod.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") {
-				return fmt.Errorf("did not receive systemd slice as cgroup parent when using systemd to manage cgroups: %w", define.ErrInvalidArg)
+				return "", fmt.Errorf("did not receive systemd slice as cgroup parent when using systemd to manage cgroups: %w", define.ErrInvalidArg)
 			}
 			// If we are set to use pod cgroups, set the cgroup parent that
 			// all containers in the pod will share
 			if pod.config.UsePodCgroup {
-				cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.ResourceLimits)
+				cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), resourceLimits)
 				if err != nil {
-					return fmt.Errorf("unable to create pod cgroup for pod %s: %w", pod.ID(), err)
+					return "", fmt.Errorf("unable to create pod cgroup for pod %s: %w", pod.ID(), err)
 				}
 				pod.state.CgroupPath = cgroupPath
-				if p.InfraContainerSpec != nil {
-					p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath
-				}
+				cgroupParent = pod.state.CgroupPath
 			}
 		default:
-			return fmt.Errorf("unsupported Cgroup manager: %s - cannot validate cgroup parent: %w", r.config.Engine.CgroupManager, define.ErrInvalidArg)
+			return "", fmt.Errorf("unsupported Cgroup manager: %s - cannot validate cgroup parent: %w", r.config.Engine.CgroupManager, define.ErrInvalidArg)
 		}
 	}
 
@@ -86,5 +83,5 @@ func (r *Runtime) platformMakePod(pod *Pod, p specgen.PodSpecGenerator) error {
 		logrus.Debugf("Got pod cgroup as %s", pod.state.CgroupPath)
 	}
 
-	return nil
+	return cgroupParent, nil
 }

From 5121c9eb0e0f461530f9a28fbfc0c35ce197b4b2 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 17:31:33 +0200
Subject: [PATCH 05/10] libpod: check if cgroup exists before creating it

do not create the pod cgroup if it already exists.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/util_linux.go | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/libpod/util_linux.go b/libpod/util_linux.go
index 155bd91857..7f6e1472c6 100644
--- a/libpod/util_linux.go
+++ b/libpod/util_linux.go
@@ -5,6 +5,8 @@ package libpod
 
 import (
 	"fmt"
+	"os"
+	"path/filepath"
 	"strings"
 	"syscall"
 
@@ -17,22 +19,36 @@ import (
 	"golang.org/x/sys/unix"
 )
 
+func cgroupExist(path string) bool {
+	cgroupv2, _ := cgroups.IsCgroup2UnifiedMode()
+	var fullPath string
+	if cgroupv2 {
+		fullPath = filepath.Join("/sys/fs/cgroup", path)
+	} else {
+		fullPath = filepath.Join("/sys/fs/cgroup/memory", path)
+	}
+	_, err := os.Stat(fullPath)
+	return err == nil
+}
+
 // systemdSliceFromPath makes a new systemd slice under the given parent with
 // the given name.
 // The parent must be a slice. The name must NOT include ".slice"
 func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) {
-	cgroupPath, err := assembleSystemdCgroupName(parent, name)
+	cgroupPath, systemdPath, err := assembleSystemdCgroupName(parent, name)
 	if err != nil {
 		return "", err
 	}
 
-	logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name)
+	logrus.Debugf("Created cgroup path %s for parent %s and name %s", systemdPath, parent, name)
 
-	if err := makeSystemdCgroup(cgroupPath, resources); err != nil {
-		return "", fmt.Errorf("creating cgroup %s: %w", cgroupPath, err)
+	if !cgroupExist(cgroupPath) {
+		if err := makeSystemdCgroup(systemdPath, resources); err != nil {
+			return "", fmt.Errorf("creating cgroup %s: %w", cgroupPath, err)
+		}
 	}
 
-	logrus.Debugf("Created cgroup %s", cgroupPath)
+	logrus.Debugf("Created cgroup %s", systemdPath)
 
 	return cgroupPath, nil
 }
@@ -88,19 +104,27 @@ func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error {
 }
 
 // assembleSystemdCgroupName creates a systemd cgroup path given a base and
-// a new component to add.
+// a new component to add.  It also returns the path to the cgroup as it accessible
+// below the cgroup mounts.
 // The base MUST be systemd slice (end in .slice)
-func assembleSystemdCgroupName(baseSlice, newSlice string) (string, error) {
+func assembleSystemdCgroupName(baseSlice, newSlice string) (string, string, error) {
 	const sliceSuffix = ".slice"
 
 	if !strings.HasSuffix(baseSlice, sliceSuffix) {
-		return "", fmt.Errorf("cannot assemble cgroup path with base %q - must end in .slice: %w", baseSlice, define.ErrInvalidArg)
+		return "", "", fmt.Errorf("cannot assemble cgroup path with base %q - must end in .slice: %w", baseSlice, define.ErrInvalidArg)
 	}
 
 	noSlice := strings.TrimSuffix(baseSlice, sliceSuffix)
-	final := fmt.Sprintf("%s/%s-%s%s", baseSlice, noSlice, newSlice, sliceSuffix)
+	systemdPath := fmt.Sprintf("%s/%s-%s%s", baseSlice, noSlice, newSlice, sliceSuffix)
 
-	return final, nil
+	if rootless.IsRootless() {
+		// When we run as rootless, the cgroup has a path like the following:
+		///sys/fs/cgroup/user.slice/user-@$UID.slice/user@$UID.service/user.slice/user-libpod_pod_$POD_ID.slice
+		uid := rootless.GetRootlessUID()
+		raw := fmt.Sprintf("user.slice/%s-%d.slice/user@%d.service/%s/%s-%s%s", noSlice, uid, uid, baseSlice, noSlice, newSlice, sliceSuffix)
+		return raw, systemdPath, nil
+	}
+	return systemdPath, systemdPath, nil
 }
 
 var lvpRelabel = label.Relabel

From 5de8f4aba003938f509fb81235d670fffa1e362e Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 21:11:34 +0200
Subject: [PATCH 06/10] libpod: allow cgroup path without infra container

a pod can use cgroups without an infra container.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/pod.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/libpod/pod.go b/libpod/pod.go
index cdd0f3d715..0d4a591c2f 100644
--- a/libpod/pod.go
+++ b/libpod/pod.go
@@ -361,9 +361,6 @@ func (p *Pod) CgroupPath() (string, error) {
 	if err := p.updatePod(); err != nil {
 		return "", err
 	}
-	if p.state.InfraContainerID == "" {
-		return "", fmt.Errorf("pod has no infra container: %w", define.ErrNoSuchCtr)
-	}
 	return p.state.CgroupPath, nil
 }
 

From 83334fb4e71f08b8aea8e8f709461a295aba27c8 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 21:24:06 +0200
Subject: [PATCH 07/10] specgen: allow --share-parent with --infra=false

This allows to use --share-parent with --infra=false, so that the
containers in the pod can share the parent cgroup.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 pkg/specgen/generate/pod_create.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pkg/specgen/generate/pod_create.go b/pkg/specgen/generate/pod_create.go
index d8759a9314..a8136ba490 100644
--- a/pkg/specgen/generate/pod_create.go
+++ b/pkg/specgen/generate/pod_create.go
@@ -123,11 +123,12 @@ func createPodOptions(p *specgen.PodSpecGenerator) ([]libpod.PodCreateOption, er
 	var (
 		options []libpod.PodCreateOption
 	)
+
+	if p.ShareParent == nil || (p.ShareParent != nil && *p.ShareParent) {
+		options = append(options, libpod.WithPodParent())
+	}
 	if !p.NoInfra {
 		options = append(options, libpod.WithInfraContainer())
-		if p.ShareParent == nil || (p.ShareParent != nil && *p.ShareParent) {
-			options = append(options, libpod.WithPodParent())
-		}
 		nsOptions, err := GetNamespaceOptions(p.SharedNamespaces, p.InfraContainerSpec.NetNS.IsHost())
 		if err != nil {
 			return nil, err

From 331b3c216d68337ba9285542e0663f742dd5af5e Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 21:35:57 +0200
Subject: [PATCH 08/10] cmd, specgen: allow cgroup resources without --infra

When the infra container is not created, we can still set limits on
the pod cgroup.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 cmd/podman/pods/create.go          |  24 +++++++
 pkg/specgen/generate/pod_create.go | 102 ++++++++++++++---------------
 2 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/cmd/podman/pods/create.go b/cmd/podman/pods/create.go
index 5cc4505ff2..79c3524a6c 100644
--- a/cmd/podman/pods/create.go
+++ b/cmd/podman/pods/create.go
@@ -154,6 +154,15 @@ func create(cmd *cobra.Command, args []string) error {
 			return fmt.Errorf("cannot set share(%s) namespaces without an infra container", cmd.Flag("share").Value)
 		}
 		createOptions.Share = nil
+
+		infraOptions, err = containers.CreateInit(cmd, infraOptions, true)
+		if err != nil {
+			return err
+		}
+		err = common.ContainerToPodOptions(&infraOptions, &createOptions)
+		if err != nil {
+			return err
+		}
 	} else {
 		// reassign certain options for lbpod api, these need to be populated in spec
 		flags := cmd.Flags()
@@ -280,6 +289,21 @@ func create(cmd *cobra.Command, args []string) error {
 			return err
 		}
 		podSpec.Name = podName
+	} else {
+		ctrSpec := specgen.NewSpecGenerator("", false)
+		err = specgenutil.FillOutSpecGen(ctrSpec, &infraOptions, []string{})
+		if err != nil {
+			return err
+		}
+		// Marshall and Unmarshal the spec in order to map similar entities
+		wrapped, err := json.Marshal(ctrSpec)
+		if err != nil {
+			return err
+		}
+		err = json.Unmarshal(wrapped, podSpec)
+		if err != nil {
+			return err
+		}
 	}
 	PodSpec := entities.PodSpec{PodSpecGen: *podSpec}
 	response, err := registry.ContainerEngine().PodCreate(context.Background(), PodSpec)
diff --git a/pkg/specgen/generate/pod_create.go b/pkg/specgen/generate/pod_create.go
index a8136ba490..e14ed00d66 100644
--- a/pkg/specgen/generate/pod_create.go
+++ b/pkg/specgen/generate/pod_create.go
@@ -44,28 +44,18 @@ func MakePod(p *entities.PodSpec, rt *libpod.Runtime) (_ *libpod.Pod, finalErr e
 		p.PodSpecGen.InfraContainerSpec.RawImageName = imageName
 	}
 
-	if !p.PodSpecGen.NoInfra && p.PodSpecGen.InfraContainerSpec != nil {
-		var err error
-		p.PodSpecGen.InfraContainerSpec, err = MapSpec(&p.PodSpecGen)
-		if err != nil {
-			return nil, err
-		}
+	spec, err := MapSpec(&p.PodSpecGen)
+	if err != nil {
+		return nil, err
 	}
-
-	if !p.PodSpecGen.NoInfra {
-		err := specgen.FinishThrottleDevices(p.PodSpecGen.InfraContainerSpec)
-		if err != nil {
-			return nil, err
-		}
-		if p.PodSpecGen.InfraContainerSpec.ResourceLimits != nil &&
-			p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO != nil {
-			p.PodSpecGen.ResourceLimits.BlockIO = p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO
-		}
-		err = specgen.WeightDevices(p.PodSpecGen.InfraContainerSpec)
-		if err != nil {
-			return nil, err
-		}
-		p.PodSpecGen.ResourceLimits = p.PodSpecGen.InfraContainerSpec.ResourceLimits
+	if err := specgen.FinishThrottleDevices(spec); err != nil {
+		return nil, err
+	}
+	if err := specgen.WeightDevices(spec); err != nil {
+		return nil, err
+	}
+	if spec.ResourceLimits != nil && spec.ResourceLimits.BlockIO != nil {
+		p.PodSpecGen.ResourceLimits.BlockIO = spec.ResourceLimits.BlockIO
 	}
 
 	options, err := createPodOptions(&p.PodSpecGen)
@@ -177,12 +167,18 @@ func createPodOptions(p *specgen.PodSpecGenerator) ([]libpod.PodCreateOption, er
 // MapSpec modifies the already filled Infra specgenerator,
 // replacing necessary values with those specified in pod creation
 func MapSpec(p *specgen.PodSpecGenerator) (*specgen.SpecGenerator, error) {
+	var spec *specgen.SpecGenerator
+	if p.InfraContainerSpec != nil {
+		spec = p.InfraContainerSpec
+	} else {
+		spec = &specgen.SpecGenerator{}
+	}
 	if len(p.PortMappings) > 0 {
 		ports, err := ParsePortMapping(p.PortMappings, nil)
 		if err != nil {
 			return nil, err
 		}
-		p.InfraContainerSpec.PortMappings = ports
+		spec.PortMappings = ports
 	}
 	switch p.NetNS.NSMode {
 	case specgen.Default, "":
@@ -191,90 +187,90 @@ func MapSpec(p *specgen.PodSpecGenerator) (*specgen.SpecGenerator, error) {
 			break
 		}
 	case specgen.Bridge:
-		p.InfraContainerSpec.NetNS.NSMode = specgen.Bridge
+		spec.NetNS.NSMode = specgen.Bridge
 		logrus.Debugf("Pod using bridge network mode")
 	case specgen.Private:
-		p.InfraContainerSpec.NetNS.NSMode = specgen.Private
+		spec.NetNS.NSMode = specgen.Private
 		logrus.Debugf("Pod will use default network mode")
 	case specgen.Host:
 		logrus.Debugf("Pod will use host networking")
-		if len(p.InfraContainerSpec.PortMappings) > 0 ||
-			len(p.InfraContainerSpec.Networks) > 0 ||
-			p.InfraContainerSpec.NetNS.NSMode == specgen.NoNetwork {
+		if len(spec.PortMappings) > 0 ||
+			len(spec.Networks) > 0 ||
+			spec.NetNS.NSMode == specgen.NoNetwork {
 			return nil, fmt.Errorf("cannot set host network if network-related configuration is specified: %w", define.ErrInvalidArg)
 		}
-		p.InfraContainerSpec.NetNS.NSMode = specgen.Host
+		spec.NetNS.NSMode = specgen.Host
 	case specgen.Slirp:
 		logrus.Debugf("Pod will use slirp4netns")
-		if p.InfraContainerSpec.NetNS.NSMode != specgen.Host {
-			p.InfraContainerSpec.NetworkOptions = p.NetworkOptions
-			p.InfraContainerSpec.NetNS.NSMode = specgen.Slirp
+		if spec.NetNS.NSMode != specgen.Host {
+			spec.NetworkOptions = p.NetworkOptions
+			spec.NetNS.NSMode = specgen.Slirp
 		}
 	case specgen.Pasta:
 		logrus.Debugf("Pod will use pasta")
-		if p.InfraContainerSpec.NetNS.NSMode != specgen.Host {
-			p.InfraContainerSpec.NetworkOptions = p.NetworkOptions
-			p.InfraContainerSpec.NetNS.NSMode = specgen.Pasta
+		if spec.NetNS.NSMode != specgen.Host {
+			spec.NetworkOptions = p.NetworkOptions
+			spec.NetNS.NSMode = specgen.Pasta
 		}
 	case specgen.Path:
 		logrus.Debugf("Pod will use namespace path networking")
-		p.InfraContainerSpec.NetNS.NSMode = specgen.Path
-		p.InfraContainerSpec.NetNS.Value = p.PodNetworkConfig.NetNS.Value
+		spec.NetNS.NSMode = specgen.Path
+		spec.NetNS.Value = p.PodNetworkConfig.NetNS.Value
 	case specgen.NoNetwork:
 		logrus.Debugf("Pod will not use networking")
-		if len(p.InfraContainerSpec.PortMappings) > 0 ||
-			len(p.InfraContainerSpec.Networks) > 0 ||
-			p.InfraContainerSpec.NetNS.NSMode == specgen.Host {
+		if len(spec.PortMappings) > 0 ||
+			len(spec.Networks) > 0 ||
+			spec.NetNS.NSMode == specgen.Host {
 			return nil, fmt.Errorf("cannot disable pod network if network-related configuration is specified: %w", define.ErrInvalidArg)
 		}
-		p.InfraContainerSpec.NetNS.NSMode = specgen.NoNetwork
+		spec.NetNS.NSMode = specgen.NoNetwork
 	default:
 		return nil, fmt.Errorf("pods presently do not support network mode %s", p.NetNS.NSMode)
 	}
 
 	if len(p.InfraCommand) > 0 {
-		p.InfraContainerSpec.Entrypoint = p.InfraCommand
+		spec.Entrypoint = p.InfraCommand
 	}
 
 	if len(p.HostAdd) > 0 {
-		p.InfraContainerSpec.HostAdd = p.HostAdd
+		spec.HostAdd = p.HostAdd
 	}
 	if len(p.DNSServer) > 0 {
 		var dnsServers []net.IP
 		dnsServers = append(dnsServers, p.DNSServer...)
 
-		p.InfraContainerSpec.DNSServers = dnsServers
+		spec.DNSServers = dnsServers
 	}
 	if len(p.DNSOption) > 0 {
-		p.InfraContainerSpec.DNSOptions = p.DNSOption
+		spec.DNSOptions = p.DNSOption
 	}
 	if len(p.DNSSearch) > 0 {
-		p.InfraContainerSpec.DNSSearch = p.DNSSearch
+		spec.DNSSearch = p.DNSSearch
 	}
 	if p.NoManageResolvConf {
-		p.InfraContainerSpec.UseImageResolvConf = true
+		spec.UseImageResolvConf = true
 	}
 	if len(p.Networks) > 0 {
-		p.InfraContainerSpec.Networks = p.Networks
+		spec.Networks = p.Networks
 	}
 	// deprecated cni networks for api users
 	if len(p.CNINetworks) > 0 {
-		p.InfraContainerSpec.CNINetworks = p.CNINetworks
+		spec.CNINetworks = p.CNINetworks
 	}
 	if p.NoManageHosts {
-		p.InfraContainerSpec.UseImageHosts = p.NoManageHosts
+		spec.UseImageHosts = p.NoManageHosts
 	}
 
 	if len(p.InfraConmonPidFile) > 0 {
-		p.InfraContainerSpec.ConmonPidFile = p.InfraConmonPidFile
+		spec.ConmonPidFile = p.InfraConmonPidFile
 	}
 
 	if p.Sysctl != nil && len(p.Sysctl) > 0 {
-		p.InfraContainerSpec.Sysctl = p.Sysctl
+		spec.Sysctl = p.Sysctl
 	}
 
-	p.InfraContainerSpec.Image = p.InfraImage
-	return p.InfraContainerSpec, nil
+	spec.Image = p.InfraImage
+	return spec, nil
 }
 
 func PodConfigToSpec(rt *libpod.Runtime, spec *specgen.PodSpecGenerator, infraOptions *entities.ContainerCreateOptions, id string) (p *libpod.Pod, err error) {

From b8f6a12d01e753fed4a1377ad912ae6191817d18 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Wed, 6 Sep 2023 17:29:23 +0200
Subject: [PATCH 09/10] libpod: create the cgroup pod before containers

When a container is created and it is part of a pod, we ensure the pod
cgroup exists so limits can be applied on the pod cgroup.

Closes: https://github.com/containers/podman/issues/19175

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 libpod/container_internal.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 1e665d27e5..4098296b5f 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -1028,6 +1028,19 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
 	shutdown.Inhibit()
 	defer shutdown.Uninhibit()
 
+	// If the container is part of a pod, make sure the pod cgroup is created before the container
+	// so the limits can be applied.
+	if c.PodID() != "" {
+		pod, err := c.runtime.LookupPod(c.PodID())
+		if err != nil {
+			return err
+		}
+
+		if _, err := c.runtime.platformMakePod(pod, &pod.config.ResourceLimits); err != nil {
+			return err
+		}
+	}
+
 	// With the spec complete, do an OCI create
 	if _, err = c.ociRuntime.CreateContainer(c, nil); err != nil {
 		return err

From 0c75eac63715d67cff08cc2f4df629460777f732 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Thu, 7 Sep 2023 13:39:29 +0200
Subject: [PATCH 10/10] tests: add test for pod cgroups

This test checks that the pod cgroups are created and that the limits
set for a pod cgroup are enforced also after a reboot.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 test/system/200-pod.bats | 50 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/test/system/200-pod.bats b/test/system/200-pod.bats
index c8ef3a80c4..ea2eeec5ef 100644
--- a/test/system/200-pod.bats
+++ b/test/system/200-pod.bats
@@ -689,4 +689,54 @@ function thingy_with_unique_id() {
     run_podman rm -f -a
 }
 
+
+@test "podman pod cleans cgroup and keeps limits" {
+    skip_if_remote "we cannot check cgroup settings"
+    skip_if_rootless_cgroupsv1 "rootless cannot use cgroups on v1"
+
+    for infra in true false; do
+        run_podman pod create --infra=$infra --memory=256M
+        podid="$output"
+        run_podman run -d --pod $podid $IMAGE top -d 2
+
+        run_podman pod inspect $podid
+        result=$(jq -r .CgroupPath <<< $output)
+        assert "$result" =~ "/" ".CgroupPath is a valid path"
+
+        if is_cgroupsv2; then
+           cgroup_path=/sys/fs/cgroup/$result
+        else
+           cgroup_path=/sys/fs/cgroup/memory/$result
+        fi
+
+        if test ! -e $cgroup_path; then
+            die "the cgroup $cgroup_path does not exist"
+        fi
+
+        run_podman pod stop -t 0 $podid
+        if test -e $cgroup_path; then
+            die "the cgroup $cgroup_path should not exist after pod stop"
+        fi
+
+        run_podman pod start $podid
+        if test ! -e $cgroup_path; then
+            die "the cgroup $cgroup_path does not exist"
+        fi
+
+        # validate that cgroup limits are in place after a restart
+        # issue #19175
+        if is_cgroupsv2; then
+           memory_limit_file=$cgroup_path/memory.max
+        else
+           memory_limit_file=$cgroup_path/memory.limit_in_bytes
+        fi
+        assert "$(< $memory_limit_file)" = "268435456" "Contents of $memory_limit_file"
+
+        run_podman pod rm -t 0 -f $podid
+        if test -e $cgroup_path; then
+            die "the cgroup $cgroup_path should not exist after pod rm"
+        fi
+    done
+}
+
 # vim: filetype=sh