From b46ac860b2a505e79615ff153d3d4f56fbefbe9f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 10:56:20 -0700 Subject: [PATCH 1/4] libct/cg/sd: refactor startUnit Move error handling earlier, removing "if err == nil" block. No change of logic. Signed-off-by: Kir Kolyshkin (cherry picked from commit c6e8cb79260cdfc435ceecc797efb92cc5977632) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 33 ++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index b6bfb080a33..46b0c87443e 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -359,24 +359,27 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err }) - if err == nil { - timeout := time.NewTimer(30 * time.Second) - defer timeout.Stop() + if err != nil { + if !isUnitExists(err) { + return err + } + return nil + } - select { - case s := <-statusChan: - close(statusChan) - // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit - if s != "done" { - resetFailedUnit(cm, unitName) - return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) - } - case <-timeout.C: + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { resetFailedUnit(cm, unitName) - return errors.New("Timeout waiting for systemd to create " + unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) } - } else if !isUnitExists(err) { - return err + case <-timeout.C: + resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) } return nil From 931b9bf36a7f8f688db3355e79dc56bfb9db3c46 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:01:14 -0700 Subject: [PATCH 2/4] libct/cg/sd: ignore UnitExists only for Apply(-1) Commit d223e2adae83f62d5 ("Ignore error when starting transient unit that already exists" modified the code handling errors from startUnit to ignore UnitExists error. Apparently it was done so that kubelet can create the same pod slice over and over without hitting an error (see [1]). While it works for a pod slice to ensure it exists, it is a gross bug to ignore UnitExists when creating a container. In this case, the container init PID won't be added to the systemd unit (and to the required cgroup), and as a result the container will successfully run in a current user cgroup, without any cgroup limits applied. So, fix the code to only ignore UnitExists if we're not adding a process to the systemd unit. This way, kubelet will keep working as is, but runc will refuse to create containers which are not placed into a requested cgroup. [1] https://github.com/opencontainers/runc/pull/1124 Signed-off-by: Kir Kolyshkin (cherry picked from commit c2533420613be63ac9a84fc4363b8ddd46568985) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 10 ++++++++-- libcontainer/cgroups/systemd/v1.go | 2 +- libcontainer/cgroups/systemd/v2.go | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 46b0c87443e..3e6f8a928fa 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -353,7 +353,7 @@ func isUnitExists(err error) bool { return isDbusError(err, "org.freedesktop.systemd1.UnitExists") } -func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error { +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) @@ -363,7 +363,13 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr if !isUnitExists(err) { return err } - return nil + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + return err } timeout := time.NewTimer(30 * time.Second) diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index a74a05a5cd0..4ec32f7a9ca 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -206,7 +206,7 @@ func (m *legacyManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index de0cb974d46..94d24ee4502 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -284,7 +284,7 @@ func (m *unifiedManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) } From e618ec36cd6dc7b0dd637ac0e0c69909e30fdc46 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:15:01 -0700 Subject: [PATCH 3/4] libct/cg/sd: reset-failed and retry startUnit on UnitExists In case a systemd unit fails (for example, timed out or OOM-killed), systemd keeps the unit. This prevents starting a new container with the same systemd unit name. The fix is to call reset-failed in case UnitExists error is returned, and retry once. Signed-off-by: Kir Kolyshkin (cherry picked from commit 1d18743f9e6767580a4276108e0c977d64fd03b6) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 3e6f8a928fa..50746ae0c56 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -355,6 +355,9 @@ func isUnitExists(err error) bool { func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) + retry := true + +retry: err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err @@ -369,6 +372,14 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr // https://github.com/opencontainers/runc/pull/1124). return nil } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + resetFailedUnit(cm, unitName) + retry = false + goto retry + } return err } From 12f2f03fd2e1cd64a73f5ed70564569e43060462 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:57:46 -0700 Subject: [PATCH 4/4] [1.1] runc run: refuse a non-empty cgroup for systemd driver As this is currently not possible to add a PID into an existing systemd unit, plus this feature will be deprected in runc 1.2 (see commit d08bc0c1b3bb2 ("runc run: warn on non-empty cgroup"), let's reject sharing a systemd unit between two containers, and fix the test case accordingly. We still allow this to happen in case cgroupfs driver is used, to minimize the potential compatibility issues in a stable branch. This is an adaptation of main branch commit 82bc89cd10e for 1.1. Signed-off-by: Kir Kolyshkin --- libcontainer/factory_linux.go | 6 ++++++ tests/integration/cgroups.bats | 13 ++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index e6c71ac34e3..1df87e2e5e2 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -179,6 +179,12 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err) } if len(pids) != 0 { + if config.Cgroups.Systemd { + // systemd cgroup driver can't add a pid to an + // existing systemd unit and will return an + // error anyway, so let's error out early. + return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids)) + } // TODO: return an error. logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids)) logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132") diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index ff7cf6d0b70..7fd2e30affa 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -348,7 +348,7 @@ function setup() { [ "$output" = "ok" ] } -@test "runc run/create should warn about a non-empty cgroup" { +@test "runc run/create should error/warn about a non-empty cgroup" { if [[ "$ROOTLESS" -ne 0 ]]; then requires rootless_cgroup fi @@ -358,14 +358,21 @@ function setup() { runc run -d --console-socket "$CONSOLE_SOCKET" ct1 [ "$status" -eq 0 ] + # When systemd driver is used, runc can't add PID to an existing unit, + # so runc returns an error. For backward compatibility, we still allow + # such configuration in 1.1, but only when systemd driver is NOT used. + # See https://github.com/opencontainers/runc/issues/3780. + local exp=0 + [[ -n "${RUNC_USE_SYSTEMD}" ]] && exp=1 + # Run a second container sharing the cgroup with the first one. runc --debug run -d --console-socket "$CONSOLE_SOCKET" ct2 - [ "$status" -eq 0 ] + [ "$status" -eq "$exp" ] [[ "$output" == *"container's cgroup is not empty"* ]] # Same but using runc create. runc create --console-socket "$CONSOLE_SOCKET" ct3 - [ "$status" -eq 0 ] + [ "$status" -eq "$exp" ] [[ "$output" == *"container's cgroup is not empty"* ]] }