-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
raw_exec: make raw exec driver work with cgroups v2
This PR adds support for the raw_exec driver on systems with only cgroups v2. The raw exec driver is able to use cgroups to manage processes. This happens only on Linux, when exec_driver is enabled, and the no_cgroups option is not set. The driver uses the freezer controller to freeze processes of a task, issue a sigkill, then unfreeze. Previously the implementation assumed cgroups v1, and now it also supports cgroups v2. There is a bit of refactoring in this PR, but the fundamental design remains the same. Closes #12351 #12348
- Loading branch information
Showing
35 changed files
with
697 additions
and
307 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
```release-note:improvement | ||
Add support for cgroups v2 in raw_exec driver | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
//go:build linux | ||
|
||
package cgutil | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
"time" | ||
|
||
"github.com/hashicorp/go-hclog" | ||
"github.com/opencontainers/runc/libcontainer/cgroups" | ||
"github.com/opencontainers/runc/libcontainer/cgroups/fs" | ||
"github.com/opencontainers/runc/libcontainer/cgroups/fs2" | ||
"github.com/opencontainers/runc/libcontainer/configs" | ||
) | ||
|
||
// freezer is the name of the cgroup subsystem used for stopping / starting | ||
// a group of processes | ||
const freezer = "freezer" | ||
|
||
// thawed and frozen are the two states we put a cgroup in when trying to remove it | ||
var ( | ||
thawed = &configs.Resources{Freezer: configs.Thawed} | ||
frozen = &configs.Resources{Freezer: configs.Frozen} | ||
) | ||
|
||
// GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging | ||
// the freezer cgroup subsystem. | ||
type GroupKiller interface { | ||
KillGroup(cgroup *configs.Cgroup) error | ||
} | ||
|
||
// NewGroupKiller creates a GroupKiller with executor PID pid. | ||
func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller { | ||
return &killer{ | ||
logger: logger.Named("group_killer"), | ||
pid: pid, | ||
} | ||
} | ||
|
||
type killer struct { | ||
logger hclog.Logger | ||
pid int | ||
} | ||
|
||
// KillGroup will SIGKILL the process tree present in cgroup, using the freezer | ||
// subsystem to prevent further forking, etc. | ||
func (d *killer) KillGroup(cgroup *configs.Cgroup) error { | ||
if UseV2 { | ||
return d.v2(cgroup) | ||
} | ||
return d.v1(cgroup) | ||
} | ||
|
||
func (d *killer) v1(cgroup *configs.Cgroup) error { | ||
if cgroup == nil { | ||
return errors.New("missing cgroup") | ||
} | ||
|
||
// the actual path to our tasks freezer cgroup | ||
path := cgroup.Paths[freezer] | ||
|
||
d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid) | ||
|
||
// move executor PID into the init freezer cgroup so we can kill the task | ||
// pids without killing the executor (which is the process running this code, | ||
// doing the killing) | ||
initPath, err := cgroups.GetInitCgroupPath(freezer) | ||
if err != nil { | ||
return fmt.Errorf("failed to find init cgroup: %w", err) | ||
} | ||
m := map[string]string{freezer: initPath} | ||
if err = cgroups.EnterPid(m, d.pid); err != nil { | ||
return fmt.Errorf("failed to add executor pid to init cgroup: %w", err) | ||
} | ||
|
||
// ability to freeze the cgroup | ||
freeze := func() { | ||
_ = new(fs.FreezerGroup).Set(path, frozen) | ||
} | ||
|
||
// ability to thaw the cgroup | ||
thaw := func() { | ||
_ = new(fs.FreezerGroup).Set(path, thawed) | ||
} | ||
|
||
// do the common kill logic | ||
if err = d.kill(path, freeze, thaw); err != nil { | ||
return err | ||
} | ||
|
||
// remove the cgroup from disk | ||
return cgroups.RemovePath(path) | ||
} | ||
|
||
func (d *killer) v2(cgroup *configs.Cgroup) error { | ||
if cgroup == nil { | ||
return errors.New("missing cgroup") | ||
} | ||
|
||
path := filepath.Join(CgroupRoot, cgroup.Path) | ||
|
||
existingPIDs, err := cgroups.GetPids(path) | ||
if err != nil { | ||
return fmt.Errorf("failed to determine pids in cgroup: %w", err) | ||
} | ||
|
||
d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v2", "executor_pid", d.pid, "existing_pids", existingPIDs) | ||
|
||
mgr, err := fs2.NewManager(cgroup, "", rootless) | ||
if err != nil { | ||
return fmt.Errorf("failed to create v2 cgroup manager: %w", err) | ||
} | ||
|
||
// move executor PID into the root init.scope so we can kill the task pids | ||
// without killing the executor (which is the process running this code, doing | ||
// the killing) | ||
init, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, "init.scope"), rootless) | ||
if err != nil { | ||
return fmt.Errorf("failed to create v2 init cgroup manager: %w", err) | ||
} | ||
if err = init.Apply(d.pid); err != nil { | ||
return fmt.Errorf("failed to move executor pid into init.scope cgroup: %w", err) | ||
} | ||
|
||
d.logger.Trace("move of executor pid into init.scope complete", "pid", d.pid) | ||
|
||
// ability to freeze the cgroup | ||
freeze := func() { | ||
_ = mgr.Freeze(configs.Frozen) | ||
} | ||
|
||
// ability to thaw the cgroup | ||
thaw := func() { | ||
_ = mgr.Freeze(configs.Thawed) | ||
} | ||
|
||
// do the common kill logic | ||
|
||
if err = d.kill(path, freeze, thaw); err != nil { | ||
return err | ||
} | ||
|
||
// remove the cgroup from disk | ||
return mgr.Destroy() | ||
} | ||
|
||
// kill is used to SIGKILL all processes in cgroup | ||
// | ||
// The order of operations is | ||
// 0. before calling this method, the executor pid has been moved outside of cgroup | ||
// 1. freeze cgroup (so processes cannot fork further) | ||
// 2. scan the cgroup to collect all pids | ||
// 3. issue SIGKILL to each pid found | ||
// 4. thaw the cgroup so processes can go die | ||
// 5. wait on each processes until it is confirmed dead | ||
func (d *killer) kill(cgroup string, freeze func(), thaw func()) error { | ||
// freeze the cgroup stopping further forking | ||
freeze() | ||
|
||
d.logger.Trace("search for pids in", "cgroup", cgroup) | ||
|
||
// find all the pids we intend to kill | ||
pids, err := cgroups.GetPids(cgroup) | ||
if err != nil { | ||
// if we fail to get pids, re-thaw before bailing so there is at least | ||
// a chance the processes can go die out of band | ||
thaw() | ||
return fmt.Errorf("failed to find pids: %w", err) | ||
} | ||
|
||
d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", pids) | ||
|
||
var processes []*os.Process | ||
|
||
// kill the processes in cgroup | ||
for _, pid := range pids { | ||
p, findErr := os.FindProcess(pid) | ||
if findErr != nil { | ||
d.logger.Trace("failed to find process of pid to kill", "pid", pid, "error", findErr) | ||
continue | ||
} | ||
processes = append(processes, p) | ||
if killErr := p.Kill(); killErr != nil { | ||
d.logger.Trace("failed to kill process", "pid", pid, "error", killErr) | ||
continue | ||
} | ||
} | ||
|
||
// thawed the cgroup so we can wait on each process | ||
thaw() | ||
|
||
// wait on each process | ||
for _, p := range processes { | ||
// do not capture error; errors are normal here | ||
pState, _ := p.Wait() | ||
d.logger.Trace("return from wait on process", "pid", p.Pid, "state", pState) | ||
} | ||
|
||
// cgroups are not atomic, the OS takes a moment to un-mark the cgroup as in-use; | ||
// a tiny sleep here goes a long way for not creating noisy (but functionally benign) | ||
// errors about removing busy cgroup | ||
// | ||
// alternatively we could do the removal in a loop and silence the interim errors, but meh | ||
time.Sleep(50 * time.Millisecond) | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package resources | ||
|
||
// A Containment will cleanup resources created by an executor. | ||
type Containment interface { | ||
// Apply enables containment on pid. | ||
Apply(pid int) error | ||
|
||
// Cleanup will purge executor resources like cgroups. | ||
Cleanup() error | ||
|
||
// GetPIDs will return the processes overseen by the Containment | ||
GetPIDs() PIDs | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
//go:build !linux | ||
|
||
package resources | ||
|
||
type containment struct { | ||
// non-linux executors currently do not create resources to be cleaned up | ||
} | ||
|
||
func (c *containment) Cleanup() error { | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
//go:build linux | ||
|
||
package resources | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
"sync" | ||
|
||
"github.com/hashicorp/go-hclog" | ||
"github.com/hashicorp/nomad/client/lib/cgutil" | ||
"github.com/opencontainers/runc/libcontainer/cgroups" | ||
"github.com/opencontainers/runc/libcontainer/cgroups/fs2" | ||
"github.com/opencontainers/runc/libcontainer/configs" | ||
) | ||
|
||
type containment struct { | ||
lock sync.RWMutex | ||
cgroup *configs.Cgroup | ||
logger hclog.Logger | ||
} | ||
|
||
func Contain(logger hclog.Logger, cgroup *configs.Cgroup) *containment { | ||
return &containment{ | ||
cgroup: cgroup, | ||
logger: logger.Named("containment"), | ||
} | ||
} | ||
|
||
func (c *containment) Apply(pid int) error { | ||
c.lock.Lock() | ||
defer c.lock.Unlock() | ||
|
||
c.logger.Trace("create containment for", "cgroup", c.cgroup, "pid", pid) | ||
|
||
// for v2 use manager to create and enter the cgroup | ||
if cgutil.UseV2 { | ||
mgr, err := fs2.NewManager(c.cgroup, "", false) | ||
if err != nil { | ||
return fmt.Errorf("failed to create v2 cgroup manager for containment: %w", err) | ||
} | ||
|
||
// add the pid to the cgroup | ||
if err = mgr.Apply(pid); err != nil { | ||
return fmt.Errorf("failed to apply v2 cgroup containment: %w", err) | ||
} | ||
|
||
// in v2 it is important to set the device resource configuration | ||
if err = mgr.Set(c.cgroup.Resources); err != nil { | ||
return fmt.Errorf("failed to set v2 cgroup resources: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// for v1 a random cgroup was created already; just enter it | ||
if err := cgroups.EnterPid(c.cgroup.Paths, pid); err != nil { | ||
return fmt.Errorf("failed to add pid to v1 cgroup: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (c *containment) Cleanup() error { | ||
c.lock.Lock() | ||
defer c.lock.Unlock() | ||
|
||
// the current pid is of the executor, who manages the task process cleanup | ||
executorPID := os.Getpid() | ||
c.logger.Trace("cleanup on", "cgroup", c.cgroup, "executor_pid", executorPID) | ||
|
||
// destroy the task processes | ||
destroyer := cgutil.NewGroupKiller(c.logger, executorPID) | ||
return destroyer.KillGroup(c.cgroup) | ||
} | ||
|
||
func (c *containment) GetPIDs() PIDs { | ||
c.lock.Lock() | ||
defer c.lock.Unlock() | ||
|
||
m := make(PIDs) | ||
if c.cgroup == nil { | ||
return m | ||
} | ||
|
||
// get the cgroup path under containment | ||
var path string | ||
if cgutil.UseV2 { | ||
path = filepath.Join(cgutil.CgroupRoot, c.cgroup.Path) | ||
} else { | ||
path = c.cgroup.Paths["freezer"] | ||
} | ||
|
||
// find the pids in the cgroup under containment | ||
pids, err := cgroups.GetAllPids(path) | ||
if err != nil { | ||
c.logger.Debug("failed to get pids", "cgroup", c.cgroup, "error", err) | ||
return m | ||
} | ||
|
||
for _, pid := range pids { | ||
m[pid] = NewPID(pid) | ||
} | ||
|
||
return m | ||
} |
Oops, something went wrong.