Skip to content

Commit

Permalink
raw_exec: make raw exec driver work with cgroups v2
Browse files Browse the repository at this point in the history
This PR adds support for the raw_exec driver on systems with only cgroups v2.

The raw exec driver is able to use cgroups to manage processes. This happens
only on Linux, when exec_driver is enabled, and the no_cgroups option is not
set. The driver uses the freezer controller to freeze processes of a task,
issue a sigkill, then unfreeze. Previously the implementation assumed cgroups
v1, and now it also supports cgroups v2.

There is a bit of refactoring in this PR, but the fundamental design remains
the same.

Closes #12351 #12348
  • Loading branch information
shoenig committed Mar 31, 2022
1 parent 845ab88 commit c8d2402
Show file tree
Hide file tree
Showing 34 changed files with 683 additions and 306 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ jobs:
run: |
make bootstrap
make generate-all
make test-nomad-module
sudo sed -i 's!Defaults!#Defaults!g' /etc/sudoers
sudo -E env "PATH=$PATH" make test-nomad-module
tests-pkgs:
runs-on: ubuntu-20.04
timeout-minutes: 30
Expand Down
1 change: 0 additions & 1 deletion client/allocdir/fs_unix.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build darwin dragonfly freebsd linux netbsd openbsd solaris

package allocdir

Expand Down
1 change: 1 addition & 0 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1433,6 +1433,7 @@ func (c *Client) setupNode() error {
if node.Name == "" {
node.Name, _ = os.Hostname()
}
node.CgroupParent = c.config.CgroupParent
if node.HostVolumes == nil {
if l := len(c.config.HostVolumes); l != 0 {
node.HostVolumes = make(map[string]*structs.ClientHostVolumeConfig, l)
Expand Down
1 change: 1 addition & 0 deletions client/fs_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ func TestFS_List_ACL(t *testing.T) {

func TestFS_Stream_NoAlloc(t *testing.T) {
ci.Parallel(t)
ci.SkipSlow(t, "flaky on GHA; #12358")
require := require.New(t)

// Start a client
Expand Down
8 changes: 4 additions & 4 deletions client/lib/cgutil/cgutil_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ func CgroupScope(allocID, task string) string {
return fmt.Sprintf("%s.%s.scope", allocID, task)
}

// ConfigureBasicCgroups will initialize cgroups for v1.
// ConfigureBasicCgroups will initialize a cgroup and modify config to contain
// a reference to its path.
//
// Not useful in cgroups.v2
// v1: creates a random "freezer" cgroup which can later be used for cleanup of processes.
// v2: does nothing.
func ConfigureBasicCgroups(config *lcc.Config) error {
if UseV2 {
// In v2 the default behavior is to create inherited interface files for
// all mounted subsystems automatically.
return nil
}

Expand Down
199 changes: 199 additions & 0 deletions client/lib/cgutil/group_killer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
//go:build linux

package cgutil

import (
"errors"
"fmt"
"os"
"path/filepath"
"time"

"github.com/hashicorp/go-hclog"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)

// freezer is the name of the cgroup subsystem used for stopping / starting
// a group of processes
const freezer = "freezer"

// thawed and frozen are the two states we put a cgroup in when trying to remove it
var (
thawed = &configs.Resources{Freezer: configs.Thawed}
frozen = &configs.Resources{Freezer: configs.Frozen}
)

// GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging
// the freezer cgroup subsystem.
type GroupKiller interface {
KillGroup(cgroup *configs.Cgroup) error
}

// NewGroupKiller creates a GroupKiller with executor PID pid.
func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller {
return &killer{
logger: logger.Named("mr_freeze"),
pid: pid,
}
}

type killer struct {
logger hclog.Logger
pid int
}

// KillGroup will SIGKILL the process tree present in cgroup, using the freezer
// subsystem to prevent further forking, etc.
func (d *killer) KillGroup(cgroup *configs.Cgroup) error {
if UseV2 {
return d.v2(cgroup)
}
return d.v1(cgroup)
}

func (d *killer) v1(cgroup *configs.Cgroup) error {
if cgroup == nil {
return errors.New("missing cgroup")
}

// the actual path to our tasks freezer cgroup
path := cgroup.Paths[freezer]

d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid)

// move executor PID into the init freezer cgroup so we can kill the task
// pids without killing the executor (which is the process running this code,
// doing the killing)
initPath, err := cgroups.GetInitCgroupPath(freezer)
if err != nil {
return fmt.Errorf("failed to find init cgroup: %w", err)
}
m := map[string]string{freezer: initPath}
if err = cgroups.EnterPid(m, d.pid); err != nil {
return fmt.Errorf("failed to add executor pid to init cgroup: %w", err)
}

// ability to freeze the cgroup
freeze := func() {
_ = new(fs.FreezerGroup).Set(path, frozen)
}

// ability to thaw the cgroup
thaw := func() {
_ = new(fs.FreezerGroup).Set(path, thawed)
}

// do the common kill logic
if err = d.kill(path, freeze, thaw); err != nil {
return err
}

// remove the cgroup from disk
return cgroups.RemovePath(path)
}

func (d *killer) v2(cgroup *configs.Cgroup) error {
if cgroup == nil {
return errors.New("missing cgroup")
}

d.logger.Trace("killing processes", "cgroup_path", cgroup.Path, "cgroup_version", "v2", "executor_pid", d.pid)

mgr, err := fs2.NewManager(cgroup, "", rootless)
if err != nil {
return fmt.Errorf("failed to create v2 cgroup manager: %w", err)
}

// move executor PID into the root init.scope so we can kill the task pids
// without killing the executor (which is the process running this code, doing
// the killing)
init, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, "init.scope"), rootless)
if err != nil {
return fmt.Errorf("failed to create v2 init cgroup manager: %w", err)
}
if err = init.Apply(d.pid); err != nil {
return fmt.Errorf("failed to move executor pid into init.scope cgroup: %w", err)
}

// ability to freeze the cgroup
freeze := func() {
_ = mgr.Freeze(configs.Frozen)
}

// ability to thaw the cgroup
thaw := func() {
_ = mgr.Freeze(configs.Thawed)
}

// do the common kill logic
path := filepath.Join(CgroupRoot, cgroup.Path)
if err = d.kill(path, freeze, thaw); err != nil {
return err
}

// remove the cgroup from disk
return mgr.Destroy()
}

// kill is used to SIGKILL all processes in cgroup
//
// The order of operations is
// 0. before calling this method, the executor pid has been moved outside of cgroup
// 1. freeze cgroup (so processes cannot fork further)
// 2. scan the cgroup to collect all pids
// 3. issue SIGKILL to each pid found
// 4. thaw the cgroup so processes can go die
// 5. wait on each processes until it is confirmed dead
func (d *killer) kill(cgroup string, freeze func(), thaw func()) error {
// freeze the cgroup stopping further forking
freeze()

// find all the pids we intend to kill
pids, err := cgroups.GetAllPids(cgroup)
if err != nil {
// if we fail to get pids, re-thaw before bailing so there is at least
// a chance the processes can go die out of band
thaw()
return fmt.Errorf("failed to find pids: %w", err)
}

d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", pids)

var processes []*os.Process

// kill the processes in cgroup
for _, pid := range pids {
p, findErr := os.FindProcess(pid)
if findErr != nil {
d.logger.Trace("failed to find process of pid to kill", "pid", pid, "error", findErr)
continue
}
processes = append(processes, p)
if killErr := p.Kill(); killErr != nil {
d.logger.Trace("failed to kill process", "pid", pid, "error", killErr)
continue
}
}

// thawed the cgroup so we can wait on each process
thaw()

// wait on each process
for _, p := range processes {
// do not capture error; errors are normal here
pState, _ := p.Wait()
d.logger.Trace("return from wait on process", "pid", p.Pid, "state", pState)
}

// cgroups are not atomic, the OS takes a moment to un-mark the cgroup as in-use;
// a tiny sleep here goes a long way for not creating noisy (but functionally benign)
// errors about removing busy cgroup
//
// alternatively we could do the removal in a loop and silence the interim errors, but meh
time.Sleep(50 * time.Millisecond)

return nil
}
13 changes: 13 additions & 0 deletions client/lib/resources/containment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package resources

// A Containment will cleanup resources created by an executor.
type Containment interface {
// Apply enables containment on pid.
Apply(pid int) error

// Cleanup will purge executor resources like cgroups.
Cleanup() error

// GetPIDs will return the processes overseen by the Containment
GetPIDs() PIDs
}
11 changes: 11 additions & 0 deletions client/lib/resources/containment_default.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//go:build !linux

package resources

type containment struct {
// non-linux executors currently do not create resources to be cleaned up
}

func (c *containment) Cleanup() error {
return nil
}
107 changes: 107 additions & 0 deletions client/lib/resources/containment_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//go:build linux

package resources

import (
"fmt"
"os"
"path/filepath"
"sync"

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/lib/cgutil"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)

type containment struct {
lock sync.RWMutex
cgroup *configs.Cgroup
logger hclog.Logger
}

func Contain(logger hclog.Logger, cgroup *configs.Cgroup) *containment {
return &containment{
cgroup: cgroup,
logger: logger.Named("containment"),
}
}

func (c *containment) Apply(pid int) error {
c.lock.Lock()
defer c.lock.Unlock()

c.logger.Trace("create containment for", "cgroup", c.cgroup, "pid", pid)

// for v2 use manager to create and enter the cgroup
if cgutil.UseV2 {
mgr, err := fs2.NewManager(c.cgroup, "", false)
if err != nil {
return fmt.Errorf("failed to create v2 cgroup manager for containment: %w", err)
}

// add the pid to the cgroup
if err = mgr.Apply(pid); err != nil {
return fmt.Errorf("failed to apply v2 cgroup containment: %w", err)
}

// in v2 it is important to set the device resource configuration
if err = mgr.Set(c.cgroup.Resources); err != nil {
return fmt.Errorf("failed to set v2 cgroup resources: %w", err)
}

return nil
}

// for v1 a random cgroup was created already; just enter it
if err := cgroups.EnterPid(c.cgroup.Paths, pid); err != nil {
return fmt.Errorf("failed to add pid to v1 cgroup: %w", err)
}

return nil
}

func (c *containment) Cleanup() error {
c.lock.Lock()
defer c.lock.Unlock()

// the current pid is of the executor, who manages the task process cleanup
executorPID := os.Getpid()
c.logger.Trace("cleanup on", "cgroup", c.cgroup, "executor_pid", executorPID)

// destroy the task processes
destroyer := cgutil.NewGroupKiller(c.logger, executorPID)
return destroyer.KillGroup(c.cgroup)
}

func (c *containment) GetPIDs() PIDs {
c.lock.Lock()
defer c.lock.Unlock()

m := make(PIDs)
if c.cgroup == nil {
return m
}

// get the cgroup path under containment
var path string
if cgutil.UseV2 {
path = filepath.Join(cgutil.CgroupRoot, c.cgroup.Path)
} else {
path = c.cgroup.Paths["freezer"]
}

// find the pids in the cgroup under containment
pids, err := cgroups.GetAllPids(path)
if err != nil {
c.logger.Debug("failed to get pids", "cgroup", c.cgroup, "error", err)
return m
}

for _, pid := range pids {
m[pid] = NewPID(pid)
}

return m
}
Loading

0 comments on commit c8d2402

Please sign in to comment.