Skip to content

Commit

Permalink
Add support for cgroup namespace
Browse files Browse the repository at this point in the history
Cgroup namespace can be configured in `config.json` as other
namespaces. Here is an example:

```
"namespaces": [
	{
		"type": "pid"
	},
	{
		"type": "network"
	},
	{
		"type": "ipc"
	},
	{
		"type": "uts"
	},
	{
		"type": "mount"
	},
	{
		"type": "cgroup"
	}
],

```

Note that if you want to run a container which has shared cgroup ns with
another container, then it's strongly recommended that you set
proper `CgroupsPath` of both containers(the second container's cgroup
path must be the subdirectory of the first one). Or there might be
some unexpected results.

Signed-off-by: Yuanhong Peng <[email protected]>
Signed-off-by: Michael Crosby <[email protected]>
  • Loading branch information
Yuanhong Peng authored and crosbymichael committed Oct 31, 2018
1 parent 9a3a8a5 commit df3fa11
Show file tree
Hide file tree
Showing 10 changed files with 216 additions and 75 deletions.
1 change: 1 addition & 0 deletions libcontainer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ config := &configs.Config{
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Expand Down
21 changes: 11 additions & 10 deletions libcontainer/SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@ Minimum requirements:

### Namespaces

| Flag | Enabled |
| ------------ | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |

Namespaces are created for the container via the `clone` syscall.
| Flag | Enabled |
| --------------- | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
| CLONE_NEWCGROUP | 1 |

Namespaces are created for the container via the `unshare` syscall.


### Filesystem
Expand Down
8 changes: 4 additions & 4 deletions libcontainer/cgroups/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
)

const (
cgroupNamePrefix = "name="
CgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)

Expand Down Expand Up @@ -156,8 +156,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
continue
}
ss[opt] = true
if strings.HasPrefix(opt, cgroupNamePrefix) {
opt = opt[len(cgroupNamePrefix):]
if strings.HasPrefix(opt, CgroupNamePrefix) {
opt = opt[len(CgroupNamePrefix):]
}
m.Subsystems = append(m.Subsystems, opt)
numFound++
Expand Down Expand Up @@ -343,7 +343,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
return p, nil
}

if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}

Expand Down
5 changes: 1 addition & 4 deletions libcontainer/configs/namespaces_syscall.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,14 @@ func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}

// This is not yet in the Go stdlib.
const syscall_CLONE_NEWCGROUP = (1 << 29)

var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: syscall_CLONE_NEWCGROUP,
NEWCGROUP: unix.CLONE_NEWCGROUP,
}

// CloneFlags parses the container's Namespaces options to set the correct
Expand Down
12 changes: 12 additions & 0 deletions libcontainer/configs/validate/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.cgroupnamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
Expand Down Expand Up @@ -116,6 +119,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
return nil
}

func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWCGROUP) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
}
}
return nil
}

// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
Expand Down
1 change: 0 additions & 1 deletion libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,6 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}

for _, ns := range configs.NamespaceTypes() {

// Remove namespaces that we don't need to join.
Expand Down
57 changes: 57 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1776,3 +1776,60 @@ func TestTmpfsCopyUp(t *testing.T) {
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
}
}

func TestCGROUPPrivate(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
config.Namespaces.Add(configs.NEWCGROUP, "")
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
}
}

func TestCGROUPHost(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
}
}
63 changes: 39 additions & 24 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ enum sync_t {
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};

/*
* Synchronisation value for cgroup namespace setup.
* The same constant is defined in process_linux.go as "createCgroupns".
*/
#define CREATECGROUPNS 0x80

/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
Expand Down Expand Up @@ -640,7 +646,6 @@ void nsexec(void)
case JUMP_PARENT:{
int len;
pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;

/* For debugging. */
Expand Down Expand Up @@ -716,6 +721,18 @@ void nsexec(void)
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}

/* Send the init_func pid back to our parent.
*
* Send the init_func pid and the pid of the first child back to our parent.
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
}
break;
case SYNC_CHILD_READY:
Expand Down Expand Up @@ -759,23 +776,6 @@ void nsexec(void)
bail("unexpected sync value: %u", s);
}
}

/*
* Send the init_func pid and the pid of the first child back to our parent.
*
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}

exit(0);
}

Expand Down Expand Up @@ -862,14 +862,17 @@ void nsexec(void)
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}

/*
* Unshare all of the namespaces. Note that we don't merge this
* with clone() because there were some old kernel versions where
* clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do
* it the long way.
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces");

/*
Expand Down Expand Up @@ -958,6 +961,18 @@ void nsexec(void)
bail("setgroups failed");
}

/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed");
if (value == CREATECGROUPNS) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
} else
bail("received unknown synchronisation value");
}

s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
Expand Down
75 changes: 49 additions & 26 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ import (
"golang.org/x/sys/unix"
)

// Synchronisation value for cgroup namespace setup.
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
const createCgroupns = 0x80

type parentProcess interface {
// pid returns the pid for the running process.
pid() int
Expand Down Expand Up @@ -225,12 +229,17 @@ func (p *initProcess) externalDescriptors() []string {
return p.fds
}

// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
// getChildPid receives the final child's pid over the provided pipe.
func (p *initProcess) getChildPid() (int, error) {
var pid pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return -1, err
}
return pid.Pid, nil
}

func (p *initProcess) waitForChildExit(childPid int) error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
Expand All @@ -240,22 +249,8 @@ func (p *initProcess) execSetns() error {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}

// Clean up the zombie parent process
firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
if err != nil {
return err
}

// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()

process, err := os.FindProcess(pid.Pid)
process, err := os.FindProcess(childPid)
if err != nil {
return err
}
Expand Down Expand Up @@ -297,19 +292,47 @@ func (p *initProcess) start() error {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}

if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
}

// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
fds, err := getPipeFds(childPid)
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(childPid); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(childPid); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
}
}
// Now it's time to setup cgroup namesapce
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
}
}

// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return newSystemErrorWithCause(err, "waiting for our first child to exit")
}

defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
}
}()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
Expand Down
Loading

0 comments on commit df3fa11

Please sign in to comment.