diff --git a/configs/config.go b/configs/config.go index 293af0a9b..7275b6421 100644 --- a/configs/config.go +++ b/configs/config.go @@ -13,6 +13,40 @@ type IDMap struct { Size int `json:"size"` } +type Seccomp struct { + Syscalls []*Syscall `json:"syscalls"` +} + +type Action int + +const ( + Kill Action = iota - 3 + Trap + Allow +) + +type Operator int + +const ( + EqualTo Operator = iota + NotEqualTo + GreatherThan + LessThan + MaskEqualTo +) + +type Arg struct { + Index int `json:"index"` + Value uint32 `json:"value"` + Op Operator `json:"op"` +} + +type Syscall struct { + Value int `json:"value"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} + // TODO Windows. Many of these fields should be factored out into those parts // which are common across platforms, and those which are platform specific. @@ -104,4 +138,9 @@ type Config struct { // SystemProperties is a map of properties and their values. It is the equivalent of using // sysctl -w my.property.name value in Linux. SystemProperties map[string]string `json:"system_properties"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // By default, all syscalls are allowed; an action to allow, trap, kill, or return an errno + // can be specified on a per syscall basis. 
+ Seccomp *Seccomp `json:"seccomp"` } diff --git a/init_linux.go b/init_linux.go index 1771fd193..3eabe3cd6 100644 --- a/init_linux.go +++ b/init_linux.go @@ -13,6 +13,7 @@ import ( "github.com/docker/libcontainer/cgroups" "github.com/docker/libcontainer/configs" "github.com/docker/libcontainer/netlink" + "github.com/docker/libcontainer/seccomp" "github.com/docker/libcontainer/system" "github.com/docker/libcontainer/user" "github.com/docker/libcontainer/utils" @@ -259,3 +260,61 @@ func killCgroupProcesses(m cgroups.Manager) error { } return nil } + +func finalizeSeccomp(config *initConfig) error { + if config.Config.Seccomp == nil { + return nil + } + context := seccomp.New() + for _, s := range config.Config.Seccomp.Syscalls { + ss := &seccomp.Syscall{ + Value: uint32(s.Value), + Action: seccompAction(s.Action), + } + if len(s.Args) > 0 { + ss.Args = seccompArgs(s.Args) + } + context.Add(ss) + } + return context.Load() +} + +func seccompAction(a configs.Action) seccomp.Action { + switch a { + case configs.Kill: + return seccomp.Kill + case configs.Trap: + return seccomp.Trap + case configs.Allow: + return seccomp.Allow + } + return seccomp.Error(syscall.Errno(int(a))) +} + +func seccompArgs(args []*configs.Arg) seccomp.Args { + var sa []seccomp.Arg + for _, a := range args { + sa = append(sa, seccomp.Arg{ + Index: uint32(a.Index), + Op: seccompOperator(a.Op), + Value: uint(a.Value), + }) + } + return seccomp.Args{sa} +} + +func seccompOperator(o configs.Operator) seccomp.Operator { + switch o { + case configs.EqualTo: + return seccomp.EqualTo + case configs.NotEqualTo: + return seccomp.NotEqualTo + case configs.GreatherThan: + return seccomp.GreatherThan + case configs.LessThan: + return seccomp.LessThan + case configs.MaskEqualTo: + return seccomp.MaskEqualTo + } + return 0 +} diff --git a/integration/exec_test.go b/integration/exec_test.go index 20d781ee5..3b8a83b73 100644 --- a/integration/exec_test.go +++ b/integration/exec_test.go @@ -714,3 +714,27 @@ func 
TestSystemProperties(t *testing.T) { t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", shmmniOutput) } } + +func TestSeccompNoChown(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{} + config.Seccomp.Syscalls = append(config.Seccomp.Syscalls, &configs.Syscall{ + Value: syscall.SYS_CHOWN, + Action: configs.Action(syscall.EPERM), + }) + buffers, _, err := runContainer(config, "", "/bin/sh", "-c", "chown 1:1 /tmp") + if err == nil { + t.Fatal("running chown in a container should fail") + } + if s := buffers.String(); !strings.Contains(s, "not permitted") { + t.Fatalf("running chown should result in an EPERM but got %q", s) + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index 41b914cac..0f9181332 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -122,11 +122,11 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe err = container.Start(process) if err != nil { - return nil, -1, err + return buffers, -1, err } ps, err := process.Wait() if err != nil { - return nil, -1, err + return buffers, -1, err } status := ps.Sys().(syscall.WaitStatus) if status.Exited() { @@ -134,7 +134,7 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe } else if status.Signaled() { exitCode = -int(status.Signal()) } else { - return nil, -1, err + return buffers, -1, err } return } diff --git a/nsinit/config.go b/nsinit/config.go index bf3506c25..7fb28a58c 100644 --- a/nsinit/config.go +++ b/nsinit/config.go @@ -19,32 +19,33 @@ import ( const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV var createFlags = []cli.Flag{ - cli.IntFlag{Name: "parent-death-signal", Usage: "set the signal that will be delivered to the process in case the parent dies"}, + cli.BoolFlag{Name: "cgroup", 
Usage: "mount the cgroup data for the container"}, cli.BoolFlag{Name: "read-only", Usage: "set the container's rootfs as read-only"}, - cli.StringSliceFlag{Name: "bind", Value: &cli.StringSlice{}, Usage: "add bind mounts to the container"}, - cli.StringSliceFlag{Name: "tmpfs", Value: &cli.StringSlice{}, Usage: "add tmpfs mounts to the container"}, cli.IntFlag{Name: "cpushares", Usage: "set the cpushares for the container"}, cli.IntFlag{Name: "memory-limit", Usage: "set the memory limit for the container"}, cli.IntFlag{Name: "memory-swap", Usage: "set the memory swap limit for the container"}, + cli.IntFlag{Name: "parent-death-signal", Usage: "set the signal that will be delivered to the process in case the parent dies"}, + cli.IntFlag{Name: "userns-root-uid", Usage: "set the user namespace root uid"}, + cli.IntFlag{Name: "veth-mtu", Usage: "veth mtu"}, + cli.StringFlag{Name: "apparmor-profile", Usage: "set the apparmor profile"}, cli.StringFlag{Name: "cpuset-cpus", Usage: "set the cpuset cpus"}, cli.StringFlag{Name: "cpuset-mems", Usage: "set the cpuset mems"}, - cli.StringFlag{Name: "apparmor-profile", Usage: "set the apparmor profile"}, - cli.StringFlag{Name: "process-label", Usage: "set the process label"}, - cli.StringFlag{Name: "mount-label", Usage: "set the mount label"}, - cli.StringFlag{Name: "rootfs", Usage: "set the rootfs"}, - cli.IntFlag{Name: "userns-root-uid", Usage: "set the user namespace root uid"}, cli.StringFlag{Name: "hostname", Value: "nsinit", Usage: "hostname value for the container"}, - cli.StringFlag{Name: "net", Value: "", Usage: "network namespace"}, cli.StringFlag{Name: "ipc", Value: "", Usage: "ipc namespace"}, + cli.StringFlag{Name: "mnt", Value: "", Usage: "mount namespace"}, + cli.StringFlag{Name: "mount-label", Usage: "set the mount label"}, + cli.StringFlag{Name: "net", Value: "", Usage: "network namespace"}, cli.StringFlag{Name: "pid", Value: "", Usage: "pid namespace"}, + cli.StringFlag{Name: "process-label", Usage: "set the 
process label"}, + cli.StringFlag{Name: "rootfs", Usage: "set the rootfs"}, + cli.StringFlag{Name: "security", Value: "", Usage: "set the security profile (high, medium, low)"}, cli.StringFlag{Name: "uts", Value: "", Usage: "uts namespace"}, - cli.StringFlag{Name: "mnt", Value: "", Usage: "mount namespace"}, - cli.StringFlag{Name: "veth-bridge", Usage: "veth bridge"}, cli.StringFlag{Name: "veth-address", Usage: "veth ip address"}, + cli.StringFlag{Name: "veth-bridge", Usage: "veth bridge"}, cli.StringFlag{Name: "veth-gateway", Usage: "veth gateway address"}, - cli.IntFlag{Name: "veth-mtu", Usage: "veth mtu"}, - cli.BoolFlag{Name: "cgroup", Usage: "mount the cgroup data for the container"}, + cli.StringSliceFlag{Name: "bind", Value: &cli.StringSlice{}, Usage: "add bind mounts to the container"}, cli.StringSliceFlag{Name: "sysctl", Value: &cli.StringSlice{}, Usage: "set system properties in the container"}, + cli.StringSliceFlag{Name: "tmpfs", Value: &cli.StringSlice{}, Usage: "add tmpfs mounts to the container"}, } var configCommand = cli.Command{ @@ -203,6 +204,24 @@ func modify(config *configs.Config, context *cli.Context) { Device: "cgroup", }) } + modifySecurityProfile(context, config) +} + +func modifySecurityProfile(context *cli.Context, config *configs.Config) { + profileName := context.String("security") + if profileName == "" { + return + } + profile := profiles[profileName] + if profile == nil { + logrus.Fatalf("invalid profile name %q", profileName) + } + config.Rlimits = profile.Rlimits + config.Capabilities = profile.Capabilities + config.Seccomp = profile.Seccomp + config.AppArmorProfile = profile.ApparmorProfile + config.MountLabel = profile.MountLabel + config.ProcessLabel = profile.ProcessLabel } func getTemplate() *configs.Config { @@ -290,13 +309,5 @@ func getTemplate() *configs.Config { Flags: defaultMountFlags | syscall.MS_RDONLY, }, }, - Rlimits: []configs.Rlimit{ - { - Type: syscall.RLIMIT_NOFILE, - Hard: 1024, - Soft: 1024, - }, - }, } - } 
diff --git a/nsinit/security.go b/nsinit/security.go new file mode 100644 index 000000000..7835c4b91 --- /dev/null +++ b/nsinit/security.go @@ -0,0 +1,272 @@ +package main + +import ( + "syscall" + + "github.com/docker/libcontainer/configs" + "github.com/docker/libcontainer/system" +) + +var profiles = map[string]*securityProfile{ + "high": highProfile, + "medium": mediumProfile, + "low": lowProfile, +} + +type securityProfile struct { + Capabilities []string `json:"capabilities"` + ApparmorProfile string `json:"apparmor_profile"` + MountLabel string `json:"mount_label"` + ProcessLabel string `json:"process_label"` + Rlimits []configs.Rlimit `json:"rlimits"` + Seccomp *configs.Seccomp `json:"seccomp"` +} + +// this should be a runtime config that is not able to do things like apt-get or yum install. +var highProfile = &securityProfile{ + Capabilities: []string{ + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_CAPSET, // http://man7.org/linux/man-pages/man2/capset.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // 
http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHMOD, // http://man7.org/linux/man-pages/man2/chmod.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHOWN, // http://man7.org/linux/man-pages/man2/chown.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_LINK, // http://man7.org/linux/man-pages/man2/link.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_LINKAT, // http://man7.org/linux/man-pages/man2/linkat.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNLINK, // http://man7.org/linux/man-pages/man2/unlink.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNLINKAT, // http://man7.org/linux/man-pages/man2/unlinkat.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHROOT, // http://man7.org/linux/man-pages/man2/chroot.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETDOMAINNAME, // http://man7.org/linux/man-pages/man2/setdomainname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETHOSTNAME, // http://man7.org/linux/man-pages/man2/sethostname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} + +// This is a medium level profile that should be able to do things like installing from +// apt-get or yum. 
+var mediumProfile = &securityProfile{ + Capabilities: []string{ + "CHOWN", + "DAC_OVERRIDE", + "FSETID", + "FOWNER", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHROOT, // http://man7.org/linux/man-pages/man2/chroot.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETDOMAINNAME, // http://man7.org/linux/man-pages/man2/setdomainname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETHOSTNAME, // http://man7.org/linux/man-pages/man2/sethostname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but 
the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} + +var lowProfile = &securityProfile{ + Capabilities: []string{ + "CHOWN", + "DAC_OVERRIDE", + "FSETID", + "FOWNER", + "SETGID", + "SETUID", + "SYS_CHROOT", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} diff --git a/seccomp/bpf.go b/seccomp/bpf.go new file mode 100644 index 000000000..a4b3bdf7a --- /dev/null +++ b/seccomp/bpf.go @@ -0,0 +1,32 @@ +package seccomp + 
+import "strings" + +type bpfLabel struct { + label string + location uint32 +} + +type bpfLabels []bpfLabel + +// labelIndex returns the index for the label if it exists in the slice. +// if it does not exist in the slice it appends the label lb to the end +// of the slice and returns the index. +func labelIndex(labels *bpfLabels, lb string) uint32 { + var id uint32 + for id = 0; id < uint32(len(*labels)); id++ { + if strings.EqualFold(lb, (*labels)[id].label) { + return id + } + } + *labels = append(*labels, bpfLabel{lb, 0xffffffff}) + return id +} + +func scmpBpfStmt(code uint16, k uint32) sockFilter { + return sockFilter{code, 0, 0, k} +} + +func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter { + return sockFilter{code, jt, jf, k} +} diff --git a/seccomp/context.go b/seccomp/context.go new file mode 100644 index 000000000..c8d4e7314 --- /dev/null +++ b/seccomp/context.go @@ -0,0 +1,144 @@ +package seccomp + +import ( + "errors" + "syscall" +) + +const labelTemplate = "lb-%d-%d" + +// Action is the type of action that will be taken when a +// syscall is performed. +type Action int + +const ( + Kill Action = iota - 3 // Kill the calling process of the syscall. + Trap // Trap and coredump the calling process of the syscall. + Allow // Allow the syscall to be completed. +) + +// Syscall is the specified syscall, action, and any type of arguments +// to filter on. +type Syscall struct { + // Value is the syscall number. + Value uint32 + // Action is the action to perform when the specified syscall is made. + Action Action + // Args are filters that can be specified on the arguments to the syscall. 
+ Args Args +} + +func (s *Syscall) scmpAction() uint32 { + switch s.Action { + case Allow: + return retAllow + case Trap: + return retTrap + case Kill: + return retKill + } + return actionErrno(uint32(s.Action)) +} + +// Arg represents an argument to the syscall with the argument's index, +// the operator to apply when matching, and the argument's value at that time. +type Arg struct { + Index uint32 // index of args which start from zero + Op Operator // operation, such as EQ/NE/GE/LE + Value uint // the value of arg +} + +type Args [][]Arg + +var ( + ErrUnresolvedLabel = errors.New("seccomp: unresolved label") + ErrDuplicateLabel = errors.New("seccomp: duplicate label use") + ErrUnsupportedOperation = errors.New("seccomp: unsupported operation for argument") +) + +// Error returns an Action that will be used to send the calling +// process the specified errno when the syscall is made. +func Error(code syscall.Errno) Action { + return Action(code) +} + +// New returns a new syscall context for use. +func New() *Context { + return &Context{ + syscalls: make(map[uint32]*Syscall), + } +} + +// Context holds syscalls for the current process to limit the type of +// actions the calling process can make. +type Context struct { + syscalls map[uint32]*Syscall +} + +// Add will add the specified syscall, action, and arguments to the seccomp +// Context. +func (c *Context) Add(s *Syscall) { + c.syscalls[s.Value] = s +} + +// Remove removes the specified syscall configuration from the Context. +func (c *Context) Remove(call uint32) { + delete(c.syscalls, call) +} + +// Load applies the Context to the calling process; the seccomp filter takes effect +// once the context is loaded. 
+func (c *Context) Load() error { + filter, err := c.newFilter() + if err != nil { + return err + } + if err := prctl(prSetNoNewPrivileges, 1, 0, 0, 0); err != nil { + return err + } + prog := newSockFprog(filter) + return prog.set() +} + +func (c *Context) newFilter() ([]sockFilter, error) { + var ( + labels bpfLabels + f = newFilter() + ) + for _, s := range c.syscalls { + f.addSyscall(s, &labels) + } + f.allow() + // process args for the syscalls + for _, s := range c.syscalls { + if err := f.addArguments(s, &labels); err != nil { + return nil, err + } + } + // apply labels for arguments + idx := int32(len(*f) - 1) + for ; idx >= 0; idx-- { + lf := &(*f)[idx] + if lf.code != (syscall.BPF_JMP + syscall.BPF_JA) { + continue + } + rel := int32(lf.jt)<<8 | int32(lf.jf) + if ((jumpJT << 8) | jumpJF) == rel { + if labels[lf.k].location == 0xffffffff { + return nil, ErrUnresolvedLabel + } + lf.k = labels[lf.k].location - uint32(idx+1) + lf.jt = 0 + lf.jf = 0 + } else if ((labelJT << 8) | labelJF) == rel { + if labels[lf.k].location != 0xffffffff { + return nil, ErrDuplicateLabel + } + labels[lf.k].location = uint32(idx) + lf.k = 0 + lf.jt = 0 + lf.jf = 0 + } + } + return *f, nil +} diff --git a/seccomp/filter.go b/seccomp/filter.go new file mode 100644 index 000000000..370cdf087 --- /dev/null +++ b/seccomp/filter.go @@ -0,0 +1,116 @@ +package seccomp + +import ( + "fmt" + "syscall" + "unsafe" +) + +type sockFilter struct { + code uint16 + jt uint8 + jf uint8 + k uint32 +} + +func newFilter() *filter { + var f filter + f = append(f, sockFilter{ + pfLD + syscall.BPF_W + syscall.BPF_ABS, + 0, + 0, + uint32(unsafe.Offsetof(secData.nr)), + }) + return &f +} + +type filter []sockFilter + +func (f *filter) addSyscall(s *Syscall, labels *bpfLabels) { + if len(s.Args) == 0 { + f.call(s.Value, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction())) + } else { + if len(s.Args[0]) > 0 { + lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[0][0].Index) + f.call(s.Value, + 
scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), + jumpJT, jumpJF)) + } + } +} + +func (f *filter) addArguments(s *Syscall, labels *bpfLabels) error { + for i := 0; len(s.Args) > i; i++ { + if len(s.Args[i]) > 0 { + lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[i][0].Index) + f.label(labels, lb) + f.arg(s.Args[i][0].Index) + } + for j := 0; j < len(s.Args[i]); j++ { + var jf sockFilter + if len(s.Args)-1 > i && len(s.Args[i+1]) > 0 { + lbj := fmt.Sprintf(labelTemplate, s.Value, s.Args[i+1][0].Index) + jf = scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, + labelIndex(labels, lbj), jumpJT, jumpJF) + } else { + jf = scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction()) + } + if err := f.op(s.Args[i][j].Op, s.Args[i][j].Value, jf); err != nil { + return err + } + } + f.allow() + } + return nil +} + +func (f *filter) label(labels *bpfLabels, lb string) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), labelJT, labelJF)) +} + +func (f *filter) call(nr uint32, jt sockFilter) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, nr, 0, 1)) + *f = append(*f, jt) +} + +func (f *filter) allow() { + *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retAllow)) +} + +func (f *filter) deny() { + *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retTrap)) +} + +func (f *filter) arg(index uint32) { + arg(f, index) +} + +func (f *filter) op(operation Operator, v uint, jf sockFilter) error { + switch operation { + case EqualTo: + jumpEqualTo(f, v, jf) + case NotEqualTo: + jumpNotEqualTo(f, v, jf) + case GreatherThan: + jumpGreaterThan(f, v, jf) + case LessThan: + jumpLessThan(f, v, jf) + case MaskEqualTo: + jumpMaskEqualTo(f, v, jf) + default: + return ErrUnsupportedOperation + } + return nil +} + +func arg(f *filter, idx uint32) { + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.low(idx))) + *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 0)) + *f = 
append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.hi(idx))) + *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 1)) +} + +func jump(f *filter, labels *bpfLabels, lb string) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), + jumpJT, jumpJF)) +} diff --git a/seccomp/jump_amd64.go b/seccomp/jump_amd64.go new file mode 100644 index 000000000..f0d07716a --- /dev/null +++ b/seccomp/jump_amd64.go @@ -0,0 +1,68 @@ +// +build linux,amd64 + +package seccomp + +// Using BPF filters +// +// ref: http://www.gsp.com/cgi-bin/man.cgi?topic=bpf +import "syscall" + +func jumpGreaterThan(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 4, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGE+syscall.BPF_K, (lo), 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (lo), 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpLessThan(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, 
(hi), 6, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 3)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (lo), 2, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpNotEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 5, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 2, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +// this checks for a value inside a mask. The evaluation is equivalent to doing +// CLONE_NEWUSER & syscallMask == CLONE_NEWUSER +func jumpMaskEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 0, 6)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_ALU+syscall.BPF_AND, uint32(v))) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} diff --git a/seccomp/seccomp.go b/seccomp/seccomp.go new file mode 100644 index 000000000..78d7d8533 --- /dev/null +++ b/seccomp/seccomp.go @@ -0,0 +1,122 @@ +// Package seccomp provides native seccomp ( https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt ) support for Go. 
+package seccomp + +import ( + "syscall" + "unsafe" +) + +// Operator that is used for argument comparison. +type Operator int + +const ( + EqualTo Operator = iota + NotEqualTo + GreatherThan + LessThan + MaskEqualTo +) + +const ( + jumpJT = 0xff + jumpJF = 0xff + labelJT = 0xfe + labelJF = 0xfe +) + +const ( + pfLD = 0x0 + retKill = 0x00000000 + retTrap = 0x00030000 + retAllow = 0x7fff0000 + modeFilter = 0x2 + prSetNoNewPrivileges = 0x26 +) + +func actionErrno(errno uint32) uint32 { + return 0x00050000 | (errno & 0x0000ffff) +} + +var ( + secData = struct { + nr int32 + arch uint32 + insPointer uint64 + args [6]uint64 + }{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}} +) + +var isLittle = func() bool { + var ( + x = 0x1234 + p = unsafe.Pointer(&x) + p2 = (*[unsafe.Sizeof(0)]byte)(p) + ) + if p2[0] == 0 { + return false + } + return true +}() + +var endian endianSupport + +type endianSupport struct { +} + +func (e endianSupport) hi(i uint32) uint32 { + if isLittle { + return e.little(i) + } + return e.big(i) +} + +func (e endianSupport) low(i uint32) uint32 { + if isLittle { + return e.big(i) + } + return e.little(i) +} + +func (endianSupport) big(idx uint32) uint32 { + if idx >= 6 { + return 0 + } + return uint32(unsafe.Offsetof(secData.args)) + 8*idx +} + +func (endianSupport) little(idx uint32) uint32 { + if idx < 0 || idx >= 6 { + return 0 + } + return uint32(unsafe.Offsetof(secData.args)) + + uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch)) +} + +func prctl(option int, arg2, arg3, arg4, arg5 uintptr) error { + _, _, err := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) + if err != 0 { + return err + } + return nil +} + +func newSockFprog(filter []sockFilter) *sockFprog { + return &sockFprog{ + len: uint16(len(filter)), + filt: filter, + } +} + +type sockFprog struct { + len uint16 + filt []sockFilter +} + +func (s *sockFprog) set() error { + _, _, err := syscall.Syscall(syscall.SYS_PRCTL, 
uintptr(syscall.PR_SET_SECCOMP), + uintptr(modeFilter), uintptr(unsafe.Pointer(s))) + if err != 0 { + return err + } + return nil +} diff --git a/standard_init_linux.go b/standard_init_linux.go index 251c09f69..445c1fa29 100644 --- a/standard_init_linux.go +++ b/standard_init_linux.go @@ -99,5 +99,8 @@ func (l *linuxStandardInit) Init() error { if syscall.Getppid() != l.parentPid { return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) } + if err := finalizeSeccomp(l.config); err != nil { + return err + } return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/system/setns_linux.go b/system/setns_linux.go index a3c4cbb27..615ff4c82 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -21,16 +21,20 @@ var setNsMap = map[string]uintptr{ "linux/s390x": 339, } +var sysSetns = setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + +func SysSetns() uint32 { + return uint32(sysSetns) +} + func Setns(fd uintptr, flags uintptr) error { ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] if !exists { return fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) } - _, _, err := syscall.RawSyscall(ns, fd, flags, 0) if err != 0 { return err } - return nil }