From 8b93f9fb3c3ef655a31729702860b10ec5ec38d5 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 20 May 2022 10:39:41 +1000 Subject: [PATCH] seccomp: enosys: always return -ENOSYS for setup(2) on s390(x) On s390x, syscalls above 255 are multiplexed using the (now otherwise unused) setup(2) syscall (syscall number 0). If the kernel supports the syscall then it will correctly translate the syscall number such that seccomp will correctly detect it -- however, for unknown syscalls the syscall number remains unchanged. This can be verified by running the following program under strace: int main(void) { scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_TRAP); seccomp_load(ctx); return syscall(439, AT_FDCWD, "asdf", X_OK, 0); } Which will then die with the following signal (on pre-5.8 kernels): --- SIGSYS {si_signo=SIGSYS, si_code=SYS_SECCOMP, si_call_addr=0x3ffb3006c22, si_syscall=__NR_setup, si_arch=AUDIT_ARCH_S390X} --- (Note that the si_syscall is __NR_setup, not __NR_faccessat2.) As a result, the -ENOSYS handling we had previously did not work completely correctly on s390x because any syscall not supported by the kernel would be treated as syscall number 0 rather than the actual syscall number. Always returning -ENOSYS will not cause any issues because in all of the cases where this multiplexing occurs, seccomp will see the remapped syscall number -- and no userspace program will call setup(2) intentionally (the syscall has not existed in Linux for decades and was originally a hack used early in Linux init prior to spawning pid1 -- so you will get -ENOSYS from the kernel anyway). 
Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/patchbpf/enosys_linux.go | 48 ++++++++++++++----- .../seccomp/patchbpf/enosys_linux_test.go | 13 +++++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index dfb8a0a8e59..7d4ec6a42e5 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -72,6 +72,11 @@ import "C" var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS) +// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown +// syscalls will end up with this syscall number, so we need to explicitly +// return -ENOSYS for this syscall on those architectures. +const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0 + func isAllowAction(action configs.Action) bool { switch action { // Trace is considered an "allow" action because a good tracer should @@ -305,7 +310,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // directly from the arch code so we need to do it here. Sadly we can't // share this code between architecture branches. section := []bpf.Instruction{ - // load [0] + // load [0] (syscall number) bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4. } @@ -314,10 +319,37 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // No syscalls found for this arch -- skip it and move on. continue case 1: - // Get the only syscall in the map. - var sysno libseccomp.ScmpSyscall - for _, no := range maxSyscalls { + // Get the only syscall and scmpArch in the map. + var ( + scmpArch libseccomp.ScmpArch + sysno libseccomp.ScmpSyscall + ) + for arch, no := range maxSyscalls { sysno = no + scmpArch = arch + } + + switch scmpArch { + // Return -ENOSYS for setup(2) on s390(x). 
This syscall is used for + // multiplexing "large syscall number" syscalls, but if the syscall + // number is not known to the kernel then the syscall number is + // left unchanged (and because it is sysno=0, you'll end up with + // EPERM for syscalls the kernel doesn't know about). + // + // The actual setup(2) syscall is never used by userspace anymore + // (and hasn't existed for decades) outside of this multiplexing + // scheme so returning -ENOSYS is fine. + case libseccomp.ArchS390, libseccomp.ArchS390X: + section = append(section, []bpf.Instruction{ + // jne [setup=0],1 + bpf.JumpIf{ + Cond: bpf.JumpNotEqual, + Val: uint32(s390xMultiplexSyscall), + SkipTrue: 1, + }, + // ret [ENOSYS] + bpf.RetConstant{Val: retErrnoEnosys}, + }...) } // The simplest case just boils down to a single jgt instruction, @@ -349,12 +381,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // If we're on x86 we need to add a check for x32 and if we're in // the wrong mode we jump over the section. if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { - // Grab the only architecture in the map. - var scmpArch libseccomp.ScmpArch - for arch := range maxSyscalls { - scmpArch = arch - } - // Generate a prefix to check the mode. switch scmpArch { case libseccomp.ArchAMD64: @@ -512,7 +538,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // Prepend the load instruction for the architecture. programTail = append([]bpf.Instruction{ - // load [4] + // load [4] (architecture) bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4. }, programTail...) 
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go index 727800aa50c..e2d363a43bd 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) }) } + // If we're on s390(x) make sure you get -ENOSYS for the "setup" + // syscall (this is done to work around an issue with s390x's + // syscall multiplexing which results in unknown syscalls being a + // setup(2) invocation). + switch scmpArch { + case libseccomp.ArchS390, libseccomp.ArchS390X: + syscallTests = append(syscallTests, syscallTest{ + sysno: s390xMultiplexSyscall, + syscall: "setup", + expected: retErrnoEnosys, + }) + } + // Test syscalls in the explicit list. for _, test := range syscallTests { // Override the expected value in the two special cases.