From 3faaa761f70a853087b75083747c6e0f5d615f59 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Mon, 12 Aug 2019 13:26:05 -0400 Subject: [PATCH] i#2350 rseq: Add -disable_rseq option Adds a new option -disable_rseq, which returns -ENOSYS on any SYS_rseq system call. This is intended as a workaround for applications that do not satisfy DR's limitations for full rseq support. Adds a test that fails unless -disable_rseq is passed. Moves the rseq limitations list to a new section on rseq in the documentation. Issue: #2350 --- api/docs/bt.dox | 51 +++++++- api/docs/intro.dox | 5 + api/docs/release.dox | 27 +--- core/optionsx.h | 3 + core/unix/os.c | 10 +- suite/tests/CMakeLists.txt | 2 + suite/tests/linux/rseq_noncompliant.c | 144 +++++++++++++++++++++ suite/tests/linux/rseq_noncompliant.expect | 2 + 8 files changed, 218 insertions(+), 26 deletions(-) create mode 100644 suite/tests/linux/rseq_noncompliant.c create mode 100644 suite/tests/linux/rseq_noncompliant.expect diff --git a/api/docs/bt.dox b/api/docs/bt.dox index 2ba4d807f48..ab2e6f8db36 100644 --- a/api/docs/bt.dox +++ b/api/docs/bt.dox @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2011-2017 Google, Inc. All rights reserved. + * Copyright (c) 2011-2019 Google, Inc. All rights reserved. * Copyright (c) 2007-2009 VMware, Inc. All rights reserved. * **********************************************************/ @@ -52,6 +52,7 @@ following sections: - \ref sec_translation - \ref sec_predication - \ref sec_ldrex + - \ref sec_rseq - \ref sec_pcache - \ref bt_examples - \ref sec_startstop @@ -1283,6 +1284,54 @@ load/store pair into a macro-instruction, which prevents any loads and stores from being inserted, but also prevents any of the instructions involved from being instrumented. +*************************************************************************** +\htmlonly + +
+
+
+\endhtmlonly +\section sec_rseq Restartable Sequence Instrumentation Constraints + +The Linux kernel supports special code regions called restartable +sequences. This "rseq" feature is challenging to support under +instrumentation due to the tight restrictions on operations inside the +sequence. Instrumentation inserted in the sequence would need to be +designed to be restartable as well, with a single commit point. Meeting +such requirements is unrealistic for most instrumentation. Instead, DR +provides a "run twice" solution where the sequence is first executed as +regular code with regular instrumentation up to the commit point. Then the +sequence is restarted and executed without instrumentation to perform the +commit. + +This run-twice approach is subject to the following limitations: + +- The application must store an rseq_cs struct for each rseq region in a + section of its binary named "__rseq_cs", optionally with an "__rseq_cs_ptr_array" + section of pointers into the __rseq_cs section, per established conventions. + These sections must be located in loaded segments. +- Each rseq region's code must never be also executed as a non-restartable sequence. +- Each rseq region must make forward progress if its abort handler is always + called the first time it is executed. +- Each memory store instruction inside an rseq region must have no other side + effects: it must only write to memory and not to any registers. +- Each rseq region must end with a return instruction, and each abort handler + plus rseq code must combine into a callee following normal call-return + semantics. +- Each rseq region's code must end with a fall-through (non-control-flow) + instruction. +- Any helper function called from within an rseq region must have no side effects. +- The instrumented execution of the rseq region may not perfectly reflect + the native behavior of the application. 
The instrumentation will never see + the abort handler called, and memory addresses may be wrong if they are based on + the underlying cpuid and a migration occurred mid-region. These are minor and + acceptable for most tools (especially given that there is no better alternative). + +If an application does not satisfy these limitations, the \ref +op_disable_rseq "disable_rseq" runtime option may be used to return ENOSYS, +which can provide a workaround for applications which have fallback code +for kernels where rseq is not supported. + \ifnot vmsafe *************************************************************************** \htmlonly diff --git a/api/docs/intro.dox b/api/docs/intro.dox index 58797747474..4ddf11dea3f 100644 --- a/api/docs/intro.dox +++ b/api/docs/intro.dox @@ -1679,6 +1679,11 @@ Options available only in Code Manipulation mode and Memory Firewall mode This option requests that DynamoRIO convert sysenter into int 0x80 instead. See \ref sec_extlibs. + - \b -disable_rseq: \anchor op_disable_rseq + This option only applies to Linux. It returns -ENOSYS from the SYS_rseq + system call, forcing applications to fall back to code that does not + use restartable sequences. See \ref sec_rseq for more information. + - \b -multi_thread_exit: By default, DynamoRIO synchronizes with all remaining threads at process exit time and the process exit event executes with only diff --git a/api/docs/release.dox b/api/docs/release.dox index dd9768248d9..4c5ca7cf939 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -201,7 +201,7 @@ Further non-compatibility-affecting changes include: switching. - Added the function proc_avx512_enabled(). - Added support for applications using the Linux kernel's restartable sequence - ("rseq") feature, subject to the limitations listed in \ref sec_limit_platforms. + ("rseq") feature, subject to the limitations listed in \ref sec_rseq. - Added coherence support in drcachesim. 
- Added the function proc_num_opmask_registers(); - reg_get_value_ex() now supports reading AVX-512 mask registers. @@ -1636,28 +1636,9 @@ not supported. not fully decoded or encoded yet, and a few features are not yet ported: traces, clean call inlining and other optimizations, and several samples and provided tools. - - This release of DynamoRIO adds support for applications using the Linux kernel - restartable sequence ("rseq") feature, subject to the following limitations: - - The application must store an rseq_cs struct for each rseq region in a - section of its binary named "__rseq_cs", optionally with an "__rseq_cs_ptr_array" - section of pointers into the __rseq_cs section, per established conventions. - These sections must be located in loaded segments. - - Each rseq region's code must never be also executed as a non-restartable sequence. - - Each rseq region must make forward progress if its abort handler is always - called the first time it is executed. - - Each memory store instruction inside an rseq region must have no other side - effects: it must only write to memory and not to any registers. - - Each rseq region must end with a return instruction, and each abort handler - plus rseq code must combine into a callee following normal call-return - semantics. - - Each rseq region's code must end with a fall-through (non-control-flow) - instruction. - - Any helper function called from within an rseq region must have no side effects. - - The instrumented execution of the rseq region may not perfectly reflect - the native behavior of the application. The instrumentation will never see - the abort handler called, and memory addresses may be wrong if they are based on - the underlying cpuid and a migration occurred mid-region. These are minor and - acceptable for most tools (especially given that there is no better alternative). 
+ - This release of DynamoRIO includes support for applications using the Linux kernel + restartable sequence ("rseq") feature, subject to the limitations listed + in \ref sec_rseq. \subsection sec_limit_perf Performance Limitations diff --git a/core/optionsx.h b/core/optionsx.h index 19b647014cd..0c77ba42e8d 100644 --- a/core/optionsx.h +++ b/core/optionsx.h @@ -1618,6 +1618,9 @@ OPTION_DEFAULT(uint, early_inject_location, 4 /* INJECT_LOCATION_LdrDefault */, OPTION_DEFAULT(bool, hook_vsyscall, true, "hook vdso vsyscall if possible") /* PR 356503: workaround to allow clients to make syscalls */ OPTION_ALIAS(sysenter_is_int80, hook_vsyscall, false, STATIC, OP_PCACHE_GLOBAL) + OPTION_DEFAULT(bool, disable_rseq, false, "cause the restartable sequence SYS_rseq " + "system call to return -ENOSYS as a workaround for rseq features not " + "supportable by DR") #endif #ifdef UNIX OPTION_DEFAULT(bool, restart_syscalls, true, diff --git a/core/unix/os.c b/core/unix/os.c index 72d59c4ce67..087f31ddce1 100644 --- a/core/unix/os.c +++ b/core/unix/os.c @@ -7549,8 +7549,14 @@ pre_system_call(dcontext_t *dcontext) #ifdef LINUX case SYS_rseq: - /* Lazy rseq handling. */ - module_locate_rseq_regions(); + if (DYNAMO_OPTION(disable_rseq)) { + set_failure_return_val(dcontext, ENOSYS); + DODEBUG({ dcontext->expect_last_syscall_to_fail = true; }); + execute_syscall = false; + } else { + /* Lazy rseq handling. */ + module_locate_rseq_regions(); + } break; #endif diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 074ff31b03c..6ebf1ccc355 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -3522,6 +3522,8 @@ if (UNIX) tobuild_api(api.rseq linux/rseq.c "" "" OFF OFF) link_with_pthread(api.rseq) append_property_string(TARGET api.rseq COMPILE_FLAGS "-DRSEQ_TEST_ATTACH") + # Test non-compliant code with our workaround flag. 
+ tobuild_ops(linux.rseq_noncompliant linux/rseq_noncompliant.c "-disable_rseq" "") endif () else (UNIX) add_exe(win32.infloop win32/infloop.c) diff --git a/suite/tests/linux/rseq_noncompliant.c b/suite/tests/linux/rseq_noncompliant.c new file mode 100644 index 00000000000..40e2549a8a4 --- /dev/null +++ b/suite/tests/linux/rseq_noncompliant.c @@ -0,0 +1,144 @@ +/* ********************************************************** + * Copyright (c) 2019 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of VMware, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. 
OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/* This test uses rseq but does not comply with the conventions that DR requires.
+ * It shares similar code to the other rseq tests but not enough to try to share
+ * the separate bits of code.
+ */
+
+#include "tools.h"
+#ifndef LINUX
+#    error Only Linux is supported.
+#endif
+/* TODO i#2350: Port this to other platforms and bitwidths.  There is a lot of
+ * assembly which makes that non-trivial work.
+ */
+#if !defined(X86) || !defined(X64)
+#    error Only x86_64 is supported.
+#endif
+#include "../../core/unix/include/syscall.h"
+#ifndef HAVE_RSEQ
+#    error The linux/rseq header is required.
+#endif
+#include <linux/rseq.h>
+#include <errno.h>
+#include <assert.h>
+
+#define EXPANDSTR(x) #x
+#define STRINGIFY(x) EXPANDSTR(x)
+
+#define RSEQ_SIG 0x90909090 /* nops to disasm nicely */
+
+/* This cannot be a stack-local variable, as the kernel will force SIGSEGV on a syscall
+ * if it can't read this struct.  And for multiple threads it should be in TLS.
+ */
+static __thread volatile struct rseq rseq_tls;
+
+#ifdef RSEQ_TEST_ATTACH
+static volatile int exit_requested;
+static void *thread_ready;
+#endif
+
+int
+test_rseq(void)
+{
+    /* We use static to avoid stack reference issues with our extra frame inside the asm.
+     * Returns the cumulative number of abort-handler runs ("restarts"). */
+    static __u32 id = RSEQ_CPU_ID_UNINITIALIZED;
+    static int restarts = 0;
+    __asm__ __volatile__(
+        /* We deliberately do NOT put our data into an __rseq_cs section. */
+        ".pushsection .data, \"aw\"\n\t"
+        ".balign 32\n\t"
+        "1:\n\t"
+        ".long 0, 0\n\t"          /* version, flags */
+        ".quad 2f, 3f-2f, 4f\n\t" /* start_ip, post_commit_offset, abort_ip */
+        ".popsection\n\t"
+
+        /* We do NOT structure this as a call. */
+        "6:\n\t"
+        /* Store the entry into the ptr. */
+        "leaq 1b(%%rip), %%rax\n\t"
+        "movq %%rax, %0\n\t"
+        /* Test "falling into" the rseq region. */
+
+        /* Restartable sequence.
+         * If I pause in gdb in here, often the thread is migrated and the abort
+         * handler invoked: a simple way to test a restart natively.
+         */
+        "2:\n\t"
+        "mov %3, %%rax\n\t"
+        "mov %%rax, %1\n\t"
+
+        /* Post-commit. */
+        "3:\n\t"
+        "jmp 5f\n\t"
+
+        /* Abort handler. */
+        /* clang-format off */ /* (avoid indenting next few lines) */
+        ".long " STRINGIFY(RSEQ_SIG) "\n\t"
+        "4:\n\t"
+        "addl $1, %2\n\t"
+        "jmp 6b\n\t"
+
+        /* Clear the ptr. */
+        "5:\n\t"
+        "leaq 1b(%%rip), %%rax\n\t"
+        "movq $0, %0\n\t"
+        /* clang-format on */
+        /* "+m" for restarts: the abort handler's addl both reads and writes it. */
+        : "=m"(rseq_tls.rseq_cs), "=m"(id), "+m"(restarts)
+        : "m"(rseq_tls.cpu_id)
+        : "rax", "memory");
+    assert(id != RSEQ_CPU_ID_UNINITIALIZED);
+    return restarts;
+}
+
+int
+main()
+{
+    int restart_count = 0;
+    rseq_tls.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+    int res = syscall(SYS_rseq, &rseq_tls, sizeof(rseq_tls), 0, RSEQ_SIG);
+    if (res == 0) {
+        restart_count = test_rseq();
+    } else {
+        /* Linux kernel 4.18+ is required; DR's -disable_rseq also yields ENOSYS. */
+        assert(errno == ENOSYS);
+        /* Make the test pass. */
+        restart_count = 1;
+    }
+    /* We expect 0 restart_count natively (ok, tiny chance of >0), and 1 under DR. */
+    print("Saw %s restarts\n", restart_count > 0 ? "some" : "no");
+    print("All done\n");
+    return 0;
+}
diff --git a/suite/tests/linux/rseq_noncompliant.expect b/suite/tests/linux/rseq_noncompliant.expect
new file mode 100644
index 00000000000..3ae711b1a48
--- /dev/null
+++ b/suite/tests/linux/rseq_noncompliant.expect
@@ -0,0 +1,2 @@
+Saw some restarts
+All done