Skip to content

Commit

Permalink
i#2350 rseq: run restartable sequences natively (#2396)
Browse files Browse the repository at this point in the history
Adds preliminary handling for restartable sequences in Linux, a kernel
feature meant for per-cpu critical regions.  The code here targets a
particular Linux kernel patch, but tries to be general to make it easy to
tweak for other flavors.

Adds a new option -rseq_sysnum to enable this support and to specify the
syscall number for the kernel extension.

For initial handling, we execute the restartable sequence natively.  It is
very complex to build regular blocks and support instrumentation, so we
start with native execution to at least allow correct app execution even if
tools miss some code.  We assume each sub-sequence is entered via a call
and exited via a return, and we use our existing native_exec gateway and
resumption mechanisms.
  • Loading branch information
derekbruening authored Apr 28, 2017
1 parent 5a209f9 commit cda88be
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 4 deletions.
4 changes: 2 additions & 2 deletions core/arch/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -4667,8 +4667,8 @@ build_native_exec_bb(dcontext_t *dcontext, build_bb_t *bb)
ASSERT(bb->app_interp);
ASSERT(!bb->record_translation);
ASSERT(bb->start_pc != NULL);
/* vmlist must start out empty (or N/A) */
ASSERT(bb->vmlist == NULL);
/* vmlist must start out empty (or N/A). For clients it may have started early. */
ASSERT(bb->vmlist == NULL || !bb->record_vmlist || bb->checked_start_vmarea);
if (TEST(FRAG_HAS_TRANSLATION_INFO, bb->flags))
bb->flags &= ~FRAG_HAS_TRANSLATION_INFO;
bb->native_exec = true;
Expand Down
1 change: 1 addition & 0 deletions core/native_exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ native_exec_init(void)
ASSERT(retstub_end == local_start +
MAX_NATIVE_RETSTACK * BACK_FROM_NATIVE_RETSTUB_SIZE);
});
native_exec_os_init();
}

void
Expand Down
10 changes: 9 additions & 1 deletion core/optionsx.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* *******************************************************************************
* Copyright (c) 2010-2016 Google, Inc. All rights reserved.
* Copyright (c) 2010-2017 Google, Inc. All rights reserved.
* Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved.
* Copyright (c) 2003-2010 VMware, Inc. All rights reserved.
* *******************************************************************************/
Expand Down Expand Up @@ -1557,6 +1557,14 @@
OPTION_DEFAULT(bool, hook_vsyscall, true, "hook vdso vsyscall if possible")
/* PR 356503: workaround to allow clients to make syscalls */
OPTION_ALIAS(sysenter_is_int80, hook_vsyscall, false, STATIC, OP_PCACHE_GLOBAL)
/* i#2350: we support restartable sequence ("rseq") Linux kernel extensions,
* but as they are not in the mainline kernel we need the number to be passed
* in. If left as 0 the support is disabled.
* Current support is preliminary: we execute them natively.
*/
/* XXX: I'd prefer -1 to disable but there's no signed option type. */
OPTION_DEFAULT(uint, rseq_sysnum, 0,
"system call number for restartable sequences; 0 disables")
#endif
#ifdef UNIX
OPTION_DEFAULT(bool, restart_syscalls, true,
Expand Down
2 changes: 2 additions & 0 deletions core/os_shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,8 @@ bool query_memory_ex_from_os(const byte *pc, OUT dr_mem_info_t *info);
void os_check_new_app_module(dcontext_t *dcontext, app_pc pc);
#endif

void native_exec_os_init(void);

bool get_stack_bounds(dcontext_t *dcontext, byte **base, byte **top);

/* Does a safe_read of *src_ptr into dst_var, returning true for success. We
Expand Down
142 changes: 141 additions & 1 deletion core/unix/os.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
#include <fcntl.h>
#include "../globals.h"
#include "../hashtable.h"
#include "../native_exec.h"
#include <string.h>
#include <unistd.h> /* for write and usleep and _exit */
#include <limits.h>
Expand Down Expand Up @@ -285,6 +286,9 @@ static bool handle_app_mremap(dcontext_t *dcontext, byte *base, size_t size,
uint old_prot, uint old_type);
static void handle_app_brk(dcontext_t *dcontext, byte *lowest_brk/*if known*/,
byte *old_brk, byte *new_brk);
static void restartable_region_init(void);
static bool handle_restartable_region_syscall_pre(dcontext_t *dcontext);
static void handle_restartable_region_syscall_post(dcontext_t *dcontext, bool success);
#endif

/* full path to our own library, used for execve */
Expand Down Expand Up @@ -7533,12 +7537,16 @@ pre_system_call(dcontext_t *dcontext)
#endif

default: {
#ifdef LINUX
execute_syscall = handle_restartable_region_syscall_pre(dcontext);
#endif
#ifdef VMX86_SERVER
if (is_vmkuw_sysnum(dcontext->sys_num)) {
execute_syscall = vmkuw_pre_system_call(dcontext);
break;
}
#endif
break;
}

} /* end switch */
Expand Down Expand Up @@ -8528,13 +8536,17 @@ post_system_call(dcontext_t *dcontext)
break;
#endif

#ifdef VMX86_SERVER
default:
#ifdef LINUX
handle_restartable_region_syscall_post(dcontext, success);
#endif
#ifdef VMX86_SERVER
if (is_vmkuw_sysnum(sysnum)) {
vmkuw_post_system_call(dcontext);
break;
}
#endif
break;

} /* switch */

Expand Down Expand Up @@ -10127,6 +10139,134 @@ __umoddi3(uint64 dividend, uint64 divisor)
*/
#endif /* X86_32 */

/****************************************************************************
* Kernel-restartable sequences
*/

#ifdef LINUX
/* Support for Linux kernel extensions for per-cpu critical regions.
* Xref https://lwn.net/Articles/649288/
* Some of this may vary on different kernels.
* The way it works is that the app tells the kernel the bounds of a
* code region within which a context switch should restart the code.
*
* As these sequences are complex to handle (it would be much simpler
* if they used existing mechanisms like signals!), we start out by
* running their code natively. We assume it is "well-behaved" and
* we'll get control back. These code sequences will be invisible to
* tools: we'll live with the lack of instrumentation for now as a
* tradeoff for getting correct app execution.
*
* Unfortunately we can't easily have a regression test in the main
* repository as mainstream kernels do not have this feature.
*/

/* We support a syscall of this form, with number DYNAMO_OPTION(rseq_sysnum):
* SYSCALL_DEFINE4(rseq, int, op, long, val1, long, val2, long, val3)
*/
/* Set operation (the "op" argument). The three values that follow op are,
 * in order: app_pc start, app_pc end, app_pc restart.
 */
# define RSEQ_SET_CRITICAL 1
/* Get operation (the "op" argument). The three values that follow op are
 * out-pointers, in order: app_pc *start, app_pc *end, app_pc *restart.
 */
# define RSEQ_GET_CRITICAL 3

/* Start and end of the app's restartable region as last queried from the
 * kernel at init time or as last set by the app through the rseq syscall.
 * Both are NULL when the init-time query fails. They are written by
 * restartable_region_init() and by the rseq post-syscall handler.
 */
static app_pc app_restart_region_start;
static app_pc app_restart_region_end;

/* Queries the kernel for a restartable region the app may already have
 * registered and, if a non-empty one exists, adds it to native_exec_areas so
 * that its code is executed natively.
 *
 * Does nothing when -rseq_sysnum is 0 (support disabled). When the query
 * fails, the cached region bounds are reset to NULL.
 */
static void
restartable_region_init(void)
{
    app_pc handler_pc;
    int rc;
    if (DYNAMO_OPTION(rseq_sysnum) != 0) {
        /* Ask the kernel for the current region: the three out-pointers
         * receive the start, the end, and the restart handler.
         */
        rc = dynamorio_syscall(DYNAMO_OPTION(rseq_sysnum), 4, RSEQ_GET_CRITICAL,
                               &app_restart_region_start,
                               &app_restart_region_end,
                               &handler_pc);
        if (rc == 0) {
            LOG(GLOBAL, LOG_TOP, 1, "Restartable region at init: " PFX"-" PFX" @" PFX"\n",
                app_restart_region_start, app_restart_region_end, handler_pc);
            /* Only a non-NULL, non-empty region is registered as native. */
            if (app_restart_region_start != NULL &&
                app_restart_region_end > app_restart_region_start) {
                vmvector_add(native_exec_areas, app_restart_region_start,
                             app_restart_region_end, NULL);
            }
        } else {
            /* The only failure we expect is a kernel without the extension. */
            ASSERT(rc == -ENOSYS);
            LOG(GLOBAL, LOG_TOP, 1, "No restartable region at init\n");
            app_restart_region_start = NULL;
            app_restart_region_end = NULL;
        }
    }
}

/* Pre-syscall hook for the rseq syscall (number DYNAMO_OPTION(rseq_sysnum)).
 *
 * Records the arguments that the post-syscall handler needs: the operation
 * and, for a set operation, the region start and end. For the syscall
 * signature rseq(op, val1, val2, val3), a set operation passes start as val1
 * and end as val2, so these are syscall parameters 0, 1 and 2 respectively.
 *
 * Always returns true: the caller stores the result as its "execute the
 * syscall" flag and we never want to skip this syscall.
 */
static bool
handle_restartable_region_syscall_pre(dcontext_t *dcontext)
{
    if (DYNAMO_OPTION(rseq_sysnum) == 0 ||
        dcontext->sys_num != DYNAMO_OPTION(rseq_sysnum))
        return true;
    /* We do the work in post.  Save every argument post will consume, as the
     * argument slots are not guaranteed to still hold the app's values once
     * the syscall has returned (e.g., where the first argument register is
     * also the return value register).
     */
    dcontext->sys_param0 = sys_param(dcontext, 0); /* op */
    dcontext->sys_param1 = sys_param(dcontext, 1); /* val1: region start */
    dcontext->sys_param2 = sys_param(dcontext, 2); /* val2: region end */
    return true;
}

/* Post-syscall hook for the rseq syscall.
 *
 * On a successful RSEQ_SET_CRITICAL, replaces the cached restartable region
 * with the one the app just registered: the old region (if any) is removed
 * from native_exec_areas and its fragments flushed, then the new region (if
 * non-NULL and non-empty) is added to native_exec_areas and any existing
 * fragments in it are flushed.
 *
 * "success" is the caller's verdict on the syscall result; nothing is done
 * for a failed syscall, a different syscall, or when support is disabled.
 *
 * Though there is a race, it is hard to imagine the app executing correctly
 * without first checking the return value of the syscall.  Thus we handle
 * rseq in post and avoid having to emulate the kernel's argument checking.
 */
static void
handle_restartable_region_syscall_post(dcontext_t *dcontext, bool success)
{
    int op;
    if (DYNAMO_OPTION(rseq_sysnum) == 0 ||
        dcontext->sys_num != DYNAMO_OPTION(rseq_sysnum) ||
        !success)
        return;
    /* Use the values saved in pre rather than re-reading the argument slots. */
    op = (int) dcontext->sys_param0;
    if (op == RSEQ_SET_CRITICAL) {
        app_pc start = (app_pc) dcontext->sys_param1;
        app_pc end = (app_pc) dcontext->sys_param2;
        LOG(THREAD, LOG_VMAREAS|LOG_SYSCALLS, 2,
            "syscall: set rseq region to " PFX"-" PFX"\n", start, end);
        /* An unlink flush should be good enough: we simply don't support
         * suddenly setting an rseq region for some fallthrough code after the
         * syscall.
         */
        if (app_restart_region_start != NULL &&
            app_restart_region_end > app_restart_region_start) {
            vmvector_remove(native_exec_areas, app_restart_region_start,
                            app_restart_region_end);
            /* Flush existing code so it no longer goes native. */
            flush_fragments_from_region(dcontext, app_restart_region_start,
                                        app_restart_region_end - app_restart_region_start,
                                        false/*don't force synchall*/);
        }
        /* The cached bounds are unprotected only for the duration of the
         * update.
         */
        SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
        app_restart_region_start = start;
        app_restart_region_end = end;
        SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
        if (app_restart_region_start != NULL &&
            app_restart_region_end > app_restart_region_start) {
            vmvector_add(native_exec_areas, app_restart_region_start,
                         app_restart_region_end, NULL);
            /* We have to flush any existing code in the region. */
            flush_fragments_from_region(dcontext, app_restart_region_start,
                                        app_restart_region_end - app_restart_region_start,
                                        false/*don't force synchall*/);
        }
    }
}
#endif /* LINUX */

/* OS-specific portion of native_exec initialization, invoked from
 * native_exec_init().  On Linux this sets up restartable-sequence support;
 * on other UNIX flavors there is nothing to do.
 */
void
native_exec_os_init(void)
{
#if defined(LINUX)
    restartable_region_init();
#endif /* LINUX */
}

#endif /* !NOT_DYNAMORIO_CORE_PROPER: around most of file, to exclude preload */

/****************************************************************************
Expand Down
6 changes: 6 additions & 0 deletions core/win32/os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,12 @@ os_init(void)
os_get_current_dir(cwd, BUFFER_SIZE_ELEMENTS(cwd));
}

/* OS-specific portion of native_exec initialization.  There is currently no
 * Windows-specific work to do, so this is a deliberate no-op.
 */
void
native_exec_os_init(void)
{
    /* Nothing yet. */
    return;
}

static void
print_mem_stats()
{
Expand Down

0 comments on commit cda88be

Please sign in to comment.