From 83043cde2a5d19dc0c719f135bb985c15f487d33 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Tue, 20 Aug 2019 11:10:26 -0400 Subject: [PATCH 1/4] i#2350 rseq: Use a local copy for native execution Eliminates the call-return reliance for the native execution step of rseq support. Makes a local copy of the sequence right inside the sequence-ending block and executes it. The sequence is inserted as additional instructions and is then mangled normally (mangling changes are assumed to be restartable), but it is not passed to clients. Any exits are regular block exits, resulting in a block with many exits. The prior call-return scheme is left under a temporary option -rseq_assume_call, as a failsafe in case there are stability problems discovered with this native execution implementation. Once we are happy with the new scheme we can remove the option. To make the local copy an rseq region, the per-thread rseq_cs address is identified by watching system calls. For attach, it is identified by searching the possible static TLS offsets. The assumption of a constant offset is documented and verified. The rseq_cs's abort handler is a new exit added with the app's signature as data just before it, hidden in the operands of a nop instruction to avoid problems with decoding the fragment. A local jump skips over the data and exit. A new rseq_cs structure is allocated for each sequence-ending fragment. It is stored in a hashtable in the rseq module, to avoid complexities and overhead of adding an additional fragment_t or "subclass" field. A new flag is set to trigger calling into the rseq module on fragment deletion. The rseq_cs fields are filled in via a new post-emit control point, using information stored in labels during mangling. The pointer to the rseq_cs is inserted with a dummy value and patched in this new control point using a new utility routine patch_mov_immed_ptrsz(). To avoid crashing due to invalid rseq bounds after freeing the rseq_cs structure, the rseq pointer is cleared explicitly on completion, and on midpoint exit by the fragment deletion hook along with a hook on the shared fragment flushtime update, to ensure all threads are covered. The rseq test is augmented and expanded. An invalid instruction is added to properly test the abort handler, under a conditional to allow testing each sequence both to completion and on abort. Future work is properly handling a midpoint exit during the instrumentation execution: we need to invoke the native version as well. Adding aarchxx support is also future work: the patch_mov_immed_ptrsz(), the writes to the rseq struct in TLS, and the rseq tests are currently x86-only. Issue: #2350 --- api/docs/bt.dox | 7 +- core/arch/aarchxx/mangle.c | 7 + core/arch/arch.h | 6 + core/arch/arch_exports.h | 25 +- core/arch/instr.h | 8 + core/arch/mangle_shared.c | 585 ++++++++++++++++++++++++++-------- core/arch/mangle_utils.c | 2 +- core/arch/steal_reg.h | 6 + core/arch/x86/mangle.c | 31 ++ core/emit.c | 7 +- core/fragment.c | 8 + core/fragment.h | 10 +- core/globals.h | 1 + core/optionsx.h | 3 + core/unix/os.c | 23 +- core/unix/os_exports.h | 23 ++ core/unix/rseq_linux.c | 236 +++++++++++++- core/unix/rseq_linux.h | 3 + suite/tests/linux/rseq.c | 253 +++++++++++---- suite/tests/linux/rseq.expect | 1 - 20 files changed, 1026 insertions(+), 219 deletions(-) diff --git a/api/docs/bt.dox b/api/docs/bt.dox index 8c35961b283..d3a806f83b5 100644 --- a/api/docs/bt.dox +++ b/api/docs/bt.dox @@ -1306,10 +1306,13 @@ commit. 
This run-twice approach is subject to the following limitations: +- Only x86 is supported for now (no arm or aarch64 support yet). - The application must store an rseq_cs struct for each rseq region in a section of its binary named "__rseq_cs", optionally with an "__rseq_cs_ptr_array" section of pointers into the __rseq_cs section, per established conventions. These sections must be located in loaded segments. +- The application must use static thread-local storage for its struct rseq registrations. +- The application must use the same signature for every rseq system call. - Each rseq region's code must never be also executed as a non-restartable sequence. - Each rseq region must handle being directly restarted without its abort handler being called (with the machine state restored). @@ -1317,11 +1320,9 @@ This run-twice approach is subject to the following limitations: effects: it must only write to memory and not to any registers. For example, a push instruction which both writes to memory and the stack pointer register is not supported. -- Each rseq region must end with a return instruction, and each abort handler - plus rseq code must combine into a callee following normal call-return - semantics. - Each rseq region's code must end with a fall-through (non-control-flow) instruction. +- No indirect branches that do not exit the rseq region are allowed inside. - Each rseq region must be entered only from the top, with no branches from outside the region targeting a point inside the region. - No system calls are allowed inside rseq regions. diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c index 6cdaa9e3605..f6e94b7cade 100644 --- a/core/arch/aarchxx/mangle.c +++ b/core/arch/aarchxx/mangle.c @@ -1411,6 +1411,13 @@ insert_mov_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_esti #endif } +void +patch_mov_immed_arch(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *first, + instr_t *last) +{ + ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#1551, i#1569 */ +} + void insert_push_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_estimate, ptr_int_t val, instrlist_t *ilist, instr_t *instr, diff --git a/core/arch/arch.h b/core/arch/arch.h index 88f57928ae5..494f9868388 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -504,6 +504,9 @@ insert_mov_immed_ptrsz(dcontext_t *dcontext, ptr_int_t val, opnd_t dst, instrlist_t *ilist, instr_t *instr, OUT instr_t **first, OUT instr_t **last); void +patch_mov_immed_ptrsz(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *first, + instr_t *last); +void insert_push_immed_ptrsz(dcontext_t *dcontext, ptr_int_t val, instrlist_t *ilist, instr_t *instr, OUT instr_t **first, OUT instr_t **last); void @@ -535,6 +538,9 @@ insert_mov_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_esti ptr_int_t val, opnd_t dst, instrlist_t *ilist, instr_t *instr, OUT instr_t **first, OUT instr_t **last); void +patch_mov_immed_arch(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *first, + instr_t *last); +void insert_push_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_estimate, ptr_int_t val, instrlist_t *ilist, instr_t *instr, OUT instr_t **first, OUT instr_t **last); diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index 16b47472f32..34084044f9a 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -572,6 +572,16 @@ atomic_add_exchange_int64(volatile int64 *var, int64 value) # elif defined(AARCH64) +# define ATOMIC_1BYTE_WRITE(target, value, 
hot_patch) \ + do { \ + ASSERT(sizeof(value) == 1); \ + /* Not currently used to write code */ \ + ASSERT_CURIOSITY(!hot_patch); \ + __asm__ __volatile__("strb %w0, [%1]" \ + : \ + : "r"(value), "r"(target) \ + : "memory"); \ + } while (0) # define ATOMIC_4BYTE_WRITE(target, value, hot_patch) \ do { \ ASSERT(sizeof(value) == 4); \ @@ -726,6 +736,14 @@ atomic_dec_becomes_zero(volatile int *var) # elif defined(ARM) +# define ATOMIC_1BYTE_WRITE(target, value, hot_patch) \ + do { \ + ASSERT(sizeof(value) == 1); \ + __asm__ __volatile__("strb %0, [%1]" \ + : \ + : "r"(value), "r"(target) \ + : "memory"); \ + } while (0) # define ATOMIC_4BYTE_WRITE(target, value, hot_patch) \ do { \ ASSERT(sizeof(value) == 4); \ @@ -1645,7 +1663,9 @@ d_r_decode_init(void); # define STUB_COARSE_DIRECT_SIZE(flags) \ (FRAG_IS_32(flags) ? STUB_COARSE_DIRECT_SIZE32 : STUB_COARSE_DIRECT_SIZE64) -/* writes nops into the address range */ +/* Writes nops into the address range. + * XXX: Better to use the newer multi-byte nops. + */ # define SET_TO_NOPS(isa_mode, addr, size) memset(addr, 0x90, size) /* writes debugbreaks into the address range */ # define SET_TO_DEBUG(addr, size) memset(addr, 0xcc, size) @@ -2255,6 +2275,9 @@ instr_supports_simple_mangling_epilogue(dcontext_t *dcontext, instr_t *inst); void float_pc_update(dcontext_t *dcontext); +void +mangle_finalize(dcontext_t *dcontext, instrlist_t *ilist, fragment_t *f); + /* in retcheck.c */ #ifdef CHECK_RETURNS_SSE2 void diff --git a/core/arch/instr.h b/core/arch/instr.h index 34c71c3758c..83fce7e9155 100644 --- a/core/arch/instr.h +++ b/core/arch/instr.h @@ -197,6 +197,14 @@ enum { # ifdef WINDOWS /* used to indicate that a syscall should be executed via shared syscall */ INSTR_SHARED_SYSCALL = 0x01000000, +# else + /* Indicates an instruction that's part of the rseq endpoint. We use this in + * instrlist_t.flags (sort of the same namespace: INSTR_OUR_MANGLING is used there, + * but also EDI_VAL_*) and as a version of DR_NOTE_RSEQ that survives encoding + * (seems like we could store notes for labels in another field so they do + * in fact survive: a union with instr_t.translation?). + */ + INSTR_RSEQ_ENDPOINT = 0x01000000, # endif # ifdef CLIENT_INTERFACE diff --git a/core/arch/mangle_shared.c b/core/arch/mangle_shared.c index 893aa6671da..febe5a42fe7 100644 --- a/core/arch/mangle_shared.c +++ b/core/arch/mangle_shared.c @@ -43,6 +43,7 @@ #include "instr_create.h" #include "instrument.h" /* for insert_get_mcontext_base */ #include "decode_fast.h" /* for decode_next_pc */ +#include "disassemble.h" #ifdef ANNOTATIONS # include "../annotations.h" @@ -534,6 +535,14 @@ insert_meta_call_vargs(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, * M A N G L I N G R O U T I N E S */ +/* This routine is not shared with drdecode, so it's here instead of mangle_utils.c. */ +void +patch_mov_immed_ptrsz(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *first, + instr_t *last) +{ + patch_mov_immed_arch(dcontext, val, pc, first, last); +} + app_pc get_app_instr_xl8(instr_t *instr) { @@ -773,9 +782,378 @@ mangle_syscall_code(dcontext_t *dcontext, fragment_t *f, byte *pc, bool skip) #endif /* UNIX */ #ifdef LINUX -/* Returns whether it destroyed "instr". */ +/*************************************************************************** + * Rseq (restartable sequence) mangling. 
+ */ +enum { + DR_RSEQ_LABEL_START = 0, + DR_RSEQ_LABEL_END = 1, + DR_RSEQ_LABEL_ABORT = 2, + DR_RSEQ_LABEL_CS = 3, +}; + +static instr_t * +mangle_rseq_create_label(dcontext_t *dcontext, int type, ptr_uint_t data) +{ + instr_t *label = INSTR_CREATE_label(dcontext); + instr_set_note(label, (void *)DR_NOTE_RSEQ); + /* XXX: The note doesn't surivive encoding, so we also use a flag. See comment in + * instr.h by this flag: maybe we should move a label's note somewhere persistent? + */ + label->flags |= INSTR_RSEQ_ENDPOINT; + dr_instr_label_data_t *label_data = instr_get_label_data_area(label); + label_data->data[0] = type; + label_data->data[1] = data; + return label; +} + +/* May modify next_instr. */ +/* TODO i#2350: Remove this once we are sure of the stability of + * mangle_rseq_insert_native_sequence(). + */ +static void +mangle_rseq_insert_call_sequence(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, + instr_t *next_instr, uint *flags INOUT, app_pc start, + app_pc end, app_pc handler, reg_id_t scratch_reg, + bool *reg_written, int reg_written_count) +{ + /* See the big "We just ran the instrumented version" comment below. */ + LOG(THREAD, LOG_INTERP, 4, "mangle: inserting call to native rseq " PFX "\n", start); + RSTATS_INC(num_rseq_native_calls_inserted); +# ifdef X86 + /* Create a scratch register. */ + if (SCRATCH_ALWAYS_TLS()) { + PRE(ilist, next_instr, + instr_create_save_to_tls(dcontext, scratch_reg, TLS_REG0_SLOT)); + insert_get_mcontext_base(dcontext, ilist, next_instr, scratch_reg); + } else { + PRE(ilist, next_instr, + instr_create_save_to_dcontext(dcontext, scratch_reg, REG0_OFFSET)); + insert_mov_immed_ptrsz(dcontext, (ptr_int_t)dcontext, + opnd_create_reg(scratch_reg), ilist, next_instr, NULL, + NULL); + } + if (reg_written_count > 0) { + /* Restore the entry state we preserved earlier. */ + int i; + for (i = 0; i < DR_NUM_GPR_REGS; i++) { + if (reg_written[i]) { + size_t offs = offsetof(dcontext_t, rseq_entry_state) + sizeof(reg_t) * i; + PRE(ilist, next_instr, + XINST_CREATE_load(dcontext, + opnd_create_reg(DR_REG_START_GPR + (reg_id_t)i), + OPND_CREATE_MEMPTR(scratch_reg, offs))); + } + } + } + + /* For simplicity in this first version of the code, we assume call-return + * semantics for the rseq region. We create an extra frame + * and assume that causes no problems. We assume the native invocation will + * come back to us. + * TODO i#2350: Make a local copy of the rseq code so we can arrange for a + * guaranteed return on (any) exit from the region, and use relative jumps to + * avoid needing a scratch register (though on x86 we could call through TLS). + * We would transform all mid-point exits into capture points. This gets rid + * of the call-return assumptions and the extra frame. + */ + instr_t check; + instr_init(dcontext, &check); + if (decode_cti(dcontext, end, &check) == NULL || !instr_is_return(&check)) { + REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), + get_application_pid(), + "Rseq sequences must end with a return"); + ASSERT_NOT_REACHED(); + } + instr_free(dcontext, &check); + /* We assume that by making this a block end, clients will restore app state + * before this native invocation. + * TODO i#2350: Take some further action to better guarantee this in the face + * of future drreg optimizations, etc. Do we need new interface features, or + * do we live with a fake app jump or sthg? + */ + /* A direct call may not reach, so we need an indirect call. We use a TLS slot + * to avoid needing a dead register. 
+ */ + insert_mov_immed_ptrsz(dcontext, (ptr_int_t)start, opnd_create_reg(scratch_reg), + ilist, next_instr, NULL, NULL); + if (SCRATCH_ALWAYS_TLS()) { + PRE(ilist, next_instr, + instr_create_save_to_tls(dcontext, scratch_reg, TLS_REG1_SLOT)); + } else { + PRE(ilist, next_instr, + instr_create_save_to_dcontext(dcontext, scratch_reg, REG1_OFFSET)); + } + /* Restore the scratch register. */ + if (SCRATCH_ALWAYS_TLS()) { + PRE(ilist, next_instr, + instr_create_restore_from_tls(dcontext, scratch_reg, TLS_REG0_SLOT)); + } else { + PRE(ilist, next_instr, + instr_create_restore_from_dcontext(dcontext, scratch_reg, REG0_OFFSET)); + } + /* Set up the frame and stack alignment. We assume the rseq code was a leaf + * function and that rsp is 16-aligned now. + * TODO i#2350: If we stick with an extra call frame, it would be better to + * spill rsp and hard-align it using a bitmask to ensure alignment; however, + * see above where we hope to eliminate the call-return assumption altogether. + */ + instrlist_meta_preinsert( + ilist, next_instr, + XINST_CREATE_sub(dcontext, opnd_create_reg(DR_REG_RSP), OPND_CREATE_INT32(8))); + instrlist_meta_preinsert( + ilist, next_instr, + INSTR_CREATE_call_ind(dcontext, + SCRATCH_ALWAYS_TLS() + ? opnd_create_tls_slot(os_tls_offset(TLS_REG1_SLOT)) + : opnd_create_dcontext_field(dcontext, REG1_OFFSET))); + instrlist_meta_preinsert( + ilist, next_instr, + XINST_CREATE_add(dcontext, opnd_create_reg(DR_REG_RSP), OPND_CREATE_INT32(8))); +# else + /* TODO i#2350: Add non-x86 support. We need to pay particular attention + * to the stolen register. If we do a local copy (with no callouts) we could + * mangle it. We also cannot do an indirect call through anything but a + * register and thus need a dead register for the call-return approach, but + * that disappears once DR uses a local copy. + */ + REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), + get_application_pid(), + "Rseq is not yet supported for non-x86"); + ASSERT_NOT_REACHED(); +# endif +} + +/* May modify next_instr. */ +static void +mangle_rseq_insert_native_sequence(dcontext_t *dcontext, instrlist_t *ilist, + instr_t *instr, INOUT instr_t **next_instr, + uint *flags INOUT, app_pc start, app_pc end, + app_pc handler, reg_id_t scratch_reg, + bool *reg_written, int reg_written_count) +{ + /* We just ran the instrumented version of the rseq code, with the stores + * removed. Now we need to invoke it again natively for real. We would prefer + * to invoke the abort handler, as it may perform some setup, but in too many + * cases it is truly an "abort" handler that just exits rather than a "restart + * handler". Furthermore, to support executing a copy of the code natively in + * order to provide guarantees on regaining control and not rely on call-return + * semantics, it is simpler to execute only the limited-scope rseq region. + * Thus, we target the start point. + * + * In case the abort handler does perform setup, we checkpoint and restore GPR + * register values. Memory should remain as it was, due to nop-ing of stores. + * + * XXX i#2350: We ignore the app's rseq flags for when to restart. It's + * possible the app disabled restarts on preempts and migrations and can't + * handle our restart here, but that seems pathological: we expect the rseq + * feature to be used for restarts rather than just a detection mechanism of + * preemption. 
+ */ + /* TODO i#2350: We may want to watch exits from the sequence in the instrumented + * run and do this native invocation on those as well and not just on completion + * of the rseq sequence, in case a store we nop-ed is read post-exit. We would + * flag all rseq exit ctis and process in dispatch or sthg. + */ + LOG(THREAD, LOG_INTERP, 3, "mangle: inserting native rseq code " PFX "\n", start); + RSTATS_INC(num_rseq_native_calls_inserted); + instr_t *insert_at = *next_instr; + + /* We assume that by making this a block end, clients will restore app state + * before this native invocation. + * TODO i#2350: Take some further action to better guarantee this in the face + * of future drreg optimizations, etc. Do we need new interface features, or + * do we live with a fake app jump or sthg? + */ + + /* Create a scratch register. Use slot 1 to avoid conflict with segment + * mangling below. + */ + if (SCRATCH_ALWAYS_TLS()) { + PRE(ilist, insert_at, + instr_create_save_to_tls(dcontext, scratch_reg, TLS_REG1_SLOT)); + } else { + PRE(ilist, insert_at, + instr_create_save_to_dcontext(dcontext, scratch_reg, REG1_OFFSET)); + } + /* Restore the entry state we preserved earlier. */ + if (reg_written_count > 0) { + if (SCRATCH_ALWAYS_TLS()) + insert_get_mcontext_base(dcontext, ilist, insert_at, scratch_reg); + else { + insert_mov_immed_ptrsz(dcontext, (ptr_int_t)dcontext, + opnd_create_reg(scratch_reg), ilist, insert_at, NULL, + NULL); + } + int i; + for (i = 0; i < DR_NUM_GPR_REGS; i++) { + if (reg_written[i]) { + size_t offs = offsetof(dcontext_t, rseq_entry_state) + sizeof(reg_t) * i; + PRE(ilist, insert_at, + XINST_CREATE_load(dcontext, + opnd_create_reg(DR_REG_START_GPR + (reg_id_t)i), + OPND_CREATE_MEMPTR(scratch_reg, offs))); + } + } + } + + instr_t *label_start = mangle_rseq_create_label(dcontext, DR_RSEQ_LABEL_START, 0); + instr_t *label_end = mangle_rseq_create_label(dcontext, DR_RSEQ_LABEL_END, 0); + instr_t *label_abort = mangle_rseq_create_label(dcontext, DR_RSEQ_LABEL_ABORT, 0); + + /* Create an exit cti targeting the abort handler, to use as our handler. + * XXX: I'd like to use DO_NOT_EMIT but that's not supported for regular + * blocks and so we pay the cost of a little jump. + * The jump helps with the signature data we embed in any case. + */ + instr_t *skip_abort = INSTR_CREATE_label(dcontext); + PRE(ilist, insert_at, + XINST_CREATE_jump_short(dcontext, opnd_create_instr(skip_abort))); + /* We have to put the signature just prior to the abort handler pc. */ + int signature = rseq_get_signature(); +# ifdef X86 + /* To avoid errors decoding in decode_fragment() or disassembly, we spend 3 + * extra bytes making it into a 7-byte nop instr on variable-length x86. + */ + instr_t *abort_sig = INSTR_CREATE_nop_modrm( + dcontext, opnd_create_base_disp(DR_REG_NULL, DR_REG_NULL, 0, signature, OPSZ_4)); +# else + /* XXX i#2350: This may still have trouble with decode_fragment() if it + * happens to look like a branch or invalid opcode. + */ + instr_t *abort_sig = INSTR_CREATE_nop(dcontext); + instr_allocate_raw_bits(dcontext, abort_sig, sizeof(signature)); + instr_set_raw_word(abort_sig, 0, (uint)signature); +# endif + PRE(ilist, insert_at, abort_sig); + PRE(ilist, insert_at, label_abort); + instrlist_preinsert(ilist, insert_at, + XINST_CREATE_jump(dcontext, opnd_create_pc(handler))); + PRE(ilist, insert_at, skip_abort); + + /* Point this thread's struct rseq ptr at an rseq_cs which points at the bounds + * of this fragment's region. 
We want to create a new rseq_cs, point at it + * here, and finalize it in mangle_rseq_finalize(), but allocating memory here + * leads to complexities freeing it and determinism complexities in various + * non-emitted-block scenarios, primarily translation. Thus we instead make a + * placeholder and patch it in mangle_rseq_finalize(). To ensure the immed will + * reach we always ask for a large immediate (paying the price of an extra zero + * write for the top half for aarchxx). + * + * An alternative is to embed the rseq_cs as data inside the fragment, next to + * the signature above. To ensure alignment, that would add 64 bytes to the + * cache and require extra intructions to do the alignment (if we got the + * address from an instr opnd), but the real problem is it messes up + * decode_fragment() and even disassembly. + */ + instr_t *immed_first, *immed_last; + insert_mov_immed_ptrsz(dcontext, (ptr_int_t)INT_MAX IF_X64(+1), + opnd_create_reg(scratch_reg), ilist, insert_at, &immed_first, + &immed_last); + ASSERT(immed_first != NULL); + IF_X86(ASSERT(immed_last == NULL)); + instr_t *label_rseq_cs = + mangle_rseq_create_label(dcontext, DR_RSEQ_LABEL_CS, immed_last == NULL ? 1 : 2); + PRE(ilist, immed_first /*prior to immeds*/, label_rseq_cs); + /* We need to mangle this segment ref, and all of the subsequent local copy. */ +# ifdef X86 + instr_t *start_mangling = XINST_CREATE_store( + dcontext, + opnd_create_far_base_disp(LIB_SEG_TLS, DR_REG_NULL, DR_REG_NULL, 0, + rseq_get_tls_ptr_offset(), OPSZ_PTR), + opnd_create_reg(scratch_reg)); +# else + /* TODO i#2350: Construct an app TLS access instruction for aarchxx. */ + ASSERT_NOT_IMPLEMENTED(false); + instr_t *start_mangling = INSTR_CREATE_label(dcontext); /* So it compiles. */ +# endif + instrlist_preinsert(ilist, insert_at, start_mangling); + + /* Restore scratch_reg. */ + if (SCRATCH_ALWAYS_TLS()) { + PRE(ilist, insert_at, + instr_create_restore_from_tls(dcontext, scratch_reg, TLS_REG1_SLOT)); + } else { + PRE(ilist, insert_at, + instr_create_restore_from_dcontext(dcontext, scratch_reg, REG1_OFFSET)); + } + + /* Make a local copy of the rseq code (otherwise we would have to assume that + * all rseq sequences are callees with a nice return to come back to us, which + * is not true). All exits become fragment exits; we assume all ind branches + * are exits. We mangle the code here (but do *not* pass to clients), which is + * required (rip-rel reachable, segments, stolen reg!) and safe (mostly loads, + * not stores, and restartable). + */ + app_pc pc = start; + PRE(ilist, insert_at, label_start); + while (pc < end) { + instr_t *copy = instr_create(dcontext); + pc = decode(dcontext, pc, copy); + if (pc == NULL) { + REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, + get_application_name(), get_application_pid(), + "Invalid instruction inside rseq region"); + ASSERT_NOT_REACHED(); + } + /* Make intra-region branches meta; all others are exit ctis. */ + if ((instr_is_cbr(copy) || instr_is_ubr(copy)) && + opnd_is_pc(instr_get_target(copy))) { + app_pc tgt = opnd_get_pc(instr_get_target(copy)); + if (tgt >= start && tgt < end) { + PRE(ilist, insert_at, copy); + continue; + } + } + instrlist_preinsert(ilist, insert_at, copy); + if (instr_is_mbr(copy)) { + /* We need to add the exit cti that interp adds for a regular block. + * We explicitly disallow targeting inside the sequence, but we have + * no way to easily verify that: if it happens we'll end up interpreting + * the target part of the sequence, leading to app errors. 
+ */ + uint exit_type = instr_branch_type(copy); + byte *pc = get_ibl_routine(dcontext, get_ibl_entry_type(exit_type), + TEST(FRAG_IS_TRACE, *flags) ? DEFAULT_IBL_TRACE() + : DEFAULT_IBL_BB(), + get_ibl_branch_type(copy)); + instr_t *exit = XINST_CREATE_jump(dcontext, opnd_create_pc(pc)); + instr_exit_branch_set_type(exit, exit_type); + instrlist_preinsert(ilist, insert_at, exit); + } + } + PRE(ilist, insert_at, label_end); + /* Now mangle from this point. */ + *next_instr = start_mangling; + + /* Clear the rseq ptr on exit to avoid problems if we free the rseq_cs and + * the kernel finds invalid addresses there and forces a SIGSEGV on us. + * For midpoint exits above, it's not easy to insert this clear directly. + * Instead, we rely on rseq_shared_fragment_flushtime_update() and + * rseq_remove_fragment() to clear the pointer before rseq_cs is freed. + */ +# ifdef X86 + instrlist_preinsert(ilist, insert_at, + XINST_CREATE_store(dcontext, + opnd_create_far_base_disp( + LIB_SEG_TLS, DR_REG_NULL, DR_REG_NULL, 0, + rseq_get_tls_ptr_offset(), OPSZ_PTR), + OPND_CREATE_INT32(0))); +# else + /* TODO i#2350: Construct an app TLS access instruction for aarchxx. */ + ASSERT_NOT_IMPLEMENTED(false); +# endif + + DOLOG(4, LOG_INTERP, { + LOG(THREAD, LOG_INTERP, 4, "New ilist for rseq:\n"); + instrlist_disassemble(dcontext, start, ilist, THREAD); + }); +} + +/* Returns whether it destroyed "instr". May modify next_instr. */ static bool -mangle_rseq(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, instr_t *next_instr) +mangle_rseq(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, + INOUT instr_t **next_instr, uint *flags INOUT) { int i; app_pc pc = get_app_instr_xl8(instr); @@ -849,6 +1227,8 @@ mangle_rseq(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, instr_t *n } int len = instr_length(dcontext, instr); if (pc + len >= end) { + ilist->flags |= INSTR_RSEQ_ENDPOINT; + *flags |= FRAG_HAS_RSEQ_ENDPOINT; if (pc + len != end) { REPORT_FATAL_ERROR_AND_EXIT( RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), @@ -863,131 +1243,19 @@ mangle_rseq(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, instr_t *n "Rseq sequences must fall through their endpoints"); ASSERT_NOT_REACHED(); } -# ifdef X86 - /* We just ran the instrumented version of the rseq code, with the stores - * removed. Now we need to invoke it again natively for real. We would prefer - * to invoke the abort handler, as it may perform some setup, but in too many - * cases it is truly an "abort" handler that just exits rather than a "restart - * handler". Furthermore, to support executing a copy of the code natively in - * order to provide guarantees on regaining control and not rely on call-return - * semantics, it is simpler to execute only the limited-scope rseq region. - * Thus, we target the start point. - * - * In case the abort handler does perform setup, we checkpoint and restore GPR - * register values. Memory should remain as it was, due to nop-ing of stores. - * - * XXX i#2350: We ignore the app's rseq flags for when to restart. It's - * possible the app disabled restarts on preempts and migrations and can't - * handle our restart here, but that seems pathological: we expect the rseq - * feature to be used for restarts rather than just a detection mechanism of - * preemption. - */ - LOG(THREAD, LOG_INTERP, 4, "mangle: inserting call to native rseq " PFX "\n", - start); - RSTATS_INC(num_rseq_native_calls_inserted); - - /* Create a scratch register. 
*/ - if (SCRATCH_ALWAYS_TLS()) { - PRE(ilist, next_instr, - instr_create_save_to_tls(dcontext, scratch_reg, TLS_REG0_SLOT)); - insert_get_mcontext_base(dcontext, ilist, next_instr, scratch_reg); - } else { - PRE(ilist, next_instr, - instr_create_save_to_dcontext(dcontext, scratch_reg, REG0_OFFSET)); - insert_mov_immed_ptrsz(dcontext, (ptr_int_t)dcontext, - opnd_create_reg(scratch_reg), ilist, next_instr, NULL, - NULL); - } - if (reg_written_count > 0) { - /* Restore the entry state we preserved earlier. */ - for (i = 0; i < DR_NUM_GPR_REGS; i++) { - if (reg_written[i]) { - size_t offs = - offsetof(dcontext_t, rseq_entry_state) + sizeof(reg_t) * i; - PRE(ilist, next_instr, - XINST_CREATE_load(dcontext, - opnd_create_reg(DR_REG_START_GPR + (reg_id_t)i), - OPND_CREATE_MEMPTR(scratch_reg, offs))); - } - } - } - - /* For simplicity in this first version of the code, we assume call-return - * semantics for the rseq region. We create an extra frame - * and assume that causes no problems. We assume the native invocation will - * come back to us. - * TODO i#2350: Make a local copy of the rseq code so we can arrange for a - * guaranteed return on (any) exit from the region, and use relative jumps to - * avoid needing a scratch register (though on x86 we could call through TLS). - * We would transform all mid-point exits into capture points. This gets rid - * of the call-return assumptions and the extra frame. - */ - instr_t check; - instr_init(dcontext, &check); - if (decode_cti(dcontext, end, &check) == NULL || !instr_is_return(&check)) { - REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, - get_application_name(), get_application_pid(), - "Rseq sequences must end with a return"); - ASSERT_NOT_REACHED(); - } - instr_free(dcontext, &check); - /* We assume that by making this a block end, clients will restore app state - * before this native invocation. - * TODO i#2350: Take some further action to better guarantee this in the face - * of future drreg optimizations, etc. Do we need new interface features, or - * do we live with a fake app jump or sthg? - */ - /* A direct call may not reach, so we need an indirect call. We use a TLS slot - * to avoid needing a dead register. - */ - insert_mov_immed_ptrsz(dcontext, (ptr_int_t)start, opnd_create_reg(scratch_reg), - ilist, next_instr, NULL, NULL); - if (SCRATCH_ALWAYS_TLS()) { - PRE(ilist, next_instr, - instr_create_save_to_tls(dcontext, scratch_reg, TLS_REG1_SLOT)); - } else { - PRE(ilist, next_instr, - instr_create_save_to_dcontext(dcontext, scratch_reg, REG1_OFFSET)); - } - /* Restore the scratch register. */ - if (SCRATCH_ALWAYS_TLS()) { - PRE(ilist, next_instr, - instr_create_restore_from_tls(dcontext, scratch_reg, TLS_REG0_SLOT)); + if (DYNAMO_OPTION(rseq_assume_call)) { + mangle_rseq_insert_call_sequence(dcontext, ilist, instr, *next_instr, flags, + start, end, handler, scratch_reg, + reg_written, reg_written_count); } else { - PRE(ilist, next_instr, - instr_create_restore_from_dcontext(dcontext, scratch_reg, REG0_OFFSET)); + mangle_rseq_insert_native_sequence(dcontext, ilist, instr, next_instr, flags, + start, end, handler, scratch_reg, + reg_written, reg_written_count); } - /* Set up the frame and stack alignment. We assume the rseq code was a leaf - * function and that rsp is 16-aligned now. - * TODO i#2350: If we stick with an extra call frame, it would be better to - * spill rsp and hard-align it using a bitmask to ensure alignment; however, - * see above where we hope to eliminate the call-return assumption altogether. 
- */ - instrlist_meta_preinsert(ilist, next_instr, - XINST_CREATE_sub(dcontext, opnd_create_reg(DR_REG_RSP), - OPND_CREATE_INT32(8))); - instrlist_meta_preinsert( - ilist, next_instr, - INSTR_CREATE_call_ind( - dcontext, - SCRATCH_ALWAYS_TLS() - ? opnd_create_tls_slot(os_tls_offset(TLS_REG1_SLOT)) - : opnd_create_dcontext_field(dcontext, REG1_OFFSET))); - instrlist_meta_preinsert(ilist, next_instr, - XINST_CREATE_add(dcontext, opnd_create_reg(DR_REG_RSP), - OPND_CREATE_INT32(8))); -# else - /* TODO i#2350: Add non-x86 support. We need to pay particular attention - * to the stolen register. If we do a local copy (with no callouts) we could - * mangle it. We also cannot do an indirect call through anything but a - * register and thus need a dead register for the call-return approach, but - * that disappears once DR uses a local copy. + /* TODO i#2350: We should also invoke the native sequence on a midpoint exit + * from the sequence during instrumentation, since there may be state changes + * in the early part that are visible outside. */ - REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), - get_application_pid(), - "Rseq is not yet supported for non-x86"); - ASSERT_NOT_REACHED(); -# endif } /* If we're inside a restartable sequence, this is the first run which is @@ -1012,13 +1280,58 @@ mangle_rseq(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, instr_t *n "Store inside rseq region has multiple destinations"); ASSERT_NOT_REACHED(); } - LOG(THREAD, LOG_INTERP, 4, "mangle: removing store inside rseq region @" PFX "\n", + LOG(THREAD, LOG_INTERP, 3, "mangle: removing store inside rseq region @" PFX "\n", pc); RSTATS_INC(num_rseq_stores_elided); instrlist_remove(ilist, instr); instr_destroy(dcontext, instr); return true; /* destroyed instr */ } + +static void +mangle_rseq_finalize(dcontext_t *dcontext, instrlist_t *ilist, fragment_t *f) +{ + if (DYNAMO_OPTION(rseq_assume_call)) + return; + instr_t *instr, *immed_first = NULL, *immed_last = NULL; + cache_pc pc = FCACHE_ENTRY_PC(f), immed_start_pc = NULL; + cache_pc rseq_start = NULL, rseq_end = NULL, rseq_abort = NULL; + for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) { + if (instr_is_label(instr) && + (instr_get_note(instr) == (void *)DR_NOTE_RSEQ || + TEST(INSTR_RSEQ_ENDPOINT, instr->flags))) { + dr_instr_label_data_t *label_data = instr_get_label_data_area(instr); + switch (label_data->data[0]) { + case DR_RSEQ_LABEL_START: rseq_start = pc; break; + case DR_RSEQ_LABEL_END: rseq_end = pc; break; + case DR_RSEQ_LABEL_ABORT: rseq_abort = pc; break; + case DR_RSEQ_LABEL_CS: + immed_start_pc = pc; + immed_first = instr_get_next(instr); + if (label_data->data[1] > 1) + immed_last = instr_get_next(immed_first); + break; + default: ASSERT_NOT_REACHED(); + } + } + pc += instr_length(dcontext, instr); + } + LOG(THREAD, LOG_INTERP, 4, "%s: start=" PFX ", end=" PFX ", abort=" PFX "\n", + __FUNCTION__, rseq_start, rseq_end, rseq_abort); + ASSERT(rseq_start != NULL && rseq_end != NULL && rseq_abort != NULL); + + byte *rseq_cs_alloc, *rseq_cs; + /* The rseq_cs creation and recording is structured like this in two steps to + * provide flexibility in mangling. Originally the alloc was done in mangle_rseq() + * and passed here in the label data, but to simplify freeing we now allocate here + * and patch the immediates. 
+ */ + rseq_cs_alloc = rseq_get_rseq_cs_alloc(&rseq_cs); + rseq_record_rseq_cs(rseq_cs_alloc, f, rseq_start, rseq_end, rseq_abort); + ASSERT(immed_start_pc != NULL && immed_first != NULL); + patch_mov_immed_ptrsz(dcontext, (ptr_int_t)rseq_cs, immed_start_pc, immed_first, + immed_last); +} #endif /* LINUX */ /* TOP-LEVEL MANGLE @@ -1120,7 +1433,7 @@ d_r_mangle(dcontext_t *dcontext, instrlist_t *ilist, uint *flags INOUT, bool man !vmvector_empty(d_r_rseq_areas)) { app_pc pc = get_app_instr_xl8(instr); if (vmvector_overlap(d_r_rseq_areas, pc, pc + 1)) { - if (mangle_rseq(dcontext, ilist, instr, next_instr)) + if (mangle_rseq(dcontext, ilist, instr, &next_instr, flags)) continue; /* instr was destroyed */ } } @@ -1303,7 +1616,7 @@ d_r_mangle(dcontext_t *dcontext, instrlist_t *ilist, uint *flags INOUT, bool man } #ifdef STEAL_REGISTER - if (ilist->flags) { + if (TESTANY(STEAL_REG_ILIST_FLAGS, ilist->flags)) { restore_state(dcontext, instr, ilist); /* end of edi calculation */ } #endif @@ -1355,10 +1668,12 @@ d_r_mangle(dcontext_t *dcontext, instrlist_t *ilist, uint *flags INOUT, bool man } #endif +#ifdef STEAL_REGISTER /* The following assertion should be guaranteed by fact that all * blocks end in some kind of branch, and the code above restores * the register state on a branch. */ - ASSERT(ilist->flags == 0); + ASSERT(!TESTANY(STEAL_REG_ILIST_FLAGS, ilist->flags)); +#endif KSTOP(mangling); } @@ -1495,6 +1810,20 @@ find_syscall_num(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr) return (int)syscall; } +void +mangle_finalize(dcontext_t *dcontext, instrlist_t *ilist, fragment_t *f) +{ +#ifdef X86 + if (TEST(FRAG_SELFMOD_SANDBOXED, f->flags)) { + finalize_selfmod_sandbox(dcontext, f); + } +#endif +#ifdef LINUX + if (TEST(INSTR_RSEQ_ENDPOINT, ilist->flags)) + mangle_rseq_finalize(dcontext, ilist, f); +#endif +} + /* END OF CONTROL-FLOW MANGLING ROUTINES *########################################################################### *########################################################################### diff --git a/core/arch/mangle_utils.c b/core/arch/mangle_utils.c index affdeaeaee8..0dcd5b4a3a7 100644 --- a/core/arch/mangle_utils.c +++ b/core/arch/mangle_utils.c @@ -1,5 +1,5 @@ /* ****************************************************************************** - * Copyright (c) 2010-2018 Google, Inc. All rights reserved. + * Copyright (c) 2010-2019 Google, Inc. All rights reserved. * Copyright (c) 2010 Massachusetts Institute of Technology All rights reserved. * Copyright (c) 2000-2010 VMware, Inc. All rights reserved. * ******************************************************************************/ diff --git a/core/arch/steal_reg.h b/core/arch/steal_reg.h index fde890fa748..167635b890d 100644 --- a/core/arch/steal_reg.h +++ b/core/arch/steal_reg.h @@ -1,4 +1,5 @@ /* ********************************************************** + * Copyright (c) 2019 Google, Inc. All rights reserved. * Copyright (c) 2000-2008 VMware, Inc. All rights reserved. * **********************************************************/ @@ -60,5 +61,10 @@ restore_state(dcontext_t *dcontext, instr_t *instr, instrlist_t *ilist); #define EDI_VAL_IN_MEM 0 #define EDI_VAL_IN_EBX 1 #define EDI_VAL_IN_EBX_AND_MEM 3 +/* XXX: We now store other flags in instrlist_t.flags. The steal reg code needs + * to be updated for that. However, it's been unused for so long perhaps it should + * just be deleted. 
+ */ +#define STEAL_REG_ILIST_FLAGS (EDI_VAL_IN_MEM | EDI_VAL_IN_EBX | EDI_VAL_IN_EBX_AND_MEM) #endif /* X86_STEAL_REG_H */ diff --git a/core/arch/x86/mangle.c b/core/arch/x86/mangle.c index ef820fe3aab..b03d794defe 100644 --- a/core/arch/x86/mangle.c +++ b/core/arch/x86/mangle.c @@ -1128,6 +1128,36 @@ insert_reachable_cti(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, * M A N G L I N G R O U T I N E S */ +/* Updates the immediates used by insert_mov_immed_arch() to used the value "val". + * The "first" and "last" from insert_mov_immed_arch() should be passed here, + * along with the encoded start pc of "first" as "pc". + * Keep this in sync with insert_mov_immed_arch(). + * This is *not* a hot-patchable patch: i.e., it is subject to races. + */ +void +patch_mov_immed_arch(dcontext_t *dcontext, ptr_int_t val, byte *pc, instr_t *first, + instr_t *last) +{ + byte *write_pc = vmcode_get_writable_addr(pc); + byte *immed_pc; + ASSERT(first != NULL); +# ifdef X64 + if (X64_MODE_DC(dcontext) && last != NULL) { + immed_pc = write_pc + instr_length(dcontext, first) - sizeof(int); + ATOMIC_4BYTE_WRITE(immed_pc, (int)val, NOT_HOT_PATCHABLE); + immed_pc = write_pc + instr_length(dcontext, first) + + instr_length(dcontext, last) - sizeof(int); + ATOMIC_4BYTE_WRITE(immed_pc, (int)(val >> 32), NOT_HOT_PATCHABLE); + } else { +# endif + immed_pc = write_pc + instr_length(dcontext, first) - sizeof(val); + ATOMIC_ADDR_WRITE(immed_pc, val, NOT_HOT_PATCHABLE); + ASSERT(last == NULL); +# ifdef X64 + } +# endif +} + #endif /* !STANDALONE_DECODER */ /* We export these mov/push utilities to drdecode */ @@ -1135,6 +1165,7 @@ insert_reachable_cti(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, * encode_estimate to determine whether > 32 bits or not: so if unsure where * it will be encoded, pass a high address) as the immediate; else * uses val. + * Keep this in sync with patch_mov_immed_arch(). */ void insert_mov_immed_arch(dcontext_t *dcontext, instr_t *src_inst, byte *encode_estimate, diff --git a/core/emit.c b/core/emit.c index dd49dee4a2b..8b32a2bd3ab 100644 --- a/core/emit.c +++ b/core/emit.c @@ -903,11 +903,8 @@ emit_fragment_common(dcontext_t *dcontext, app_pc tag, instrlist_t *ilist, uint } else { /* bb-only finalization */ } -#ifdef X86 - if ((flags & FRAG_SELFMOD_SANDBOXED) != 0) { - finalize_selfmod_sandbox(dcontext, f); - } -#endif + mangle_finalize(dcontext, ilist, f); + /* add fragment to vm area lists */ vm_area_add_fragment(dcontext, f, vmlist); diff --git a/core/fragment.c b/core/fragment.c index 033822c4b24..46d4056d1e2 100644 --- a/core/fragment.c +++ b/core/fragment.c @@ -3057,6 +3057,11 @@ fragment_delete(dcontext_t *dcontext, fragment_t *f, uint actions) release_recursive_lock(&change_linking_lock); } +#ifdef LINUX + if (TEST(FRAG_HAS_RSEQ_ENDPOINT, f->flags)) + rseq_remove_fragment(dcontext, f); +#endif + if (!TEST(FRAGDEL_NO_HTABLE, actions)) fragment_remove(dcontext, f); @@ -5434,6 +5439,9 @@ check_flush_queue(dcontext_t *dcontext, fragment_t *was_I_flushed) * actual shared flushing. 
*/ pt->flushtime_last_update < flushtime_global) { +#ifdef LINUX + rseq_shared_fragment_flushtime_update(dcontext); +#endif /* dec ref count on any pending shared areas */ not_flushed = not_flushed && vm_area_check_shared_pending(dcontext, was_I_flushed); diff --git a/core/fragment.h b/core/fragment.h index 50fc7ba2321..6a41ab208e8 100644 --- a/core/fragment.h +++ b/core/fragment.h @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2012-2017 Google, Inc. All rights reserved. + * Copyright (c) 2012-2019 Google, Inc. All rights reserved. * Copyright (c) 2000-2010 VMware, Inc. All rights reserved. * **********************************************************/ @@ -94,7 +94,15 @@ #define FRAG_IS_EMPTY_SLOT 0x020000 /* used by vmarea to distinguish fragment_t from its own multi unit struct */ #define FRAG_IS_EXTRA_VMAREA 0x040000 +/* If FRAG_IS_EXTRA_VMAREA is set, this value indicates this flag: */ #define FRAG_IS_EXTRA_VMAREA_INIT 0x080000 +#ifdef LINUX +/* If FRAG_IS_EXTRA_VMAREA is not set, this value indicates this flag, + * which labels the fragment as containing rseq data whose lifetime should + * match the fragment. + */ +# define FRAG_HAS_RSEQ_ENDPOINT 0x080000 +#endif #ifdef PROGRAM_SHEPHERDING /* indicates from memory that wasn't part of code from image on disk */ diff --git a/core/globals.h b/core/globals.h index 4360dfbe371..c5a4cbd8244 100644 --- a/core/globals.h +++ b/core/globals.h @@ -270,6 +270,7 @@ typedef struct _module_data_t module_data_t; # define DR_NOTE_FIRST_RESERVED 0xfffffff0UL #endif #define DR_NOTE_ANNOTATION (DR_NOTE_FIRST_RESERVED + 1) +#define DR_NOTE_RSEQ (DR_NOTE_FIRST_RESERVED + 2) /** * Structure written by dr_get_time() to specify the current time. diff --git a/core/optionsx.h b/core/optionsx.h index 0c77ba42e8d..098c444cd2d 100644 --- a/core/optionsx.h +++ b/core/optionsx.h @@ -1621,6 +1621,9 @@ OPTION_DEFAULT(uint, early_inject_location, 4 /* INJECT_LOCATION_LdrDefault */, OPTION_DEFAULT(bool, disable_rseq, false, "cause the restartable sequence SYS_rseq " "system call to return -ENOSYS as a workaround for rseq features not " "supportable by DR") + /* TODO i#2350: Remove this once we are sure of the stability of local copies. 
*/ + OPTION_DEFAULT(bool, rseq_assume_call, false, "assume rseq sequences are always " + "structured with function call interfaces") #endif #ifdef UNIX OPTION_DEFAULT(bool, restart_syscalls, true, diff --git a/core/unix/os.c b/core/unix/os.c index 11d8c675adb..f88011c4ba9 100644 --- a/core/unix/os.c +++ b/core/unix/os.c @@ -922,7 +922,8 @@ d_r_os_init(void) init_android_version(); #endif #ifdef LINUX - d_r_rseq_init(); + if (!standalone_library) + d_r_rseq_init(); #endif } @@ -1267,7 +1268,8 @@ void os_slow_exit(void) { #ifdef LINUX - d_r_rseq_exit(); + if (!standalone_library) + d_r_rseq_exit(); #endif d_r_signal_exit(); memquery_exit(); @@ -1850,7 +1852,8 @@ get_app_segment_base(uint seg) if (seg == SEG_CS || seg == SEG_SS || seg == SEG_DS || seg == SEG_ES) return NULL; #endif /* X86 */ - if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) { + if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false) && + first_thread_tls_initialized && !last_thread_tls_exited) { return d_r_get_tls(os_get_app_tls_base_offset(seg)); } return get_segment_base(seg); @@ -7559,13 +7562,15 @@ pre_system_call(dcontext_t *dcontext) #ifdef LINUX case SYS_rseq: + LOG(THREAD, LOG_VMAREAS | LOG_SYSCALLS, 2, "syscall: rseq " PFX " %d %d %d\n", + sys_param(dcontext, 0), sys_param(dcontext, 1), sys_param(dcontext, 2), + sys_param(dcontext, 3)); if (DYNAMO_OPTION(disable_rseq)) { set_failure_return_val(dcontext, ENOSYS); DODEBUG({ dcontext->expect_last_syscall_to_fail = true; }); execute_syscall = false; } else { - /* Lazy rseq handling. */ - rseq_locate_rseq_regions(); + dcontext->sys_param0 = sys_param(dcontext, 0); } break; #endif @@ -8590,6 +8595,14 @@ post_system_call(dcontext_t *dcontext) } } break; + + case SYS_rseq: + /* Lazy rseq handling. */ + if (success) { + rseq_process_syscall(dcontext); + rseq_locate_rseq_regions(); + } + break; #endif default: diff --git a/core/unix/os_exports.h b/core/unix/os_exports.h index 890b0cf3911..1985f29e3a5 100644 --- a/core/unix/os_exports.h +++ b/core/unix/os_exports.h @@ -541,6 +541,29 @@ extern vm_area_vector_t *d_r_rseq_areas; bool rseq_get_region_info(app_pc pc, app_pc *start OUT, app_pc *end OUT, app_pc *handler OUT, bool **reg_written OUT, int *reg_written_size OUT); + +int +rseq_get_tls_ptr_offset(void); + +int +rseq_get_signature(void); + +int +rseq_get_rseq_cs_alignment(void); + +byte * +rseq_get_rseq_cs_alloc(byte **rseq_cs_aligned OUT); + +/* The first parameter is the value returned by rseq_get_rseq_cs_alloc(). */ +void +rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc end, + cache_pc abort); +void +rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f); + +void +rseq_shared_fragment_flushtime_update(dcontext_t *dcontext); + #endif #endif /* _OS_EXPORTS_H_ */ diff --git a/core/unix/rseq_linux.c b/core/unix/rseq_linux.c index 864430c0281..54b1c35029c 100644 --- a/core/unix/rseq_linux.c +++ b/core/unix/rseq_linux.c @@ -46,6 +46,7 @@ #include "rseq_linux.h" #include "../fragment.h" #include "decode.h" +#include #ifdef HAVE_RSEQ # include #else @@ -59,7 +60,9 @@ struct rseq_cs { struct rseq { uint cpu_id_start; uint cpu_id; - uint64 ptr64; + union { + uint64 ptr64; + } rseq_cs; uint flags; } __attribute__((aligned(4 * sizeof(uint64)))); # define RSEQ_FLAG_UNREGISTER 1 @@ -72,6 +75,14 @@ DECLARE_CXTSWPROT_VAR(static mutex_t rseq_trigger_lock, INIT_LOCK_FREE(rseq_trigger_lock)); static volatile bool rseq_enabled; +/* We require all threads to use the same TLS offset to point at struct rseq. 
*/ +static int rseq_tls_offset; + +/* The signature is registered per thread, but we require all registrations + * to be the same. + */ +static int rseq_signature; + typedef struct _rseq_region_t { app_pc start; app_pc end; @@ -84,6 +95,14 @@ typedef struct _rseq_region_t { bool reg_written[DR_NUM_GPR_REGS]; } rseq_region_t; +/* We need to store a struct rseq_cs per fragment_t. To avoid the cost of adding a + * pointer field to every fragment_t, and the complexity of another subclass like + * trace_t, we store them externally in a hashtable. The FRAG_HAS_RSEQ_ENDPOINT flag + * avoids the hashtable lookup on every fragment. + */ +static generic_table_t *rseq_cs_table; +#define INIT_RSEQ_CS_TABLE_SIZE 5 + /* vmvector callbacks */ static void rseq_area_free(void *data) @@ -102,19 +121,37 @@ rseq_area_dup(void *data) return dst; } +static inline size_t +rseq_cs_alloc_size(void) +{ + return sizeof(struct rseq) + __alignof(struct rseq_cs); +} + +static void +rseq_cs_free(dcontext_t *dcontext, void *data) +{ + global_heap_free(data, rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER)); +} + void d_r_rseq_init(void) { VMVECTOR_ALLOC_VECTOR(d_r_rseq_areas, GLOBAL_DCONTEXT, VECTOR_SHARED | VECTOR_NEVER_MERGE, rseq_areas); vmvector_set_callbacks(d_r_rseq_areas, rseq_area_free, rseq_area_dup, NULL, NULL); + + rseq_cs_table = generic_hash_create(GLOBAL_DCONTEXT, INIT_RSEQ_CS_TABLE_SIZE, 80, + HASHTABLE_SHARED | HASHTABLE_PERSISTENT, + rseq_cs_free _IF_DEBUG("rseq_cs table")); + /* Enable rseq pre-attach for things like dr_prepopulate_cache(). */ if (rseq_is_registered_for_current_thread()) - rseq_enabled = true; + rseq_locate_rseq_regions(); } void d_r_rseq_exit(void) { + generic_hash_destroy(GLOBAL_DCONTEXT, rseq_cs_table); vmvector_delete_vector(GLOBAL_DCONTEXT, d_r_rseq_areas); DELETE_LOCK(rseq_trigger_lock); } @@ -150,6 +187,79 @@ rseq_get_region_info(app_pc pc, app_pc *start OUT, app_pc *end OUT, app_pc *hand return true; } +int +rseq_get_tls_ptr_offset(void) +{ + /* This read is assumed to be atomic. */ + ASSERT(rseq_tls_offset != 0); + return rseq_tls_offset + offsetof(struct rseq, rseq_cs); +} + +static void +rseq_clear_tls_ptr(dcontext_t *dcontext) +{ + byte *base = get_segment_base(LIB_SEG_TLS); + struct rseq *app_rseq = (struct rseq *)(base + rseq_tls_offset); + /* We're directly writing this in the cache, so we do not bother with safe_read + * or safe_write here either. We already cannot handle rseq adversarial cases. + */ + if (is_dynamo_address((byte *)(ptr_uint_t)app_rseq->rseq_cs.ptr64)) + app_rseq->rseq_cs.ptr64 = 0; +} + +int +rseq_get_signature(void) +{ + /* This read is assumed to be atomic. 
*/ + return rseq_signature; +} + +byte * +rseq_get_rseq_cs_alloc(byte **rseq_cs_aligned OUT) +{ + byte *rseq_cs_alloc = global_heap_alloc(rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER)); + *rseq_cs_aligned = (byte *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs)); + return rseq_cs_alloc; +} + +void +rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc end, + cache_pc abort) +{ + struct rseq_cs *target = + (struct rseq_cs *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs)); + target->version = 0; + target->flags = 0; + target->start_ip = (ptr_uint_t)start; + target->post_commit_offset = (ptr_uint_t)(end - start); + target->abort_ip = (ptr_uint_t)abort; + TABLE_RWLOCK(rseq_cs_table, write, lock); + generic_hash_add(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f, rseq_cs_alloc); + TABLE_RWLOCK(rseq_cs_table, write, unlock); +} + +void +rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f) +{ + /* Avoid freeing a live rseq_cs for a thread-private fragment deletion. */ + rseq_clear_tls_ptr(dcontext); + TABLE_RWLOCK(rseq_cs_table, write, lock); + generic_hash_remove(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f); + TABLE_RWLOCK(rseq_cs_table, write, unlock); +} + +void +rseq_shared_fragment_flushtime_update(dcontext_t *dcontext) +{ + /* Avoid freeing a live rseq_cs for thread-shared fragment deletion. + * We clear the pointer on completion of the native rseq execution, but it's + * not easy to clear it on midpoint exits. We instead clear prior to + * rseq_cs being freed: for thread-private in rseq_remove_fragment() and for + * thread-shared each thread should come here prior to deletion. + */ + rseq_clear_tls_ptr(dcontext); +} + bool rseq_is_registered_for_current_thread(void) { @@ -251,6 +361,27 @@ rseq_process_entry(struct rseq_cs *entry, ssize_t load_offs) info->start = (app_pc)(ptr_uint_t)entry->start_ip + load_offs; info->end = info->start + entry->post_commit_offset; info->handler = (app_pc)(ptr_uint_t)entry->abort_ip + load_offs; + int signature; + if (!d_r_safe_read(info->handler - sizeof(signature), sizeof(signature), + &signature)) { + REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), + get_application_pid(), + "Rseq signature is unreadable"); + ASSERT_NOT_REACHED(); + } + if (signature != rseq_signature) { + if (rseq_signature == 0) { + SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); + ATOMIC_4BYTE_WRITE(&rseq_signature, signature, false); + SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); + LOG(GLOBAL, LOG_LOADER, 2, "Rseq signature is 0x%08x\n", rseq_signature); + } else { + REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, + get_application_name(), get_application_pid(), + "Rseq signatures are not all identical"); + ASSERT_NOT_REACHED(); + } + } rseq_analyze_instructions(info); vmvector_add(d_r_rseq_areas, info->start, info->end, (void *)info); RSTATS_INC(num_rseq_regions); @@ -435,12 +566,91 @@ rseq_process_module(module_area_t *ma, bool at_map) return res; } +static int +rseq_locate_tls_offset(void) +{ + /* We assume (and document) that the loader's static TLS is used, so every thread + * has a consistent %fs:-offs address. Unfortunately, using a local copy of the + * rseq code for our non-instrumented execution requires us to locate the app's + * struct using heuristics, because the system call was poorly designed and will not + * let us replace the app's. Alternatives of no local copy have worse problems. + */ + /* Static TLS is at a negative offset from the app library segment base. 
We simply + * search all possible aligned slots. Typically there are <64 possible slots. + */ + int offset = 0; + byte *addr = get_app_segment_base(LIB_SEG_TLS); + byte *seg_bottom; + if (addr > 0 && get_memory_info(addr, &seg_bottom, NULL, NULL)) { + LOG(GLOBAL, LOG_LOADER, 3, "rseq within static TLS " PFX " - " PFX "\n", + seg_bottom, addr); + /* struct rseq_cs is aligned to 32. */ + int alignment = __alignof(struct rseq_cs); + int i; + for (i = 0; addr - i * alignment >= seg_bottom; i++) { + byte *try_addr = addr - i * alignment; + ASSERT(try_addr >= seg_bottom); /* For loop guarantees this. */ + /* Our strategy is to check all of the aligned static TLS addresses to + * find the registered one. Our caller is not supposed to call here + * until the app has registered the current thread. + */ + static const int RSEQ_RARE_SIGNATURE = 42; + int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), + RSEQ_FLAG_UNREGISTER, RSEQ_RARE_SIGNATURE); + LOG(GLOBAL, LOG_LOADER, 3, "Tried rseq @ " PFX " => %d\n", try_addr, res); + if (res == -EINVAL) /* Our struct != registered struct. */ + continue; + /* We expect -EPERM on a signature mismatch. On the small chance the app + * actually used 42 for its signature we'll have to re-register it. + */ + if (res == 0) { + int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), 0, + RSEQ_RARE_SIGNATURE); + ASSERT(res == 0); + res = -EPERM; + } + if (res == -EPERM) { + /* Found it! */ + LOG(GLOBAL, LOG_LOADER, 2, + "Found struct rseq @ " PFX " for thread => %s:-0x%x\n", try_addr, + get_register_name(LIB_SEG_TLS), i * alignment); + offset = -i * alignment; + } + break; + } + } + return offset; +} + +void +rseq_process_syscall(dcontext_t *dcontext) +{ + byte *seg_base = get_app_segment_base(LIB_SEG_TLS); + byte *app_addr = (byte *)dcontext->sys_param0; + if (rseq_tls_offset == 0) { + SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); + int offset = app_addr - seg_base; + ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false); + SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); + LOG(GLOBAL, LOG_LOADER, 2, + "Observed struct rseq @ " PFX " for thread => %s:-0x%x\n", app_addr, + get_register_name(LIB_SEG_TLS), -rseq_tls_offset); + } else if (seg_base + rseq_tls_offset != app_addr) { + REPORT_FATAL_ERROR_AND_EXIT( + RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(), + "struct rseq is not always in static thread-local storage"); + ASSERT_NOT_REACHED(); + } +} + /* Restartable sequence region identification. * * To avoid extra overhead going to disk to read section headers, we delay looking * for rseq data until the app invokes an rseq syscall (or on attach we see a thread * that has rseq set up). We document that we do not handle the app using rseq * regions for non-rseq purposes, so we do not need to flush the cache here. + * Since we also identify the rseq_cs address here, this should be called *after* + * the app has registered the current thread for rseq. */ void rseq_locate_rseq_regions(void) @@ -455,8 +665,25 @@ rseq_locate_rseq_regions(void) d_r_mutex_unlock(&rseq_trigger_lock); return; } + + int offset = 0; + if (rseq_tls_offset == 0) { + /* Identify the TLS offset of this thread's struct rseq. 
*/ + offset = rseq_locate_tls_offset(); + if (offset == 0) { + REPORT_FATAL_ERROR_AND_EXIT( + RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), + get_application_pid(), + "struct rseq is not in static thread-local storage"); + ASSERT_NOT_REACHED(); + } + } + SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); - rseq_enabled = true; + bool new_value = true; + ATOMIC_1BYTE_WRITE(&rseq_enabled, new_value, false); + if (rseq_tls_offset == 0) + ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false); SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); module_iterator_t *iter = module_iterator_start(); @@ -471,6 +698,7 @@ rseq_locate_rseq_regions(void) void rseq_module_init(module_area_t *ma, bool at_map) { - if (rseq_enabled) + if (rseq_enabled) { rseq_process_module(ma, at_map); + } } diff --git a/core/unix/rseq_linux.h b/core/unix/rseq_linux.h index ba24cd0533f..a4863e4cc8a 100644 --- a/core/unix/rseq_linux.h +++ b/core/unix/rseq_linux.h @@ -63,4 +63,7 @@ rseq_locate_rseq_regions(void); void rseq_module_init(module_area_t *ma, bool at_map); +void +rseq_process_syscall(dcontext_t *dcontext); + #endif /* _RSEQ_H_ */ diff --git a/suite/tests/linux/rseq.c b/suite/tests/linux/rseq.c index 6f8ab8f4870..1405d611fbf 100644 --- a/suite/tests/linux/rseq.c +++ b/suite/tests/linux/rseq.c @@ -60,67 +60,92 @@ #define STRINGIFY(x) EXPANDSTR(x) #define RSEQ_SIG 0x90909090 /* nops to disasm nicely */ +#ifdef RSEQ_TEST_USE_OLD_SECTION_NAME +# define RSEQ_SECTION_NAME "__rseq_table" +#else +# define RSEQ_SECTION_NAME "__rseq_cs" +#endif + +#define RSEQ_ADD_TABLE_ENTRY(name, start, end, abort) \ + ".pushsection " RSEQ_SECTION_NAME ", \"aw\"\n\t" \ + ".balign 32\n\t" \ + "rseq_cs_" #name ":\n\t" \ + ".long 0, 0\n\t" /* version, flags */ \ + ".quad " #start ", " #end " - " #start ", " #abort "\n\t" \ + ".popsection\n\t" RSEQ_ADD_ARRAY_ENTRY(rseq_cs_##name) + +#if !defined(RSEQ_TEST_USE_OLD_SECTION_NAME) && !defined(RSEQ_TEST_USE_NO_ARRAY) +# define RSEQ_ADD_ARRAY_ENTRY(label) \ + ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \ + ".quad " #label "\n\t" \ + ".popsection\n\t" +#else +# define RSEQ_ADD_ARRAY_ENTRY(label) /* Nothing. */ +#endif /* This cannot be a stack-local variable, as the kernel will force SIGSEGV * if it can't read this struct. And for multiple threads it should be in TLS. */ static __thread volatile struct rseq rseq_tls; +/* Make it harder to find rseq_tls for DR's heuristic by adding more static TLS. */ +static __thread volatile struct rseq fill_up_tls[128]; #ifdef RSEQ_TEST_ATTACH static volatile int exit_requested; static void *thread_ready; #endif -int -test_rseq(void) +static volatile int sigill_count; + +static void +signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt) +{ + if (sig == SIGILL) + ++sigill_count; +} + +static void +test_rseq_call_once(bool force_restart_in, int *completions_out, int *restarts_out) { /* We use static to avoid stack reference issues with our extra frame inside the asm. */ static __u32 id = RSEQ_CPU_ID_UNINITIALIZED; - static int restarts = 0; + static int completions; + static int restarts; + static volatile int force_restart; + completions = 0; + restarts = 0; + force_restart = force_restart_in; + sigill_count = 0; __asm__ __volatile__( -#ifdef RSEQ_TEST_USE_OLD_SECTION_NAME - /* Add a table entry. 
*/ - ".pushsection __rseq_table, \"aw\"\n\t" -#else - ".pushsection __rseq_cs, \"aw\"\n\t" -#endif - ".balign 32\n\t" - "1:\n\t" - ".long 0, 0\n\t" /* version, flags */ - ".quad 2f, 3f-2f, 4f\n\t" /* start_ip, post_commit_offset, abort_ip */ - ".popsection\n\t" -#if !defined(RSEQ_TEST_USE_OLD_SECTION_NAME) && !defined(RSEQ_TEST_USE_NO_ARRAY) - /* Add an array entry. */ - ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" - ".quad 1b\n\t" - ".popsection\n\t" -#endif + RSEQ_ADD_TABLE_ENTRY(simple, 2f, 3f, 4f) - /* Although our abort handler has to handle being called (that's all DR - * supports), we structure the code to allow directly calling past it, to - * count restart_count. + /* In the past DR only supported an rseq sequence structured as a call-return + * with an abort handler that always restarted. We keep that structure here + * as a test of that pattern, though we now support other patterns. */ "call 6f\n\t" "jmp 5f\n\t" "6:\n\t" /* Store the entry into the ptr. */ - "leaq 1b(%%rip), %%rax\n\t" - "movq %%rax, %0\n\t" + "leaq rseq_cs_simple(%%rip), %%rax\n\t" + "movq %%rax, %[rseq_tls]\n\t" /* Test a register input to the sequence. */ - "mov %3, %%rax\n\t" + "movl %[cpu_id], %%eax\n\t" /* Test "falling into" the rseq region. */ - /* Restartable sequence. - * If I pause in gdb in here, often the thread is migrated and the abort - * handler invoked: a simple way to test a restart natively. - */ + /* Restartable sequence. */ "2:\n\t" - "mov %%rax, %1\n\t" + "movl %%eax, %[id]\n\t" /* Test clobbering an input register. */ - "mov %4, %%rax\n\t" - "addl $1, %2\n\t" + "movl %[cpu_id_uninit], %%eax\n\t" + /* Test a restart in the middle of the sequence via ud2a SIGILL. */ + "cmpb $0, %[force_restart]\n\t" + "jz 7f\n\t" + "ud2a\n\t" + "7:\n\t" + "addl $1, %[completions]\n\t" /* Post-commit. */ "3:\n\t" @@ -130,18 +155,120 @@ test_rseq(void) /* clang-format off */ /* (avoid indenting next few lines) */ ".long " STRINGIFY(RSEQ_SIG) "\n\t" "4:\n\t" + "addl $1, %[restarts]\n\t" + "movb $0, %[force_restart_write]\n\t" "jmp 6b\n\t" /* Clear the ptr. */ "5:\n\t" - "movq $0, %0\n\t" + "movq $0, %[rseq_tls]\n\t" /* clang-format on */ - : "=m"(rseq_tls.rseq_cs), "=m"(id), "=m"(restarts) - : "m"(rseq_tls.cpu_id), "i"(RSEQ_CPU_ID_UNINITIALIZED) + : [rseq_tls] "=m"(rseq_tls.rseq_cs), [id] "=m"(id), + [completions] "=m"(completions), [restarts] "=m"(restarts), + [force_restart_write] "=m"(force_restart) + : [cpu_id] "m"(rseq_tls.cpu_id), [cpu_id_uninit] "i"(RSEQ_CPU_ID_UNINITIALIZED), + [force_restart] "m"(force_restart) : "rax", "memory"); assert(id != RSEQ_CPU_ID_UNINITIALIZED); - return restarts; + *completions_out = completions; + *restarts_out = restarts; +} + +static void +test_rseq_call(void) +{ + int completions, restarts; + sigill_count = 0; + test_rseq_call_once(false, &completions, &restarts); + /* There *could* have been a migration restart. */ + assert(completions == 1 && sigill_count == 0); + test_rseq_call_once(true, &completions, &restarts); + assert(completions == 1 && restarts > 0 && sigill_count == 1); +} + +static void +test_rseq_branches_once(bool force_restart, int *completions_out, int *restarts_out) +{ + /* We use static to avoid stack reference issues with our extra frame inside the asm. + */ + __u32 id = RSEQ_CPU_ID_UNINITIALIZED; + int completions = 0; + int restarts = 0; + __asm__ __volatile__( + /* clang-format off */ /* (avoid indenting next few lines) */ + RSEQ_ADD_TABLE_ENTRY(branches, 2f, 3f, 4f) + /* clang-format on */ + + "6:\n\t" + /* Store the entry into the ptr. 
*/
+        "leaq rseq_cs_branches(%%rip), %%rax\n\t"
+        "movq %%rax, %[rseq_tls]\n\t"
+        /* Test a register input to the sequence. */
+        "movl %[cpu_id], %%eax\n\t"
+        /* Test "falling into" the rseq region. */
+
+        /* Restartable sequence. We include control flow to test a
+         * complex sequence with midpoint branches, but no exits.
+         * TODO i#2350: Support for exits has not yet been added and
+         * once finished separate tests will be added.
+         */
+        "2:\n\t"
+        "movl %%eax, %[id]\n\t"
+        "mov $0, %%rax\n\t"
+        "cmp $0, %%rax\n\t"
+        "je 11f\n\t"
+        "mov $4, %%rcx\n\t"
+        "11:\n\t"
+        "cmp $1, %%rax\n\t"
+        "je 12f\n\t"
+        "cmp $2, %%rax\n\t"
+        "je 13f\n\t"
+        /* Test a restart via ud2a SIGILL. */
+        "cmpb $0, %[force_restart]\n\t"
+        "jz 7f\n\t"
+        "ud2a\n\t"
+        "7:\n\t"
+        "addl $1, %[completions]\n\t"
+
+        /* Post-commit. */
+        "3:\n\t"
+        "jmp 5f\n\t"
+
+        /* Abort handler. */
+        /* clang-format off */ /* (avoid indenting next few lines) */
+        ".long " STRINGIFY(RSEQ_SIG) "\n\t"
+        "4:\n\t"
+        "addl $1, %[restarts]\n\t"
+        "movb $0, %[force_restart_write]\n\t"
+        "jmp 6b\n\t"
+
+        /* Clear the ptr. */
+        "13:\n\t"
+        "12:\n\t"
+        "5:\n\t"
+        "movq $0, %[rseq_tls]\n\t"
+        /* clang-format on */
+
+        : [rseq_tls] "=m"(rseq_tls.rseq_cs), [id] "=m"(id),
+          [completions] "=m"(completions), [restarts] "=m"(restarts),
+          [force_restart_write] "=m"(force_restart)
+        : [cpu_id] "m"(rseq_tls.cpu_id), [cpu_id_uninit] "i"(RSEQ_CPU_ID_UNINITIALIZED),
+          [force_restart] "m"(force_restart)
+        : "rax", "rcx", "rdx", "memory");
+    assert(id != RSEQ_CPU_ID_UNINITIALIZED);
+    *completions_out = completions;
+    *restarts_out = restarts;
+}
+
+static void
+test_rseq_branches(void)
+{
+    int completions, restarts;
+    sigill_count = 0;
+    test_rseq_branches_once(false, &completions, &restarts);
+    /* There *could* have been a migration restart. */
+    assert(completions == 1 && sigill_count == 0);
+    test_rseq_branches_once(true, &completions, &restarts);
+    assert(completions == 1 && restarts > 0 && sigill_count == 1);
 }
 
 #ifdef RSEQ_TEST_ATTACH
@@ -158,29 +285,14 @@ rseq_thread_loop(void *arg)
         return NULL;
     static int zero;
     __asm__ __volatile__(
-        /* Add a table entry. */
-        ".pushsection __rseq_cs, \"aw\"\n\t"
-        ".balign 32\n\t"
-        "1:\n\t"
-        ".long 0, 0\n\t" /* version, flags */
-        ".quad 2f, 3f-2f, 4f\n\t" /* start_ip, post_commit_offset, abort_ip */
-        ".popsection\n\t"
-        /* Add an array entry. */
-        ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"
-        ".quad 1b\n\t"
-        ".popsection\n\t"
-
-        /* Although our abort handler has to handle being called (that's all DR
-         * supports), we structure the code to allow directly calling past it, to
-         * count restart_count.
-         */
-        "call 6f\n\t"
-        "jmp 5f\n\t"
+        /* clang-format off */ /* (avoid indenting next few lines) */
+        RSEQ_ADD_TABLE_ENTRY(thread, 2f, 3f, 4f)
+        /* clang-format on */
 
         "6:\n\t"
         /* Store the entry into the ptr. */
-        "leaq 1b(%%rip), %%rax\n\t"
-        "movq %%rax, %0\n\t"
+        "leaq rseq_cs_thread(%%rip), %%rax\n\t"
+        "movq %%rax, %[rseq_tls]\n\t"
        /* Test "falling into" the rseq region. */
 
         /* Restartable sequence. We loop to ensure we're in the region on
@@ -195,32 +307,31 @@ rseq_thread_loop(void *arg)
         /* I was going to assert that zero==0 at the end, but that requires more
          * synch to not reach here natively before DR attaches. Decided against it.
          */
-        "movl $1, %1\n\t"
+        "movl $1, %[zero]\n\t"
         "jmp 2b\n\t"
         /* We can't end the sequence in a branch (DR can't handle it). */
         "nop\n\t"
 
         /* Post-commit. */
         "3:\n\t"
-        "ret\n\t"
+        "jmp 5f\n\t"
 
         /* Abort handler: if we're done, exit; else, re-enter.
*/ /* clang-format off */ /* (avoid indenting next few lines) */ ".long " STRINGIFY(RSEQ_SIG) "\n\t" "4:\n\t" - "mov %2, %%rax\n\t" + "mov %[exit_requested], %%rax\n\t" "cmp $0, %%rax\n\t" "jne 3b\n\t" "jmp 6b\n\t" /* Clear the ptr. */ "5:\n\t" - "leaq 1b(%%rip), %%rax\n\t" - "movq $0, %0\n\t" + "movq $0, %[rseq_tls]\n\t" /* clang-format on */ - : "=m"(rseq_tls.rseq_cs), "=m"(zero) - : "m"(exit_requested) + : [rseq_tls] "=m"(rseq_tls.rseq_cs), [zero] "=m"(zero) + : [exit_requested] "m"(exit_requested) : "rax", "memory"); return NULL; } @@ -229,7 +340,7 @@ rseq_thread_loop(void *arg) int main() { - int restart_count = 0; + intercept_signal(SIGILL, signal_handler, false); rseq_tls.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; int res = syscall(SYS_rseq, &rseq_tls, sizeof(rseq_tls), 0, RSEQ_SIG); if (res == 0) { @@ -242,7 +353,13 @@ main() wait_cond_var(thread_ready); dr_app_setup_and_start(); #endif - restart_count = test_rseq(); + test_rseq_call(); + /* Test variations inside the sequence. */ + test_rseq_branches(); + /* Test a trace. */ + int i; + for (i = 0; i < 200; i++) + test_rseq_branches(); #ifdef RSEQ_TEST_ATTACH /* Detach while the thread is in its rseq region loop. */ exit_requested = 1; /* atomic on x86; ARM will need more. */ @@ -253,11 +370,7 @@ main() } else { /* Linux kernel 4.18+ is required. */ assert(errno == ENOSYS); - /* Make the test pass. */ - restart_count = 1; } - /* We expect 0 restart_count natively (ok, tiny chance of >0), and 1 under DR. */ - print("Saw %s restarts\n", restart_count > 0 ? "some" : "no"); print("All done\n"); return 0; } diff --git a/suite/tests/linux/rseq.expect b/suite/tests/linux/rseq.expect index 3ae711b1a48..d918cd25e54 100644 --- a/suite/tests/linux/rseq.expect +++ b/suite/tests/linux/rseq.expect @@ -1,2 +1 @@ -Saw some restarts All done From 7906b7f1cf827cc7f8cd979e652a5553369bbe18 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 11 Sep 2019 00:53:58 -0400 Subject: [PATCH 2/4] Do not clobber TLS when rseq is not enabled --- core/unix/rseq_linux.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/unix/rseq_linux.c b/core/unix/rseq_linux.c index 54b1c35029c..25ec00e9e69 100644 --- a/core/unix/rseq_linux.c +++ b/core/unix/rseq_linux.c @@ -198,6 +198,7 @@ rseq_get_tls_ptr_offset(void) static void rseq_clear_tls_ptr(dcontext_t *dcontext) { + ASSERT(rseq_tls_offset != 0); byte *base = get_segment_base(LIB_SEG_TLS); struct rseq *app_rseq = (struct rseq *)(base + rseq_tls_offset); /* We're directly writing this in the cache, so we do not bother with safe_read @@ -241,6 +242,8 @@ rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc void rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f) { + if (!rseq_enabled) + return; /* Avoid freeing a live rseq_cs for a thread-private fragment deletion. */ rseq_clear_tls_ptr(dcontext); TABLE_RWLOCK(rseq_cs_table, write, lock); @@ -251,6 +254,8 @@ rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f) void rseq_shared_fragment_flushtime_update(dcontext_t *dcontext) { + if (!rseq_enabled) + return; /* Avoid freeing a live rseq_cs for thread-shared fragment deletion. * We clear the pointer on completion of the native rseq execution, but it's * not easy to clear it on midpoint exits. We instead clear prior to From 51d5961b950ade527cb9a41d1071d8f7ed544b92 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 11 Sep 2019 17:45:02 -0400 Subject: [PATCH 3/4] Address reviewer comments. 
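
Among the review changes: the first-observed rseq TLS offset in
rseq_process_syscall() is now recorded with an atomic exchange rather than a
plain store, so racing threads cannot silently disagree about the offset.
Below is a rough standalone illustration of that check, not the DR
implementation: names are hypothetical, C11 atomics stand in for the internal
atomic exchange, and the first-observer case is folded in.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Illustrative sketch only: names here are hypothetical, not DR internals. */
    static atomic_int g_rseq_tls_offset; /* 0 means "not yet observed". */

    /* Returns false if some thread registered struct rseq at a different
     * static-TLS offset, which this design treats as unsupported.
     */
    static bool
    record_rseq_offset(const char *seg_base, const char *app_rseq_addr)
    {
        int offset = (int)(app_rseq_addr - seg_base);
        /* Whoever exchanges first wins; every later thread must agree. */
        int prior = atomic_exchange(&g_rseq_tls_offset, offset);
        return prior == 0 || prior == offset;
    }

Any disagreement between threads is reported to the user as unsupported rseq
usage rather than being silently mishandled.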
--- api/docs/bt.dox | 2 +- core/arch/arch_exports.h | 2 +- core/arch/x86/mangle.c | 2 +- core/unix/rseq_linux.c | 12 +++++++++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/api/docs/bt.dox b/api/docs/bt.dox index d3a806f83b5..7418e52e577 100644 --- a/api/docs/bt.dox +++ b/api/docs/bt.dox @@ -1322,7 +1322,7 @@ This run-twice approach is subject to the following limitations: stack pointer register is not supported. - Each rseq region's code must end with a fall-through (non-control-flow) instruction. -- No indirect branches that do not exit the rseq region are allowed inside. +- Indirect branches that do not exit the rseq region are not allowed. - Each rseq region must be entered only from the top, with no branches from outside the region targeting a point inside the region. - No system calls are allowed inside rseq regions. diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index 34084044f9a..b2e4d1d8c07 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -1664,7 +1664,7 @@ d_r_decode_init(void); (FRAG_IS_32(flags) ? STUB_COARSE_DIRECT_SIZE32 : STUB_COARSE_DIRECT_SIZE64) /* Writes nops into the address range. - * XXX: Better to use the newer multi-byte nops. + * XXX i#3828: Better to use the newer multi-byte nops. */ # define SET_TO_NOPS(isa_mode, addr, size) memset(addr, 0x90, size) /* writes debugbreaks into the address range */ diff --git a/core/arch/x86/mangle.c b/core/arch/x86/mangle.c index b03d794defe..1241d209054 100644 --- a/core/arch/x86/mangle.c +++ b/core/arch/x86/mangle.c @@ -1128,7 +1128,7 @@ insert_reachable_cti(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, * M A N G L I N G R O U T I N E S */ -/* Updates the immediates used by insert_mov_immed_arch() to used the value "val". +/* Updates the immediates used by insert_mov_immed_arch() to use the value "val". * The "first" and "last" from insert_mov_immed_arch() should be passed here, * along with the encoded start pc of "first" as "pc". * Keep this in sync with insert_mov_immed_arch(). diff --git a/core/unix/rseq_linux.c b/core/unix/rseq_linux.c index 25ec00e9e69..eaf61317075 100644 --- a/core/unix/rseq_linux.c +++ b/core/unix/rseq_linux.c @@ -211,7 +211,8 @@ rseq_clear_tls_ptr(dcontext_t *dcontext) int rseq_get_signature(void) { - /* This read is assumed to be atomic. */ + /* This is only called after rseq is initialized and the signature determined. */ + ASSERT(rseq_enabled); return rseq_signature; } @@ -632,15 +633,20 @@ rseq_process_syscall(dcontext_t *dcontext) { byte *seg_base = get_app_segment_base(LIB_SEG_TLS); byte *app_addr = (byte *)dcontext->sys_param0; + bool constant_offset = false; if (rseq_tls_offset == 0) { SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); int offset = app_addr - seg_base; - ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false); + /* To handle races here, we use an atomic_exchange. 
*/ + int prior = atomic_exchange_int(&rseq_tls_offset, offset); SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); + constant_offset = (prior == offset); LOG(GLOBAL, LOG_LOADER, 2, "Observed struct rseq @ " PFX " for thread => %s:-0x%x\n", app_addr, get_register_name(LIB_SEG_TLS), -rseq_tls_offset); - } else if (seg_base + rseq_tls_offset != app_addr) { + } else + constant_offset = (seg_base + rseq_tls_offset == app_addr); + if (!constant_offset) { REPORT_FATAL_ERROR_AND_EXIT( RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(), "struct rseq is not always in static thread-local storage"); From d49012ae8b5b83924ec088014eb550c511f49e00 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 11 Sep 2019 19:15:38 -0400 Subject: [PATCH 4/4] Fix incorrect complaint about a zero offset --- core/unix/rseq_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unix/rseq_linux.c b/core/unix/rseq_linux.c index eaf61317075..0395e50fb4b 100644 --- a/core/unix/rseq_linux.c +++ b/core/unix/rseq_linux.c @@ -640,7 +640,7 @@ rseq_process_syscall(dcontext_t *dcontext) /* To handle races here, we use an atomic_exchange. */ int prior = atomic_exchange_int(&rseq_tls_offset, offset); SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); - constant_offset = (prior == offset); + constant_offset = (prior == 0 || prior == offset); LOG(GLOBAL, LOG_LOADER, 2, "Observed struct rseq @ " PFX " for thread => %s:-0x%x\n", app_addr, get_register_name(LIB_SEG_TLS), -rseq_tls_offset);