diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c index 389657f63cd..c07fb43b1da 100644 --- a/core/arch/aarchxx/mangle.c +++ b/core/arch/aarchxx/mangle.c @@ -202,7 +202,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, #endif if (cci == NULL) cci = &default_clean_call_info; - if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_XMM_REGS) { + if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_SIMD_REGS) { /* FIXME i#1551: once we add skipping of regs, need to keep shape here */ } /* FIXME i#1551: once we have cci->num_xmms_skip, skip this if possible */ diff --git a/core/arch/arch.c b/core/arch/arch.c index 0f10749263e..d082432db3f 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3379,7 +3379,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) #ifdef X86 if (preserve_xmm_caller_saved()) { int i, j; - for (i=0; inum_xmms_used = NUM_XMM_REGS; - for (i = 0; i < NUM_XMM_REGS; i++) + ci->num_xmms_used = NUM_SIMD_REGS; + for (i = 0; i < NUM_SIMD_REGS; i++) ci->xmm_used[i] = true; for (i = 0; i < NUM_GP_REGS; i++) ci->reg_used[i] = true; @@ -453,7 +453,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) uint i, num_regparm; ci->num_xmms_used = 0; - memset(ci->xmm_used, 0, sizeof(bool) * NUM_XMM_REGS); + memset(ci->xmm_used, 0, sizeof(bool) * NUM_SIMD_REGS); memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS); ci->write_aflags = false; for (instr = instrlist_first(ilist); @@ -466,7 +466,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) * impact unless there are a lot of different clean call callees. */ /* XMM registers usage */ - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { if (!ci->xmm_used[i] && instr_uses_reg(instr, (DR_REG_XMM0 + (reg_id_t)i))) { LOG(THREAD, LOG_CLEANCALL, 2, @@ -1060,7 +1060,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci) callee_info_t *info = cci->callee_info; /* 1. xmm registers */ - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { if (info->xmm_used[i]) { cci->xmm_skip[i] = false; } else { @@ -1071,7 +1071,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci) cci->num_xmms_skip++; } } - if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_xmms_skip != NUM_XMM_REGS) + if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_xmms_skip != NUM_SIMD_REGS) cci->should_align = false; /* 2. general purpose registers */ /* set regs not to be saved for clean call */ @@ -1213,7 +1213,7 @@ analyze_clean_call_inline(dcontext_t *dcontext, clean_call_info_t *cci) } } } - if (cci->num_xmms_skip == NUM_XMM_REGS) { + if (cci->num_xmms_skip == NUM_SIMD_REGS) { STATS_INC(cleancall_xmm_skipped); } if (cci->skip_save_aflags) { @@ -1306,7 +1306,7 @@ insert_inline_reg_save(dcontext_t *dcontext, clean_call_info_t *cci, insert_get_mcontext_base(dcontext, ilist, where, ci->spill_reg); /* Save used registers. */ - ASSERT(cci->num_xmms_skip == NUM_XMM_REGS); + ASSERT(cci->num_xmms_skip == NUM_SIMD_REGS); for (i = 0; i < NUM_GP_REGS; i++) { if (!cci->reg_skip[i]) { reg_id_t reg_id = DR_REG_XAX + (reg_id_t)i; diff --git a/core/arch/x86/emit_utils.c b/core/arch/x86/emit_utils.c index 1cfcb68108d..f1a0c53e84e 100644 --- a/core/arch/x86/emit_utils.c +++ b/core/arch/x86/emit_utils.c @@ -1303,7 +1303,7 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) int i; uint opcode = move_mm_reg_opcode(true/*align32*/, true/*align16*/); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i=0; ipreserve_mcontext || cci->num_xmms_skip != NUM_XMM_REGS) { + if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_SIMD_REGS) { int offs = XMM_SLOTS_SIZE + PRE_XMM_PADDING; if (cci->preserve_mcontext && cci->skip_save_aflags) { offs_beyond_xmm = 2*XSP_SZ; /* pc and flags */ @@ -361,7 +361,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, */ uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 16), ALIGNED(alignment, 32)); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i=0; ixmm_skip[i]) { PRE(ilist, instr, instr_create_1dst_1src (dcontext, opcode, @@ -504,7 +504,7 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, * is better. */ uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 32), ALIGNED(alignment, 16)); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i=0; ixmm_skip[i]) { PRE(ilist, instr, instr_create_1dst_1src (dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i), diff --git a/core/arch/x86/x86_asm_defines.asm b/core/arch/x86/x86_asm_defines.asm index 62553ee0470..f7b28773bd3 100644 --- a/core/arch/x86/x86_asm_defines.asm +++ b/core/arch/x86/x86_asm_defines.asm @@ -47,18 +47,18 @@ */ #ifdef X64 # ifdef WINDOWS -# define NUM_XMM_SLOTS 6 /* xmm0-5 */ +# define NUM_SIMD_SLOTS 6 /* xmm0-5 */ # else -# define NUM_XMM_SLOTS 16 /* xmm0-15 */ +# define NUM_SIMD_SLOTS 16 /* xmm0-15 */ # endif # define PRE_XMM_PADDING 16 #else -# define NUM_XMM_SLOTS 8 /* xmm0-7 */ +# define NUM_SIMD_SLOTS 8 /* xmm0-7 */ # define PRE_XMM_PADDING 24 #endif #define XMM_SAVED_REG_SIZE 32 /* for ymm */ /* xmm0-5/7/15 for PR 264138/i#139/PR 302107 */ -#define XMM_SAVED_SIZE ((NUM_XMM_SLOTS)*(XMM_SAVED_REG_SIZE)) +#define XMM_SAVED_SIZE ((NUM_SIMD_SLOTS)*(XMM_SAVED_REG_SIZE)) #ifdef X64 /* push GPR registers in priv_mcontext_t order. does NOT make xsp have a diff --git a/core/lib/globals_shared.h b/core/lib/globals_shared.h index 4891ee47ad1..dfa299fbe81 100644 --- a/core/lib/globals_shared.h +++ b/core/lib/globals_shared.h @@ -1789,7 +1789,7 @@ typedef union _dr_ymm_t { reg_t reg[IF_X64_ELSE(4,8)]; /**< Representation as 4 or 8 registers. */ } dr_ymm_t; -#ifdef AARCHXX +#if defined(AARCHXX) /** * 128-bit ARM SIMD Vn register. * In AArch64, align to 16 bytes for better performance. @@ -1818,28 +1818,38 @@ typedef union _dr_simd_t { # define NUM_SIMD_SLOTS 16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t */ # endif # define PRE_SIMD_PADDING 0 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ -#endif /* ARM */ -#ifdef AVOID_API_EXPORT +#elif defined(X86) + +# ifdef AVOID_API_EXPORT /* If this is increased, you'll probably need to increase the size of * inject_into_thread's buf and INTERCEPTION_CODE_SIZE (for Windows). - * Also, update NUM_XMM_SLOTS in x86.asm and get_xmm_caller_saved. + * Also, update NUM_SIMD_SLOTS in x86.asm and get_xmm_caller_saved. * i#437: YMM is an extension of XMM from 128-bit to 256-bit without * adding new ones, so code operating on XMM often also operates on YMM, * and thus some *XMM* macros also apply to *YMM*. */ -#endif -#ifdef X64 -# ifdef WINDOWS -# define NUM_XMM_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-5*/ +# endif +# ifdef X64 +# ifdef WINDOWS + /*xmm0-5*/ +# define NUM_SIMD_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */ +# else + /*xmm0-15*/ +# define NUM_SIMD_SLOTS 16 /**< Number of [xy]mm reg slots in dr_mcontext_t */ +# endif +# define PRE_XMM_PADDING 16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ # else -# define NUM_XMM_SLOTS 16 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-15*/ + /*xmm0-7*/ +# define NUM_SIMD_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */ +# define PRE_XMM_PADDING 24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ # endif -# define PRE_XMM_PADDING 16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ + +# define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */ + #else -# define NUM_XMM_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-7*/ -# define PRE_XMM_PADDING 24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ -#endif +# error NYI +#endif /* AARCHXX/X86 */ /** Values for the flags field of dr_mcontext_t */ typedef enum { @@ -1910,10 +1920,10 @@ typedef struct _priv_mcontext_t { * have noticable impacts, i.e. pushing bbs over the max size limit, * and could have a noticeable performance hit. */ -/* We now save everything but we keep separate NUM_XMM_SLOTS vs NUM_XMM_SAVED +/* We now save everything but we keep separate NUM_SIMD_SLOTS vs NUM_SIMD_SAVED * in case we go back to not saving some slots in the future: e.g., w/o * CLIENT_INTERFACE we could control our own libs enough to avoid some saves. */ -#define NUM_XMM_SAVED NUM_XMM_SLOTS +#define NUM_SIMD_SAVED NUM_SIMD_SLOTS #endif /* ifndef _GLOBALS_SHARED_H_ */ diff --git a/core/lib/instrument.c b/core/lib/instrument.c index 78b65ff7d64..a3fa8c52824 100644 --- a/core/lib/instrument.c +++ b/core/lib/instrument.c @@ -5166,7 +5166,7 @@ dr_insert_clean_call_ex_varg(void *drcontext, instrlist_t *ilist, instr_t *where cci.num_xmms_skip = 6; #else /* all 8 (or 16) are scratch */ - cci.num_xmms_skip = NUM_XMM_REGS; + cci.num_xmms_skip = NUM_SIMD_REGS; #endif for (i=0; ixstate_hdr.xstate_bv = (((uint64)bv_high)<<32) | bv_low; } - for (i=0; isw_reserved.xstate_bv)); LOG(THREAD, LOG_ASYNCH, 1, "\txstate_bv = 0x"HEX64_FORMAT_STRING"\n", xstate->xstate_hdr.xstate_bv); - for (i=0; iymmh.ymmh_space[i*4+j]); @@ -424,7 +424,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) sigcontext_t *sc = sc_full->sc; if (sc->fpstate != NULL) { int i; - for (i=0; iymm[i], &sc->fpstate->IF_X64_ELSE(xmm_space[i*4],_xmm[i]), XMM_REG_SIZE); } @@ -436,7 +436,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate)); ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv)); - for (i=0; iymm[i].u32[4], &xstate->ymmh.ymmh_space[i*4], YMMH_REG_SIZE); } @@ -451,7 +451,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) sigcontext_t *sc = sc_full->sc; if (sc->fpstate != NULL) { int i; - for (i=0; ifpstate->IF_X64_ELSE(xmm_space[i*4],_xmm[i]), &mc->ymm[i], XMM_REG_SIZE); } @@ -463,7 +463,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) */ ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate)); ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv)); - for (i=0; iymmh.ymmh_space[i*4], &mc->ymm[i].u32[4], YMMH_REG_SIZE); } diff --git a/core/unix/signal_macos.c b/core/unix/signal_macos.c index 15eeb5ee0db..00e4c06d5bf 100644 --- a/core/unix/signal_macos.c +++ b/core/unix/signal_macos.c @@ -154,11 +154,11 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ sigcontext_t *sc = sc_full->sc; int i; - for (i=0; iymm[i], &sc->__fs.__fpu_xmm0 + i, XMM_REG_SIZE); } if (YMM_ENABLED()) { - for (i=0; iymm[i].u32[4], &sc->__fs.__fpu_ymmh0 + i, YMMH_REG_SIZE); } } @@ -169,11 +169,11 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) { sigcontext_t *sc = sc_full->sc; int i; - for (i=0; i__fs.__fpu_xmm0 + i, &mc->ymm[i], XMM_REG_SIZE); } if (YMM_ENABLED()) { - for (i=0; i__fs.__fpu_ymmh0 + i, &mc->ymm[i].u32[4], YMMH_REG_SIZE); } } @@ -200,7 +200,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc) *((ushort *)(&sc->__fs.__fpu_stmm0 + i) + j)); LOG(THREAD, LOG_ASYNCH, 1, "\n"); } - for (i=0; iContextFlags)) { int i, j; byte *ymmh_area; - for (i=0; iContextFlags))); - for (i = 0; i < NUM_XMM_SLOTS; i++) { + for (i = 0; i < NUM_SIMD_SLOTS; i++) { for (j = 0; j < IF_X64_ELSE(2,4); j++) { *bufptr++ = CXT_XMM(cxt, i)->reg[j]; } diff --git a/core/win32/ntdll.c b/core/win32/ntdll.c index 15c53cfeb64..e40171ec2e4 100644 --- a/core/win32/ntdll.c +++ b/core/win32/ntdll.c @@ -1157,7 +1157,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt) /* no harm done if no sse support */ /* CONTEXT_FLOATING_POINT or CONTEXT_EXTENDED_REGISTERS */ int i; - for (i = 0; i < NUM_XMM_SLOTS; i++) + for (i = 0; i < NUM_SIMD_SLOTS; i++) memcpy(&mcontext->ymm[i], CXT_XMM(cxt, i), XMM_REG_SIZE); } /* if XSTATE is NOT set, the app has NOT used any ymm state and @@ -1167,7 +1167,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt) byte *ymmh_area = context_ymmh_saved_area(cxt); if (ymmh_area != NULL) { int i; - for (i = 0; i < NUM_XMM_SLOTS; i++) { + for (i = 0; i < NUM_SIMD_SLOTS; i++) { memcpy(&mcontext->ymm[i].u32[4], &YMMH_AREA(ymmh_area, i).u32[0], YMMH_REG_SIZE); @@ -1259,7 +1259,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg) memcpy(&cxt->ExtendedRegisters, fpstate, written); #endif /* Now update w/ the xmm values from mcontext */ - for (i = 0; i < NUM_XMM_SLOTS; i++) + for (i = 0; i < NUM_SIMD_SLOTS; i++) memcpy(CXT_XMM(cxt, i), &mcontext->ymm[i], XMM_REG_SIZE); } if (CONTEXT_PRESERVE_YMM && TESTALL(CONTEXT_XSTATE, cxt->ContextFlags)) { @@ -1289,7 +1289,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg) memcpy(&YMMH_AREA(ymmh_area, 6).u32[0], &ymms[0].u32[4], YMMH_REG_SIZE); memcpy(&YMMH_AREA(ymmh_area, 7).u32[0], &ymms[1].u32[4], YMMH_REG_SIZE); #endif - for (i = 0; i < NUM_XMM_SLOTS; i++) { + for (i = 0; i < NUM_SIMD_SLOTS; i++) { memcpy(&YMMH_AREA(ymmh_area, i).u32[0], &mcontext->ymm[i].u32[4], YMMH_REG_SIZE); diff --git a/suite/tests/client-interface/inline.dll.c b/suite/tests/client-interface/inline.dll.c index 22cfdeba111..034319b6a5d 100644 --- a/suite/tests/client-interface/inline.dll.c +++ b/suite/tests/client-interface/inline.dll.c @@ -349,7 +349,7 @@ mcontexts_equal(dr_mcontext_t *mc_a, dr_mcontext_t *mc_b, int func_index) /* Only look at the initialized bits of the SSE regs. */ ymm_bytes_used = (proc_has_feature(FEATURE_AVX) ? 32 : 16); - for (i = 0; i < NUM_XMM_SLOTS; i++) { + for (i = 0; i < NUM_SIMD_SLOTS; i++) { if (memcmp(&mc_a->ymm[i], &mc_b->ymm[i], ymm_bytes_used) != 0) return false; } @@ -375,7 +375,7 @@ dump_diff_mcontexts(void) } dr_fprintf(STDERR, "Printing XMM regs:\n"); - for (i = 0; i < NUM_XMM_SLOTS; i++) { + for (i = 0; i < NUM_SIMD_SLOTS; i++) { dr_ymm_t before_reg = before_mcontext.ymm[i]; dr_ymm_t after_reg = after_mcontext.ymm[i]; size_t mmsz = proc_has_feature(FEATURE_AVX) ? sizeof(dr_xmm_t) : diff --git a/suite/tests/linux/sigcontext.c b/suite/tests/linux/sigcontext.c index 4479c8aec41..9ce30647b5a 100644 --- a/suite/tests/linux/sigcontext.c +++ b/suite/tests/linux/sigcontext.c @@ -44,10 +44,10 @@ #include #ifdef X64 -# define NUM_XMM_REGS 16 +# define NUM_SIMD_REGS 16 # define XAX "rax" #else -# define NUM_XMM_REGS 8 +# define NUM_SIMD_REGS 8 # define XAX "eax" #endif #define INTS_PER_XMM 4 @@ -66,7 +66,7 @@ signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt) * fpstate with xmm inside on delayed signals */ struct _fpstate *fp = (struct _fpstate *) ucxt->uc_mcontext.fpregs; - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { print("xmm[%d] = 0x%x 0x%x 0x%x 0x%x\n", i, #ifdef X64 fp->xmm_space[i*4], fp->xmm_space[i*4+1], @@ -98,7 +98,7 @@ signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt) if (xstate->fpstate.sw_reserved.magic1 == FP_XSTATE_MAGIC1) { assert(xstate->fpstate.sw_reserved.xstate_size >= sizeof(*xstate)); /* we can't print b/c not all processors have avx */ - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { #if VERBOSE print("ymmh[%d] = 0x%x 0x%x 0x%x 0x%x\n", i, xstate->ymmh.ymmh_space[i*4], @@ -143,7 +143,7 @@ determine_avx(void) int main(int argc, char *argv[]) { - int buf[INTS_PER_XMM*NUM_XMM_REGS]; + int buf[INTS_PER_XMM*NUM_SIMD_REGS]; char *ptr = (char *)buf; int i, j; @@ -152,7 +152,7 @@ main(int argc, char *argv[]) print("Sending SIGUSR1\n"); /* put known values in xmm regs (we assume processor has xmm) */ - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { for (j = 0; j < INTS_PER_XMM; j++) buf[i*INTS_PER_XMM+j] = 0xdeadbeef << i; } @@ -180,12 +180,12 @@ main(int argc, char *argv[]) if (determine_avx()) { /* put known values in ymm regs */ - int buf[INTS_PER_YMM*NUM_XMM_REGS]; + int buf[INTS_PER_YMM*NUM_SIMD_REGS]; char *ptr = (char *)buf; int i, j; intercept_signal(SIGUSR2, signal_handler, false); /* put known values in xmm regs (we assume processor has xmm) */ - for (i = 0; i < NUM_XMM_REGS; i++) { + for (i = 0; i < NUM_SIMD_REGS; i++) { for (j = 0; j < INTS_PER_YMM; j++) buf[i*INTS_PER_YMM+j] = 0xdeadbeef << i; }