diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c
index 389657f63cd..c07fb43b1da 100644
--- a/core/arch/aarchxx/mangle.c
+++ b/core/arch/aarchxx/mangle.c
@@ -202,7 +202,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
 #endif
     if (cci == NULL)
         cci = &default_clean_call_info;
-    if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_XMM_REGS) {
+    if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_SIMD_REGS) {
         /* FIXME i#1551: once we add skipping of regs, need to keep shape here */
     }
     /* FIXME i#1551: once we have cci->num_xmms_skip, skip this if possible */
diff --git a/core/arch/arch.c b/core/arch/arch.c
index 0f10749263e..d082432db3f 100644
--- a/core/arch/arch.c
+++ b/core/arch/arch.c
@@ -3379,7 +3379,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml)
 #ifdef X86
     if (preserve_xmm_caller_saved()) {
         int i, j;
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             if (YMM_ENABLED()) {
                 print_file(f, dump_xml ? "\t\tymm%d= \"0x" : "\tymm%d= 0x", i);
                 for (j = 0; j < 8; j++) {
diff --git a/core/arch/arch.h b/core/arch/arch.h
index 1b757d66a8e..45b10a648af 100644
--- a/core/arch/arch.h
+++ b/core/arch/arch.h
@@ -324,7 +324,7 @@ typedef enum {
 # define SHARED_GENCODE_MATCH_THREAD(dc) get_shared_gencode(dc)
 #endif
 
-#define NUM_XMM_REGS  NUM_XMM_SAVED
+#define NUM_SIMD_REGS NUM_SIMD_SAVED
 #define NUM_GP_REGS   DR_NUM_GPR_REGS
 
 /* Information about each individual clean call invocation site.
@@ -340,7 +340,7 @@ typedef struct _clean_call_info_t {
     bool skip_save_aflags;
     bool skip_clear_eflags;
     uint num_xmms_skip;
-    bool xmm_skip[NUM_XMM_REGS];
+    bool xmm_skip[NUM_SIMD_REGS];
     uint num_regs_skip;
     bool reg_skip[NUM_GP_REGS];
     bool preserve_mcontext; /* even if skip reg save, preserve mcontext shape */
@@ -1253,7 +1253,7 @@ typedef struct _callee_info_t {
     app_pc bwd_tgt;           /* earliest backward branch target */
     app_pc fwd_tgt;           /* last forward branch target */
     int num_xmms_used;        /* number of xmms used by callee */
-    bool xmm_used[NUM_XMM_REGS];  /* xmm/ymm registers usage */
+    bool xmm_used[NUM_SIMD_REGS]; /* xmm/ymm registers usage */
     bool reg_used[NUM_GP_REGS];   /* general purpose registers usage */
     int num_callee_save_regs; /* number of regs callee saved */
     bool callee_save_regs[NUM_GP_REGS]; /* callee-save registers */
diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h
index 8d694dc7a34..fbd3fc8f8b8 100644
--- a/core/arch/arch_exports.h
+++ b/core/arch/arch_exports.h
@@ -59,12 +59,12 @@
 # define XMM_REG_SIZE  16
 # define YMM_REG_SIZE  32
 # define XMM_SAVED_REG_SIZE  YMM_REG_SIZE /* space in priv_mcontext_t for xmm/ymm */
-# define XMM_SLOTS_SIZE  (NUM_XMM_SLOTS*XMM_SAVED_REG_SIZE)
-# define XMM_SAVED_SIZE  (NUM_XMM_SAVED*XMM_SAVED_REG_SIZE)
+# define XMM_SLOTS_SIZE  (NUM_SIMD_SLOTS*XMM_SAVED_REG_SIZE)
+# define XMM_SAVED_SIZE  (NUM_SIMD_SAVED*XMM_SAVED_REG_SIZE)
 /* Indicates OS support, not just processor support (xref i#1278) */
 # define YMM_ENABLED() (proc_avx_enabled())
 # define YMMH_REG_SIZE (YMM_REG_SIZE/2) /* upper half */
-# define YMMH_SAVED_SIZE (NUM_XMM_SLOTS*YMMH_REG_SIZE)
+# define YMMH_SAVED_SIZE (NUM_SIMD_SLOTS*YMMH_REG_SIZE)
 #endif /* X86 */
 
 /* Number of slots for spills from inlined clean calls. */
diff --git a/core/arch/x86/clean_call_opt.c b/core/arch/x86/clean_call_opt.c
index 2c37285eaba..875b5efb1f2 100644
--- a/core/arch/x86/clean_call_opt.c
+++ b/core/arch/x86/clean_call_opt.c
@@ -80,8 +80,8 @@ callee_info_init(callee_info_t *ci)
      * but then later in analyze_callee_regs_usage, we have to use the loop.
      */
     /* assuming all xmm registers are used */
-    ci->num_xmms_used = NUM_XMM_REGS;
-    for (i = 0; i < NUM_XMM_REGS; i++)
+    ci->num_xmms_used = NUM_SIMD_REGS;
+    for (i = 0; i < NUM_SIMD_REGS; i++)
         ci->xmm_used[i] = true;
     for (i = 0; i < NUM_GP_REGS; i++)
         ci->reg_used[i] = true;
@@ -453,7 +453,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
     uint i, num_regparm;
 
     ci->num_xmms_used = 0;
-    memset(ci->xmm_used, 0, sizeof(bool) * NUM_XMM_REGS);
+    memset(ci->xmm_used, 0, sizeof(bool) * NUM_SIMD_REGS);
     memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS);
     ci->write_aflags = false;
     for (instr  = instrlist_first(ilist);
@@ -466,7 +466,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
          * impact unless there are a lot of different clean call callees.
          */
         /* XMM registers usage */
-        for (i = 0; i < NUM_XMM_REGS; i++) {
+        for (i = 0; i < NUM_SIMD_REGS; i++) {
             if (!ci->xmm_used[i] &&
                 instr_uses_reg(instr, (DR_REG_XMM0 + (reg_id_t)i))) {
                 LOG(THREAD, LOG_CLEANCALL, 2,
@@ -1060,7 +1060,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci)
     callee_info_t *info = cci->callee_info;
 
     /* 1. xmm registers */
-    for (i = 0; i < NUM_XMM_REGS; i++) {
+    for (i = 0; i < NUM_SIMD_REGS; i++) {
         if (info->xmm_used[i]) {
             cci->xmm_skip[i] = false;
         } else {
@@ -1071,7 +1071,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci)
             cci->num_xmms_skip++;
         }
     }
-    if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_xmms_skip != NUM_XMM_REGS)
+    if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_xmms_skip != NUM_SIMD_REGS)
         cci->should_align = false;
     /* 2. general purpose registers */
     /* set regs not to be saved for clean call */
@@ -1213,7 +1213,7 @@ analyze_clean_call_inline(dcontext_t *dcontext, clean_call_info_t *cci)
                 }
             }
         }
-        if (cci->num_xmms_skip == NUM_XMM_REGS) {
+        if (cci->num_xmms_skip == NUM_SIMD_REGS) {
             STATS_INC(cleancall_xmm_skipped);
         }
         if (cci->skip_save_aflags) {
@@ -1306,7 +1306,7 @@ insert_inline_reg_save(dcontext_t *dcontext, clean_call_info_t *cci,
     insert_get_mcontext_base(dcontext, ilist, where, ci->spill_reg);
 
     /* Save used registers. */
-    ASSERT(cci->num_xmms_skip == NUM_XMM_REGS);
+    ASSERT(cci->num_xmms_skip == NUM_SIMD_REGS);
     for (i = 0; i < NUM_GP_REGS; i++) {
         if (!cci->reg_skip[i]) {
             reg_id_t reg_id = DR_REG_XAX + (reg_id_t)i;
diff --git a/core/arch/x86/emit_utils.c b/core/arch/x86/emit_utils.c
index 1cfcb68108d..f1a0c53e84e 100644
--- a/core/arch/x86/emit_utils.c
+++ b/core/arch/x86/emit_utils.c
@@ -1303,7 +1303,7 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
         int i;
         uint opcode = move_mm_reg_opcode(true/*align32*/, true/*align16*/);
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             APP(ilist, instr_create_1dst_1src
                 (dcontext, opcode, opnd_create_reg
                  (REG_SAVED_XMM0 + (reg_id_t)i),
@@ -1521,7 +1521,7 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
         int i;
         uint opcode = move_mm_reg_opcode(true/*align32*/, true/*align16*/);
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             APP(ilist, instr_create_1dst_1src
                 (dcontext, opcode,
                  OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
diff --git a/core/arch/x86/mangle.c b/core/arch/x86/mangle.c
index 0cb6581a927..5cd3512671a 100644
--- a/core/arch/x86/mangle.c
+++ b/core/arch/x86/mangle.c
@@ -335,7 +335,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     int  offs_beyond_xmm = 0;
     if (cci == NULL)
         cci = &default_clean_call_info;
-    if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_XMM_REGS) {
+    if (cci->preserve_mcontext || cci->num_xmms_skip != NUM_SIMD_REGS) {
         int offs = XMM_SLOTS_SIZE + PRE_XMM_PADDING;
         if (cci->preserve_mcontext && cci->skip_save_aflags) {
             offs_beyond_xmm = 2*XSP_SZ; /* pc and flags */
@@ -361,7 +361,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
          */
         uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 16), ALIGNED(alignment, 32));
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             if (!cci->xmm_skip[i]) {
                 PRE(ilist, instr, instr_create_1dst_1src
                     (dcontext, opcode,
@@ -504,7 +504,7 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
          * is better. */
         uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 32), ALIGNED(alignment, 16));
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             if (!cci->xmm_skip[i]) {
                 PRE(ilist, instr, instr_create_1dst_1src
                     (dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i),
diff --git a/core/arch/x86/x86_asm_defines.asm b/core/arch/x86/x86_asm_defines.asm
index 62553ee0470..f7b28773bd3 100644
--- a/core/arch/x86/x86_asm_defines.asm
+++ b/core/arch/x86/x86_asm_defines.asm
@@ -47,18 +47,18 @@
  */
 #ifdef X64
 # ifdef WINDOWS
-#  define NUM_XMM_SLOTS 6 /* xmm0-5 */
+#  define NUM_SIMD_SLOTS 6 /* xmm0-5 */
 # else
-#  define NUM_XMM_SLOTS 16 /* xmm0-15 */
+#  define NUM_SIMD_SLOTS 16 /* xmm0-15 */
 # endif
 # define PRE_XMM_PADDING 16
 #else
-# define NUM_XMM_SLOTS 8 /* xmm0-7 */
+# define NUM_SIMD_SLOTS 8 /* xmm0-7 */
 # define PRE_XMM_PADDING 24
 #endif
 #define XMM_SAVED_REG_SIZE 32 /* for ymm */
 /* xmm0-5/7/15 for PR 264138/i#139/PR 302107 */
-#define XMM_SAVED_SIZE ((NUM_XMM_SLOTS)*(XMM_SAVED_REG_SIZE))
+#define XMM_SAVED_SIZE ((NUM_SIMD_SLOTS)*(XMM_SAVED_REG_SIZE))
 
 #ifdef X64
 /* push GPR registers in priv_mcontext_t order.  does NOT make xsp have a
diff --git a/core/lib/globals_shared.h b/core/lib/globals_shared.h
index 4891ee47ad1..dfa299fbe81 100644
--- a/core/lib/globals_shared.h
+++ b/core/lib/globals_shared.h
@@ -1789,7 +1789,7 @@ typedef union _dr_ymm_t {
     reg_t  reg[IF_X64_ELSE(4,8)]; /**< Representation as 4 or 8 registers. */
 } dr_ymm_t;
 
-#ifdef AARCHXX
+#if defined(AARCHXX)
 /**
  * 128-bit ARM SIMD Vn register.
  * In AArch64, align to 16 bytes for better performance.
@@ -1818,28 +1818,38 @@ typedef union _dr_simd_t {
 #  define NUM_SIMD_SLOTS 16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t */
 # endif
 # define PRE_SIMD_PADDING 0 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
-#endif /* ARM */
 
-#ifdef AVOID_API_EXPORT
+#elif defined(X86)
+
+# ifdef AVOID_API_EXPORT
 /* If this is increased, you'll probably need to increase the size of
  * inject_into_thread's buf and INTERCEPTION_CODE_SIZE (for Windows).
- * Also, update NUM_XMM_SLOTS in x86.asm and get_xmm_caller_saved.
+ * Also, update NUM_SIMD_SLOTS in x86_asm_defines.asm and get_xmm_caller_saved.
  * i#437: YMM is an extension of XMM from 128-bit to 256-bit without
  * adding new ones, so code operating on XMM often also operates on YMM,
  * and thus some *XMM* macros also apply to *YMM*.
  */
-#endif
-#ifdef X64
-# ifdef WINDOWS
-#  define NUM_XMM_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-5*/
+# endif
+# ifdef X64
+#  ifdef WINDOWS
+    /*xmm0-5*/
+#   define NUM_SIMD_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */
+#  else
+    /*xmm0-15*/
+#   define NUM_SIMD_SLOTS 16 /**< Number of [xy]mm reg slots in dr_mcontext_t */
+#  endif
+#  define PRE_XMM_PADDING 16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
 # else
-#  define NUM_XMM_SLOTS 16 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-15*/
+   /*xmm0-7*/
+#  define NUM_SIMD_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */
+#  define PRE_XMM_PADDING 24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
 # endif
-# define PRE_XMM_PADDING 16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
+
+# define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */
+
 #else
-# define NUM_XMM_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */ /*xmm0-7*/
-# define PRE_XMM_PADDING 24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
-#endif
+# error NYI
+#endif /* AARCHXX/X86 */
 
 /** Values for the flags field of dr_mcontext_t */
 typedef enum {
@@ -1910,10 +1920,10 @@ typedef struct _priv_mcontext_t {
  * have noticable impacts, i.e. pushing bbs over the max size limit,
  * and could have a noticeable performance hit.
  */
-/* We now save everything but we keep separate NUM_XMM_SLOTS vs NUM_XMM_SAVED
+/* We now save everything but we keep separate NUM_SIMD_SLOTS vs NUM_SIMD_SAVED
  * in case we go back to not saving some slots in the future: e.g., w/o
  * CLIENT_INTERFACE we could control our own libs enough to avoid some saves.
  */
-#define NUM_XMM_SAVED NUM_XMM_SLOTS
+#define NUM_SIMD_SAVED NUM_SIMD_SLOTS
 
 #endif /* ifndef _GLOBALS_SHARED_H_ */
diff --git a/core/lib/instrument.c b/core/lib/instrument.c
index 78b65ff7d64..a3fa8c52824 100644
--- a/core/lib/instrument.c
+++ b/core/lib/instrument.c
@@ -5166,7 +5166,7 @@ dr_insert_clean_call_ex_varg(void *drcontext, instrlist_t *ilist, instr_t *where
         cci.num_xmms_skip = 6;
 #else
         /* all 8 (or 16) are scratch */
-        cci.num_xmms_skip = NUM_XMM_REGS;
+        cci.num_xmms_skip = NUM_SIMD_REGS;
 #endif
         for (i=0; i<cci.num_xmms_skip; i++)
             cci.xmm_skip[i] = true;
diff --git a/core/lib/instrument_api.h b/core/lib/instrument_api.h
index 7228580a1e8..4e7e6f93801 100644
--- a/core/lib/instrument_api.h
+++ b/core/lib/instrument_api.h
@@ -5347,7 +5347,7 @@ DR_API
  * multimedia registers incurs a higher performance cost.  An invalid
  * flags value will return false.
  *
- * \note NUM_XMM_SLOTS in the dr_mcontext_t.xmm array are filled in,
+ * \note NUM_SIMD_SLOTS slots in the dr_mcontext_t.ymm array are filled in,
  * but only if dr_mcontext_xmm_fields_valid() returns true and
  * DR_MC_MULTIMEDIA is set in the flags field.
  *
diff --git a/core/lib/mcxtx.h b/core/lib/mcxtx.h
index 0150a9382c6..01d3161e3e1 100644
--- a/core/lib/mcxtx.h
+++ b/core/lib/mcxtx.h
@@ -244,5 +244,5 @@
      * DrMi#665: we now preserve all of the xmm registers.
      */
 # endif
-    dr_ymm_t ymm[NUM_XMM_SLOTS];
+    dr_ymm_t ymm[NUM_SIMD_SLOTS];
 #endif /* ARM/X86 */
diff --git a/core/unix/signal_linux_x86.c b/core/unix/signal_linux_x86.c
index da3b9be24df..d3c86c2d516 100644
--- a/core/unix/signal_linux_x86.c
+++ b/core/unix/signal_linux_x86.c
@@ -196,7 +196,7 @@ save_xmm(dcontext_t *dcontext, sigframe_rt_t *frame)
         dr_xgetbv(&bv_high, &bv_low);
         xstate->xstate_hdr.xstate_bv = (((uint64)bv_high)<<32) | bv_low;
     }
-    for (i=0; i<NUM_XMM_SAVED; i++) {
+    for (i=0; i<NUM_SIMD_SAVED; i++) {
         /* we assume no padding */
 #ifdef X64
         /* __u32 xmm_space[64] */
@@ -360,7 +360,7 @@ dump_fpstate(dcontext_t *dcontext, struct _fpstate *fp)
             ASSERT(TEST(XCR0_AVX, fp->sw_reserved.xstate_bv));
             LOG(THREAD, LOG_ASYNCH, 1, "\txstate_bv = 0x"HEX64_FORMAT_STRING"\n",
                 xstate->xstate_hdr.xstate_bv);
-            for (i=0; i<NUM_XMM_SLOTS; i++) {
+            for (i=0; i<NUM_SIMD_SLOTS; i++) {
                 LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i);
                 for (j=0; j<4; j++)
                     LOG(THREAD, LOG_ASYNCH, 1, "%04x ", xstate->ymmh.ymmh_space[i*4+j]);
@@ -424,7 +424,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
     sigcontext_t *sc = sc_full->sc;
     if (sc->fpstate != NULL) {
         int i;
-        for (i=0; i<NUM_XMM_SLOTS; i++) {
+        for (i=0; i<NUM_SIMD_SLOTS; i++) {
             memcpy(&mc->ymm[i], &sc->fpstate->IF_X64_ELSE(xmm_space[i*4],_xmm[i]),
                    XMM_REG_SIZE);
         }
@@ -436,7 +436,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
                  */
                 ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate));
                 ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv));
-                for (i=0; i<NUM_XMM_SLOTS; i++) {
+                for (i=0; i<NUM_SIMD_SLOTS; i++) {
                     memcpy(&mc->ymm[i].u32[4], &xstate->ymmh.ymmh_space[i*4],
                            YMMH_REG_SIZE);
                 }
@@ -451,7 +451,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
     sigcontext_t *sc = sc_full->sc;
     if (sc->fpstate != NULL) {
         int i;
-        for (i=0; i<NUM_XMM_SLOTS; i++) {
+        for (i=0; i<NUM_SIMD_SLOTS; i++) {
             memcpy(&sc->fpstate->IF_X64_ELSE(xmm_space[i*4],_xmm[i]), &mc->ymm[i],
                    XMM_REG_SIZE);
         }
@@ -463,7 +463,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
                  */
                 ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate));
                 ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv));
-                for (i=0; i<NUM_XMM_SLOTS; i++) {
+                for (i=0; i<NUM_SIMD_SLOTS; i++) {
                     memcpy(&xstate->ymmh.ymmh_space[i*4], &mc->ymm[i].u32[4],
                            YMMH_REG_SIZE);
                 }
diff --git a/core/unix/signal_macos.c b/core/unix/signal_macos.c
index 15eeb5ee0db..00e4c06d5bf 100644
--- a/core/unix/signal_macos.c
+++ b/core/unix/signal_macos.c
@@ -154,11 +154,11 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
      */
     sigcontext_t *sc = sc_full->sc;
     int i;
-    for (i=0; i<NUM_XMM_SLOTS; i++) {
+    for (i=0; i<NUM_SIMD_SLOTS; i++) {
         memcpy(&mc->ymm[i], &sc->__fs.__fpu_xmm0 + i, XMM_REG_SIZE);
     }
     if (YMM_ENABLED()) {
-        for (i=0; i<NUM_XMM_SLOTS; i++) {
+        for (i=0; i<NUM_SIMD_SLOTS; i++) {
             memcpy(&mc->ymm[i].u32[4], &sc->__fs.__fpu_ymmh0 + i, YMMH_REG_SIZE);
         }
     }
@@ -169,11 +169,11 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
 {
     sigcontext_t *sc = sc_full->sc;
     int i;
-    for (i=0; i<NUM_XMM_SLOTS; i++) {
+    for (i=0; i<NUM_SIMD_SLOTS; i++) {
         memcpy(&sc->__fs.__fpu_xmm0 + i, &mc->ymm[i], XMM_REG_SIZE);
     }
     if (YMM_ENABLED()) {
-        for (i=0; i<NUM_XMM_SLOTS; i++) {
+        for (i=0; i<NUM_SIMD_SLOTS; i++) {
             memcpy(&sc->__fs.__fpu_ymmh0 + i, &mc->ymm[i].u32[4], YMMH_REG_SIZE);
         }
     }
@@ -200,7 +200,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc)
                 *((ushort *)(&sc->__fs.__fpu_stmm0 + i) + j));
         LOG(THREAD, LOG_ASYNCH, 1, "\n");
     }
-    for (i=0; i<NUM_XMM_SLOTS; i++) {
+    for (i=0; i<NUM_SIMD_SLOTS; i++) {
         LOG(THREAD, LOG_ASYNCH, 1, "\txmm%d = ", i);
         for (j=0; j<4; j++)
             LOG(THREAD, LOG_ASYNCH, 1, "%08x ",
@@ -208,7 +208,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc)
         LOG(THREAD, LOG_ASYNCH, 1, "\n");
     }
     if (YMM_ENABLED()) {
-        for (i=0; i<NUM_XMM_SLOTS; i++) {
+        for (i=0; i<NUM_SIMD_SLOTS; i++) {
             LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i);
             for (j=0; j<4; j++) {
                 LOG(THREAD, LOG_ASYNCH, 1, "%08x ",
diff --git a/core/win32/callback.c b/core/win32/callback.c
index 2223cb38781..229d5eda840 100644
--- a/core/win32/callback.c
+++ b/core/win32/callback.c
@@ -4513,7 +4513,7 @@ dump_context_info(CONTEXT *context, file_t file, bool all)
         TESTALL(CONTEXT_XMM_FLAG, context->ContextFlags)) {
         int i, j;
         byte *ymmh_area;
-        for (i=0; i<NUM_XMM_SAVED; i++) {
+        for (i=0; i<NUM_SIMD_SAVED; i++) {
             LOG(file, LOG_ASYNCH, 2, "xmm%d=0x", i);
             /* This would be simpler if we had uint64 fields in dr_xmm_t but
              * that complicates our struct layouts */
diff --git a/core/win32/inject.c b/core/win32/inject.c
index 02c2f9d0770..3cc585e790b 100644
--- a/core/win32/inject.c
+++ b/core/win32/inject.c
@@ -246,7 +246,7 @@ inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle,
             int i, j;
             /* For x86, ensure we have ExtendedRegisters space (i#1223) */
             IF_NOT_X64(ASSERT(TEST(CONTEXT_XMM_FLAG, cxt->ContextFlags)));
-            for (i = 0; i < NUM_XMM_SLOTS; i++) {
+            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
                 for (j = 0; j < IF_X64_ELSE(2,4); j++) {
                     *bufptr++ = CXT_XMM(cxt, i)->reg[j];
                 }
diff --git a/core/win32/ntdll.c b/core/win32/ntdll.c
index 15c53cfeb64..e40171ec2e4 100644
--- a/core/win32/ntdll.c
+++ b/core/win32/ntdll.c
@@ -1157,7 +1157,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt)
         /* no harm done if no sse support */
         /* CONTEXT_FLOATING_POINT or CONTEXT_EXTENDED_REGISTERS */
         int i;
-        for (i = 0; i < NUM_XMM_SLOTS; i++)
+        for (i = 0; i < NUM_SIMD_SLOTS; i++)
             memcpy(&mcontext->ymm[i], CXT_XMM(cxt, i), XMM_REG_SIZE);
     }
     /* if XSTATE is NOT set, the app has NOT used any ymm state and
@@ -1167,7 +1167,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt)
         byte *ymmh_area = context_ymmh_saved_area(cxt);
         if (ymmh_area != NULL) {
             int i;
-            for (i = 0; i < NUM_XMM_SLOTS; i++) {
+            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
                 memcpy(&mcontext->ymm[i].u32[4],
                        &YMMH_AREA(ymmh_area, i).u32[0],
                        YMMH_REG_SIZE);
@@ -1259,7 +1259,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg)
         memcpy(&cxt->ExtendedRegisters, fpstate, written);
 #endif
         /* Now update w/ the xmm values from mcontext */
-        for (i = 0; i < NUM_XMM_SLOTS; i++)
+        for (i = 0; i < NUM_SIMD_SLOTS; i++)
             memcpy(CXT_XMM(cxt, i), &mcontext->ymm[i], XMM_REG_SIZE);
     }
     if (CONTEXT_PRESERVE_YMM && TESTALL(CONTEXT_XSTATE, cxt->ContextFlags)) {
@@ -1289,7 +1289,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg)
             memcpy(&YMMH_AREA(ymmh_area, 6).u32[0], &ymms[0].u32[4], YMMH_REG_SIZE);
             memcpy(&YMMH_AREA(ymmh_area, 7).u32[0], &ymms[1].u32[4], YMMH_REG_SIZE);
 #endif
-            for (i = 0; i < NUM_XMM_SLOTS; i++) {
+            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
                 memcpy(&YMMH_AREA(ymmh_area, i).u32[0],
                        &mcontext->ymm[i].u32[4],
                        YMMH_REG_SIZE);
diff --git a/suite/tests/client-interface/inline.dll.c b/suite/tests/client-interface/inline.dll.c
index 22cfdeba111..034319b6a5d 100644
--- a/suite/tests/client-interface/inline.dll.c
+++ b/suite/tests/client-interface/inline.dll.c
@@ -349,7 +349,7 @@ mcontexts_equal(dr_mcontext_t *mc_a, dr_mcontext_t *mc_b, int func_index)
 
     /* Only look at the initialized bits of the SSE regs. */
     ymm_bytes_used = (proc_has_feature(FEATURE_AVX) ? 32 : 16);
-    for (i = 0; i < NUM_XMM_SLOTS; i++) {
+    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
         if (memcmp(&mc_a->ymm[i], &mc_b->ymm[i], ymm_bytes_used) != 0)
             return false;
     }
@@ -375,7 +375,7 @@ dump_diff_mcontexts(void)
     }
 
     dr_fprintf(STDERR, "Printing XMM regs:\n");
-    for (i = 0; i < NUM_XMM_SLOTS; i++) {
+    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
         dr_ymm_t before_reg = before_mcontext.ymm[i];
         dr_ymm_t  after_reg =  after_mcontext.ymm[i];
         size_t mmsz = proc_has_feature(FEATURE_AVX) ? sizeof(dr_xmm_t) :
diff --git a/suite/tests/linux/sigcontext.c b/suite/tests/linux/sigcontext.c
index 4479c8aec41..9ce30647b5a 100644
--- a/suite/tests/linux/sigcontext.c
+++ b/suite/tests/linux/sigcontext.c
@@ -44,10 +44,10 @@
 #include <errno.h>
 
 #ifdef X64
-# define NUM_XMM_REGS 16
+# define NUM_SIMD_REGS 16
 # define XAX "rax"
 #else
-# define NUM_XMM_REGS 8
+# define NUM_SIMD_REGS 8
 # define XAX "eax"
 #endif
 #define INTS_PER_XMM 4
@@ -66,7 +66,7 @@ signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt)
              * fpstate with xmm inside on delayed signals
              */
             struct _fpstate *fp = (struct _fpstate *) ucxt->uc_mcontext.fpregs;
-            for (i = 0; i < NUM_XMM_REGS; i++) {
+            for (i = 0; i < NUM_SIMD_REGS; i++) {
                 print("xmm[%d] = 0x%x 0x%x 0x%x 0x%x\n", i,
 #ifdef X64
                       fp->xmm_space[i*4], fp->xmm_space[i*4+1],
@@ -98,7 +98,7 @@ signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt)
             if (xstate->fpstate.sw_reserved.magic1 == FP_XSTATE_MAGIC1) {
                 assert(xstate->fpstate.sw_reserved.xstate_size >= sizeof(*xstate));
                 /* we can't print b/c not all processors have avx */
-                for (i = 0; i < NUM_XMM_REGS; i++) {
+                for (i = 0; i < NUM_SIMD_REGS; i++) {
 #if VERBOSE
                     print("ymmh[%d] = 0x%x 0x%x 0x%x 0x%x\n", i,
                           xstate->ymmh.ymmh_space[i*4],
@@ -143,7 +143,7 @@ determine_avx(void)
 int
 main(int argc, char *argv[])
 {
-    int buf[INTS_PER_XMM*NUM_XMM_REGS];
+    int buf[INTS_PER_XMM*NUM_SIMD_REGS];
     char *ptr = (char *)buf;
     int i, j;
 
@@ -152,7 +152,7 @@ main(int argc, char *argv[])
     print("Sending SIGUSR1\n");
 
     /* put known values in xmm regs (we assume processor has xmm) */
-    for (i = 0; i < NUM_XMM_REGS; i++) {
+    for (i = 0; i < NUM_SIMD_REGS; i++) {
         for (j = 0; j < INTS_PER_XMM; j++)
             buf[i*INTS_PER_XMM+j] = 0xdeadbeef << i;
     }
@@ -180,12 +180,12 @@ main(int argc, char *argv[])
 
     if (determine_avx()) {
         /* put known values in ymm regs */
-        int buf[INTS_PER_YMM*NUM_XMM_REGS];
+        int buf[INTS_PER_YMM*NUM_SIMD_REGS];
         char *ptr = (char *)buf;
         int i, j;
         intercept_signal(SIGUSR2, signal_handler, false);
         /* put known values in xmm regs (we assume processor has xmm) */
-        for (i = 0; i < NUM_XMM_REGS; i++) {
+        for (i = 0; i < NUM_SIMD_REGS; i++) {
             for (j = 0; j < INTS_PER_YMM; j++)
                 buf[i*INTS_PER_YMM+j] = 0xdeadbeef << i;
         }