diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b1430c5fa0..5e1b01df6c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -621,6 +621,12 @@ if (X86 AND UNIX) check_avx512_processor_and_compiler_support(proc_supports_avx512) endif () +set(proc_supports_sve OFF) +if (AARCH64 AND UNIX) + set(CFLAGS_SVE "-march=armv8-a+sve") + check_sve_processor_and_compiler_support(proc_supports_sve) +endif () + # Ensure that _AMD64_ or _X86_ are defined on Microsoft Windows, as otherwise # um/winnt.h provided since Windows 10.0.22000 will error. if (NOT UNIX) diff --git a/api/samples/memtrace_simple.c b/api/samples/memtrace_simple.c index 3227c46c2f3..f45b45a1c73 100644 --- a/api/samples/memtrace_simple.c +++ b/api/samples/memtrace_simple.c @@ -321,7 +321,7 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *wher const opnd_t src = instr_get_src(instr_operands, i); if (opnd_is_memory_reference(src)) { #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. */ if (opnd_is_base_disp(src) && @@ -343,7 +343,7 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *wher const opnd_t dst = instr_get_dst(instr_operands, i); if (opnd_is_memory_reference(dst)) { #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. */ if (opnd_is_base_disp(dst) && diff --git a/api/samples/memval_simple.c b/api/samples/memval_simple.c index 15a7539c308..6f523e14cb6 100644 --- a/api/samples/memval_simple.c +++ b/api/samples/memval_simple.c @@ -334,7 +334,7 @@ handle_post_write(void *drcontext, instrlist_t *ilist, instr_t *where, reg_id_t } #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. */ if (opnd_is_base_disp(dst) && @@ -405,7 +405,7 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *wher break; } #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. 
*/ if (opnd_is_base_disp(dst) && diff --git a/clients/drcachesim/tests/scattergather-aarch64.templatex b/clients/drcachesim/tests/scattergather-aarch64.templatex new file mode 100644 index 00000000000..50aaa0cb30a --- /dev/null +++ b/clients/drcachesim/tests/scattergather-aarch64.templatex @@ -0,0 +1,86 @@ +#ifdef __ARM_FEATURE_SVE +ld1b 32bit unscaled offset uxtw: PASS +ld1b 32bit unscaled offset sxtw: PASS +ld1b 32bit unpacked unscaled offset uxtw: PASS +ld1b 32bit unpacked unscaled offset sxtw: PASS +ld1b 64bit unscaled offset: PASS +ld1b 64bit unscaled offset Zt==Zm: PASS +ld1sb 32bit unscaled offset uxtw: PASS +ld1sb 32bit unscaled offset sxtw: PASS +ld1sb 32bit unpacked unscaled offset uxtw: PASS +ld1sb 32bit unpacked unscaled offset sxtw: PASS +ld1sb 64bit unscaled offset: PASS +ld1sb 64bit unscaled offset Zt==Zm: PASS +ld1h 32bit scaled offset uxtw: PASS +ld1h 32bit scaled offset sxtw: PASS +ld1h 32bit unpacked scaled offset uxtw: PASS +ld1h 32bit unpacked scaled offset sxtw: PASS +ld1h 32bit unpacked unscaled offset uxtw: PASS +ld1h 32bit unpacked unscaled offset sxtw: PASS +ld1h 32bit unscaled offset uxtw: PASS +ld1h 32bit unscaled offset sxtw: PASS +ld1h 64bit scaled offset: PASS +ld1h 64bit unscaled offset: PASS +ld1h 64bit unscaled offset Zt==Zm: PASS +ld1sh 32bit scaled offset uxtw: PASS +ld1sh 32bit scaled offset sxtw: PASS +ld1sh 32bit unpacked scaled offset uxtw: PASS +ld1sh 32bit unpacked scaled offset sxtw: PASS +ld1sh 32bit unpacked unscaled offset uxtw: PASS +ld1sh 32bit unpacked unscaled offset sxtw: PASS +ld1sh 32bit unscaled offset uxtw: PASS +ld1sh 32bit unscaled offset sxtw: PASS +ld1sh 64bit scaled offset: PASS +ld1sh 64bit unscaled offset: PASS +ld1sh 64bit unscaled offset Zt==Zm: PASS +ld1w 32bit scaled offset uxtw: PASS +ld1w 32bit scaled offset sxtw: PASS +ld1w 32bit unpacked scaled offset uxtw: PASS +ld1w 32bit unpacked scaled offset sxtw: PASS +ld1w 32bit unpacked unscaled offset uxtw: PASS +ld1w 32bit unpacked unscaled offset sxtw: PASS +ld1w 32bit unscaled offset uxtw: PASS +ld1w 32bit unscaled offset sxtw: PASS +ld1w 64bit scaled offset: PASS +ld1w 64bit unscaled offset: PASS +ld1w 64bit unscaled offset Zt==Zm: PASS +ld1sw 32bit unpacked scaled offset uxtw: PASS +ld1sw 32bit unpacked scaled offset sxtw: PASS +ld1sw 32bit unpacked unscaled offset uxtw: PASS +ld1sw 32bit unpacked unscaled offset sxtw: PASS +ld1sw 64bit scaled offset: PASS +ld1sw 64bit unscaled offset: PASS +ld1sw 64bit unscaled offset Zt==Zm: PASS +ld1d 32bit unpacked scaled offset uxtw: PASS +ld1d 32bit unpacked scaled offset sxtw: PASS +ld1d 32bit unpacked unscaled offset uxtw: PASS +ld1d 32bit unpacked unscaled offset sxtw: PASS +ld1d 64bit scaled offset: PASS +ld1d 64bit unscaled offset: PASS +ld1d 64bit unscaled offset Zt==Zm: PASS +#endif /* __ARM_FEATURE_SVE */ +---- ---- +Basic counts tool results: +Total counts: + .* total \(fetched\) instructions + .* total unique \(fetched\) instructions + .* total non-fetched instructions + .* total prefetches + .* total data loads + .* total data stores + .* total icache flushes + .* total dcache flushes + 1 total threads + .* total scheduling markers +.* +Thread .* counts: + .* \(fetched\) instructions + .* unique \(fetched\) instructions + .* non-fetched instructions + .* prefetches + .* data loads + .* data stores + .* icache flushes + .* dcache flushes + .* scheduling markers +.* diff --git a/clients/drcachesim/tests/scattergather.templatex b/clients/drcachesim/tests/scattergather-x86.templatex similarity index 100% rename from
clients/drcachesim/tests/scattergather.templatex rename to clients/drcachesim/tests/scattergather-x86.templatex diff --git a/clients/drcachesim/tracer/instru_offline.cpp b/clients/drcachesim/tracer/instru_offline.cpp index 99f5ac83dbb..9d303e01677 100644 --- a/clients/drcachesim/tracer/instru_offline.cpp +++ b/clients/drcachesim/tracer/instru_offline.cpp @@ -949,7 +949,7 @@ offline_instru_t::identify_elidable_addresses(void *drcontext, instrlist_t *ilis // view by expanding the instr in raw2trace (e.g. using // drx_expand_scatter_gather) when building the ilist. if (drutil_instr_is_stringop_loop(instr) - // TODO i#3837: Scatter/gather support NYI on ARM/AArch64. + // TODO i#5036: Scatter/gather support incomplete on AArch64. IF_X86(|| instr_is_scatter(instr) || instr_is_gather(instr))) { return; } diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 1b3365cf5f8..3c067c9c1ce 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -1311,7 +1311,7 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst const opnd_t src = instr_get_src(instr_operands, i); if (opnd_is_memory_reference(src)) { #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. */ if (opnd_is_base_disp(src) && @@ -1335,7 +1335,7 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst const opnd_t dst = instr_get_dst(instr_operands, i); if (opnd_is_memory_reference(dst)) { #ifdef AARCH64 - /* TODO i#5844: Memory references involving SVE registers are not + /* TODO i#5036: Memory references involving SVE registers are not * supported yet. To be implemented as part of scatter/gather work. */ if (opnd_is_base_disp(dst) && diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c index ae17444c225..4abbabfe352 100644 --- a/core/ir/aarch64/instr.c +++ b/core/ir/aarch64/instr.c @@ -610,8 +610,28 @@ DR_API bool instr_is_scatter(instr_t *instr) { - /* FIXME i#3837: add support. */ - ASSERT_NOT_IMPLEMENTED(false); + switch (instr_get_opcode(instr)) { + case OP_st1b: + case OP_st1h: + case OP_st1w: + case OP_st1d: + case OP_st2b: + case OP_st2h: + case OP_st2w: + case OP_st2d: + case OP_st3b: + case OP_st3h: + case OP_st3w: + case OP_st3d: + case OP_st4b: + case OP_st4h: + case OP_st4w: + case OP_st4d: + case OP_stnt1b: + case OP_stnt1h: + case OP_stnt1w: + case OP_stnt1d: return true; + } return false; } @@ -619,8 +639,53 @@ DR_API bool instr_is_gather(instr_t *instr) { - /* FIXME i#3837: add support. 
*/ - ASSERT_NOT_IMPLEMENTED(false); + switch (instr_get_opcode(instr)) { + case OP_ld1b: + case OP_ld1h: + case OP_ld1w: + case OP_ld1d: + case OP_ld1sb: + case OP_ld1sh: + case OP_ld1sw: + case OP_ld1rob: + case OP_ld1rqb: + case OP_ld1rqh: + case OP_ld1rqw: + case OP_ld1rqd: + case OP_ldff1b: + case OP_ldff1h: + case OP_ldff1w: + case OP_ldff1d: + case OP_ldff1sb: + case OP_ldff1sh: + case OP_ldff1sw: + case OP_ldnf1b: + case OP_ldnf1h: + case OP_ldnf1w: + case OP_ldnf1d: + case OP_ldnf1sb: + case OP_ldnf1sh: + case OP_ldnf1sw: + case OP_ldnt1b: + case OP_ldnt1h: + case OP_ldnt1w: + case OP_ldnt1d: + case OP_ldnt1sb: + case OP_ldnt1sh: + case OP_ldnt1sw: + case OP_ld2b: + case OP_ld2h: + case OP_ld2w: + case OP_ld2d: + case OP_ld3b: + case OP_ld3h: + case OP_ld3w: + case OP_ld3d: + case OP_ld4b: + case OP_ld4h: + case OP_ld4w: + case OP_ld4d: return true; + } return false; } diff --git a/core/ir/aarch64/instr_create_api.h b/core/ir/aarch64/instr_create_api.h index 7a699c55f28..7d50163d6d0 100644 --- a/core/ir/aarch64/instr_create_api.h +++ b/core/ir/aarch64/instr_create_api.h @@ -634,10 +634,13 @@ #define INSTR_CREATE_ldp(dc, rt1, rt2, mem) \ instr_create_2dst_1src(dc, OP_ldp, rt1, rt2, mem) #define INSTR_CREATE_ldr(dc, Rd, mem) instr_create_1dst_1src((dc), OP_ldr, (Rd), (mem)) +#define INSTR_CREATE_ldrsw(dc, Rd, mem) \ + instr_create_1dst_1src((dc), OP_ldrsw, (Rd), (mem)) #define INSTR_CREATE_ldrb(dc, Rd, mem) instr_create_1dst_1src(dc, OP_ldrb, Rd, mem) #define INSTR_CREATE_ldrsb(dc, Rd, mem) \ instr_create_1dst_1src((dc), OP_ldrsb, (Rd), (mem)) #define INSTR_CREATE_ldrh(dc, Rd, mem) instr_create_1dst_1src(dc, OP_ldrh, Rd, mem) +#define INSTR_CREATE_ldrsh(dc, Rd, mem) instr_create_1dst_1src(dc, OP_ldrsh, Rd, mem) #define INSTR_CREATE_ldur(dc, rt, mem) instr_create_1dst_1src(dc, OP_ldur, rt, mem) #define INSTR_CREATE_ldar(dc, Rt, mem) instr_create_1dst_1src((dc), OP_ldar, (Rt), (mem)) #define INSTR_CREATE_ldarb(dc, Rt, mem) \ @@ -7185,6 +7188,22 @@ #define INSTR_CREATE_eor_sve_pred_b(dc, Pd, Pg, Pn, Pm) \ instr_create_1dst_3src(dc, OP_eor, Pd, Pg, Pn, Pm) +/** + * Creates a NOT instruction. + * + * This macro is used to encode the forms: + * \verbatim + * NOT <Pd>.B, <Pg>/Z, <Pn>.B + * \endverbatim + * NOT <Pd>.B, <Pg>/Z, <Pn>.B is an alias of EOR <Pd>.B, <Pg>/Z, <Pn>.B, <Pg>.B, + * which is how this macro encodes it. + * \param dc The void * dcontext used to allocate memory for the #instr_t. + * \param Pd The destination predicate register, P (Predicate). + * \param Pg The governing predicate register, P (Predicate). + * \param Pn The source predicate register, P (Predicate). + */ +#define INSTR_CREATE_not_sve_pred_b(dc, Pd, Pg, Pn) \ + INSTR_CREATE_eor_sve_pred_b( \ + dc, Pd, Pg, Pn, opnd_create_reg_element_vector(opnd_get_reg(Pg), OPSZ_1)) + /** * Creates an EOR instruction. * diff --git a/ext/drx/CMakeLists.txt b/ext/drx/CMakeLists.txt index d22ee8768d5..f66b6e83776 100755 --- a/ext/drx/CMakeLists.txt +++ b/ext/drx/CMakeLists.txt @@ -43,6 +43,7 @@ set(srcs drx.c drx_buf.c scatter_gather_${ARCH_NAME}.c + scatter_gather_shared.c # add more here ) @@ -71,6 +72,9 @@ macro(configure_drx_target target) if (WIN32) target_link_libraries(${target} ntdll_imports) endif () + target_include_directories(${target} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}) endmacro() configure_drx_target(drx) diff --git a/ext/drx/drx.c b/ext/drx/drx.c index 6c1d8b26925..d583fb07a17 100644 --- a/ext/drx/drx.c +++ b/ext/drx/drx.c @@ -71,8 +71,8 @@ # define IF_WINDOWS_ELSE(x, y) (y) #endif -#ifdef X86 -/* TODO i#3837: Add AArch64 support. */ +#if defined(X86) || defined(AARCH64) +/* TODO i#5036: Complete AArch64 support.
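+ * Currently only predicated scalar+vector gather loads with normal faulting behavior are expanded on AArch64; see drx_expand_scatter_gather() in scatter_gather_aarch64.c for the supported subset.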
*/ # define PLATFORM_SUPPORTS_SCATTER_GATHER #endif diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index 93827d32694..4c03890dade 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -38,6 +38,476 @@ #include "dr_api.h" #include "drx.h" #include "drmgr.h" +#include "drreg.h" +#include "../ext_utils.h" +#include "scatter_gather_shared.h" + +#include <stddef.h> /* for offsetof */ + +/* Control printing of verbose debugging messages. */ +#define VERBOSE 0 + +#define SVE_MAX_VECTOR_LENGTH_BITS 2048 +#define SVE_MAX_VECTOR_LENGTH_BYTES (SVE_MAX_VECTOR_LENGTH_BITS / 8) +#define SVE_VECTOR_ALIGNMENT_BYTES 16 +#define SVE_VECTOR_SPILL_SLOT_SIZE \ + (SVE_MAX_VECTOR_LENGTH_BYTES + (SVE_VECTOR_ALIGNMENT_BYTES - 1)) + +#define SVE_MAX_PREDICATE_LENGTH_BITS (SVE_MAX_VECTOR_LENGTH_BITS / 8) +#define SVE_MAX_PREDICATE_LENGTH_BYTES (SVE_MAX_PREDICATE_LENGTH_BITS / 8) +#define SVE_PREDICATE_ALIGNMENT_BYTES 2 +#define SVE_PREDICATE_SPILL_SLOT_SIZE SVE_MAX_PREDICATE_LENGTH_BYTES + +typedef struct _per_thread_t { + void *scratch_pred_spill_slot; /* Storage for spilled predicate register. */ +} per_thread_t; + +/* Track the state of manual spill slots for SVE registers. + * This corresponds to the spill slot storage in per_thread_t. + */ +typedef struct _spill_slot_state_t { + reg_id_t pred_slots[1]; +} spill_slot_state_t; + +void +init_spill_slot_state(OUT spill_slot_state_t *spill_slot_state) +{ + const size_t num_pred_slots = + sizeof(spill_slot_state->pred_slots) / sizeof(spill_slot_state->pred_slots[0]); + for (size_t i = 0; i < num_pred_slots; i++) + spill_slot_state->pred_slots[i] = DR_REG_NULL; +} + +void +drx_scatter_gather_thread_init(void *drcontext) +{ + per_thread_t *pt = (per_thread_t *)dr_thread_alloc(drcontext, sizeof(*pt)); + + /* + * The instructions we use to load/store the spilled predicate register require + * the base address to be aligned to 2 bytes: + * LDR <Pt>, [<Xn|SP>{, #<imm>, MUL VL}] + * STR <Pt>, [<Xn|SP>{, #<imm>, MUL VL}] + * and dr_thread_alloc() guarantees allocated memory is aligned to the pointer size + * (8 bytes) so we shouldn't have to do any further alignment.
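+ * (A predicate register holds VL/8 bits, so at the architectural maximum VL of 2048 bits the slot needs at most 256 bits = 32 bytes, matching SVE_PREDICATE_SPILL_SLOT_SIZE above.)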
+ */ + pt->scratch_pred_spill_slot = + dr_thread_alloc(drcontext, SVE_PREDICATE_SPILL_SLOT_SIZE); + DR_ASSERT_MSG(ALIGNED(pt->scratch_pred_spill_slot, SVE_PREDICATE_ALIGNMENT_BYTES), + "scratch_pred_spill_slot is misaligned"); + + drmgr_set_tls_field(drcontext, drx_scatter_gather_tls_idx, (void *)pt); +} + +void +drx_scatter_gather_thread_exit(void *drcontext) +{ + per_thread_t *pt = + (per_thread_t *)drmgr_get_tls_field(drcontext, drx_scatter_gather_tls_idx); + dr_thread_free(drcontext, pt->scratch_pred_spill_slot, SVE_PREDICATE_SPILL_SLOT_SIZE); + dr_thread_free(drcontext, pt, sizeof(*pt)); +} + +static void +get_scatter_gather_info(instr_t *instr, OUT scatter_gather_info_t *sg_info) +{ + DR_ASSERT_MSG(instr_is_scatter(instr) || instr_is_gather(instr), + "Instruction must be scatter or gather."); + + opnd_t dst0 = instr_get_dst(instr, 0); + opnd_t src0 = instr_get_src(instr, 0); + sg_info->mask_reg = opnd_get_reg(instr_get_src(instr, 1)); + + opnd_t memopnd; + if (instr_is_scatter(instr)) { + sg_info->is_load = false; + sg_info->scatter_src_reg = opnd_get_reg(src0); + sg_info->element_size = opnd_get_vector_element_size(src0); + memopnd = dst0; + } else { + sg_info->is_load = true; + sg_info->gather_dst_reg = opnd_get_reg(dst0); + sg_info->element_size = opnd_get_vector_element_size(dst0); + memopnd = src0; + } + + sg_info->base_reg = opnd_get_base(memopnd); + sg_info->index_reg = opnd_get_index(memopnd); + + sg_info->disp = opnd_get_disp(memopnd); + sg_info->extend = + opnd_get_index_extend(memopnd, &sg_info->scaled, &sg_info->extend_amount); + + sg_info->scatter_gather_size = opnd_get_size(memopnd); + + switch (instr_get_opcode(instr)) { +#define DRX_CASE(op, _reg_count, value_size, value_signed, _faulting_behavior) \ + case OP_##op: \ + sg_info->reg_count = _reg_count; \ + sg_info->scalar_value_size = value_size; \ + sg_info->is_scalar_value_signed = value_signed; \ + sg_info->faulting_behavior = _faulting_behavior; \ + break + + DRX_CASE(ld1b, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1h, 1, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1w, 1, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1d, 1, OPSZ_8, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1sb, 1, OPSZ_1, true, DRX_NORMAL_FAULTING); + DRX_CASE(ld1sh, 1, OPSZ_2, true, DRX_NORMAL_FAULTING); + DRX_CASE(ld1sw, 1, OPSZ_4, true, DRX_NORMAL_FAULTING); + + DRX_CASE(ldff1b, 1, OPSZ_1, false, DRX_FIRST_FAULTING); + DRX_CASE(ldff1h, 1, OPSZ_2, false, DRX_FIRST_FAULTING); + DRX_CASE(ldff1w, 1, OPSZ_4, false, DRX_FIRST_FAULTING); + DRX_CASE(ldff1d, 1, OPSZ_8, false, DRX_FIRST_FAULTING); + DRX_CASE(ldff1sb, 1, OPSZ_1, true, DRX_FIRST_FAULTING); + DRX_CASE(ldff1sh, 1, OPSZ_2, true, DRX_FIRST_FAULTING); + DRX_CASE(ldff1sw, 1, OPSZ_4, true, DRX_FIRST_FAULTING); + + DRX_CASE(ldnf1b, 1, OPSZ_1, false, DRX_NON_FAULTING); + DRX_CASE(ldnf1h, 1, OPSZ_2, false, DRX_NON_FAULTING); + DRX_CASE(ldnf1w, 1, OPSZ_4, false, DRX_NON_FAULTING); + DRX_CASE(ldnf1d, 1, OPSZ_8, false, DRX_NON_FAULTING); + DRX_CASE(ldnf1sb, 1, OPSZ_1, true, DRX_NON_FAULTING); + DRX_CASE(ldnf1sh, 1, OPSZ_2, true, DRX_NON_FAULTING); + DRX_CASE(ldnf1sw, 1, OPSZ_4, true, DRX_NON_FAULTING); + + DRX_CASE(ldnt1b, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1h, 1, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1w, 1, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1d, 1, OPSZ_8, false, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1sb, 1, OPSZ_1, true, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1sh, 1, OPSZ_2, true, DRX_NORMAL_FAULTING); + DRX_CASE(ldnt1sw, 1, OPSZ_4, 
true, DRX_NORMAL_FAULTING); + + DRX_CASE(st1b, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(st1h, 1, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(st1w, 1, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(st1d, 1, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(stnt1b, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(stnt1h, 1, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(stnt1w, 1, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(stnt1d, 1, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(ld2b, 2, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld2h, 2, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld2w, 2, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld2d, 2, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(st2b, 2, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(st2h, 2, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(st2w, 2, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(st2d, 2, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(ld3b, 3, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld3h, 3, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld3w, 3, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld3d, 3, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(st3b, 3, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(st3h, 3, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(st3w, 3, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(st3d, 3, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(ld4b, 4, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld4h, 4, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld4w, 4, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld4d, 4, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(st4b, 4, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(st4h, 4, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(st4w, 4, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(st4d, 4, OPSZ_8, false, DRX_NORMAL_FAULTING); + + DRX_CASE(ld1rob, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + + DRX_CASE(ld1rqb, 1, OPSZ_1, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1rqh, 1, OPSZ_2, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1rqw, 1, OPSZ_4, false, DRX_NORMAL_FAULTING); + DRX_CASE(ld1rqd, 1, OPSZ_8, false, DRX_NORMAL_FAULTING); +#undef DRX_CASE + + default: DR_ASSERT_MSG(false, "Invalid scatter/gather instruction"); + } +} + +/* + * Emit code to expand a scalar + vector gather load into a series of equivalent scalar + * loads. + * These instructions have memory operands of the form: + * [<Xn|SP>, <Zm>.<Ts>{, <mod>}] + * where the address to load/store each element is calculated by adding the base + * address in the scalar register Xn to an offset read from the corresponding element + * of the vector index register Zm. + * + * The emitted code roughly implements this algorithm: + * if (is_load) + * clear_inactive_elements(dst); + * for (e=first_active_element(); + * active_elements_remain(); + * e = next_active_element()) { + * if (is_load) + * dst[e] = scalar_load(base, offsets[e], mod); + * else + * scalar_store(src[e], base, offsets[e], mod); + * } + * except we unroll the loop. Without unrolling the loop, drmemtrace's instrumentation + * would be repeated every iteration and give incorrect ifetch statistics.
+ * (See i#4948 for more details) + * + * For example, + * ld1d (%x0,%z26.d,lsl #3)[32byte] %p1/z -> %z27.d + * with a 256-bit vector length expands to: + * + * clear_inactive_elements: + * dup $0x00 lsl $0x00 -> %z27.d ; Clear dst register + * pfalse -> %p0.b + * handle_active_elements: + * pnext %p1 %p0.d -> %p0.d ; p0 = mask indicating first active + * ; element of p1 + * ; NOTE: This is the first *active* + * ; element which may or may not be + * ; element 0. + * b.none end ; if (no more active elements) goto end + * lastb %p0 %z26.d -> %x1 ; extract offset for the current element + * ldr (%x0,%x1,lsl #3)[8byte] -> %x1 ; perform the scalar load + * cpy %p0/m %x1 -> %z27.d ; cpy loaded value to dst element + * pnext %p1 %p0.d -> %p0.d ; Find the second active element (if any) + * b.none end + * lastb %p0 %z26.d -> %x1 + * ldr (%x0,%x1,lsl #3)[8byte] -> %x1 + * cpy %p0/m %x1 -> %z27.d + * pnext %p1 %p0.d -> %p0.d ; Find the third active element (if any) + * b.none end + * lastb %p0 %z26.d -> %x1 + * ldr (%x0,%x1,lsl #3)[8byte] -> %x1 + * cpy %p0/m %x1 -> %z27.d + * pnext %p1 %p0.d -> %p0.d ; Find the fourth active element (if any) + * b.none end + * lastb %p0 %z26.d -> %x1 + * ldr (%x0,%x1,lsl #3)[8byte] -> %x1 + * cpy %p0/m %x1 -> %z27.d + * end: + * ... + * + * TODO i#5036: Add support for scatter store operations. + */ +static void +expand_scalar_plus_vector(void *drcontext, instrlist_t *bb, instr_t *sg_instr, + const scatter_gather_info_t *sg_info, reg_id_t scratch_gpr, + reg_id_t scratch_pred, app_pc orig_app_pc) +{ +#define EMIT(op, ...) \ + instrlist_preinsert( \ + bb, sg_instr, INSTR_XL8(INSTR_CREATE_##op(drcontext, __VA_ARGS__), orig_app_pc)) + + DR_ASSERT_MSG(reg_is_z(sg_info->index_reg), "Index must be a Z register"); + + const uint no_of_elements = + (opnd_size_in_bytes(sg_info->scatter_gather_size) / sg_info->reg_count) / + opnd_size_in_bytes(sg_info->scalar_value_size); + + if (sg_info->is_load) { + /* First we deal with the inactive elements. Gather loads are always zeroing so we + * need to set all inactive elements to 0. + */ + if (sg_info->index_reg == sg_info->gather_dst_reg) { + /* The dst register is also the index register so we need to preserve the + * value of the active elements so we can use them as offsets. We do this by + * cpying a 0 value into the dst register using the inverse of the mask_reg as + * the governing predicate. + */ + + /* ptrue scratch_pred.b */ + EMIT(ptrue_sve, opnd_create_reg_element_vector(scratch_pred, OPSZ_1), + opnd_create_immed_pred_constr(DR_PRED_CONSTR_ALL)); + + /* not scratch_pred.b, scratch_pred/z, mask_reg.b */ + EMIT(not_sve_pred_b, opnd_create_reg_element_vector(scratch_pred, OPSZ_1), + opnd_create_predicate_reg(scratch_pred, false), + opnd_create_reg_element_vector(sg_info->mask_reg, OPSZ_1)); + + /* cpy gather_dst_reg.element_size, scratch_pred/m, #0, lsl #0 */ + EMIT(cpy_sve_shift_pred, + opnd_create_reg_element_vector(sg_info->gather_dst_reg, + sg_info->element_size), + opnd_create_predicate_reg(scratch_pred, true), OPND_CREATE_INT8(0), + opnd_create_immed_uint(0, OPSZ_1b)); + } else { + /* We don't care about any values in the dst register so zero the whole thing.
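+ * (dst is not also the index register in this branch, so there are no offsets to preserve and a single dup of #0 suffices.)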
+ */ + + /* dup gather_dst_reg.element_size, #0, lsl #0 */ + EMIT(dup_sve_shift, + opnd_create_reg_element_vector(sg_info->gather_dst_reg, + sg_info->element_size), + OPND_CREATE_INT8(0), opnd_create_immed_uint(0, OPSZ_1b)); + } + } + + /* pfalse scratch_pred.b */ + EMIT(pfalse_sve, opnd_create_reg_element_vector(scratch_pred, OPSZ_1)); + + instr_t *end_label = INSTR_CREATE_label(drcontext); + + for (uint i = 0; i < no_of_elements; i++) { + /* pnext scratch_pred.element_size, mask_reg, scratch_pred.element_size */ + EMIT(pnext_sve, + opnd_create_reg_element_vector(scratch_pred, sg_info->element_size), + opnd_create_reg(sg_info->mask_reg)); + + /* b.none end */ + instrlist_preinsert( + bb, sg_instr, + INSTR_XL8( + INSTR_PRED(INSTR_CREATE_bcond(drcontext, opnd_create_instr(end_label)), + DR_PRED_SVE_NONE), + orig_app_pc)); + + /* lastb scratch_gpr, scratch_pred, index_reg.element_size */ + EMIT(lastb_sve_scalar, opnd_create_reg(scratch_gpr), + opnd_create_reg(scratch_pred), + opnd_create_reg_element_vector(sg_info->index_reg, sg_info->element_size)); + + if (sg_info->is_load) { + /* ldr[bh] scratch_gpr, [base_reg, scratch_gpr, mod #amount] */ + opnd_t mem = opnd_create_base_disp_shift_aarch64( + sg_info->base_reg, scratch_gpr, sg_info->extend, sg_info->scaled, + /*disp=*/0, /*flags=*/0, sg_info->scalar_value_size, + sg_info->extend_amount); + + if (sg_info->is_scalar_value_signed) { + const reg_id_t ld_dst = + reg_resize_to_opsz(scratch_gpr, sg_info->element_size); + switch (sg_info->scalar_value_size) { + case OPSZ_1: EMIT(ldrsb, opnd_create_reg(ld_dst), mem); break; + case OPSZ_2: EMIT(ldrsh, opnd_create_reg(ld_dst), mem); break; + case OPSZ_4: EMIT(ldrsw, opnd_create_reg(ld_dst), mem); break; + default: DR_ASSERT_MSG(false, "Invalid scatter_gather_info_t data"); + } + } else { + const reg_id_t scratch_gpr_w = reg_resize_to_opsz(scratch_gpr, OPSZ_4); + switch (sg_info->scalar_value_size) { + case OPSZ_1: EMIT(ldrb, opnd_create_reg(scratch_gpr_w), mem); break; + case OPSZ_2: EMIT(ldrh, opnd_create_reg(scratch_gpr_w), mem); break; + case OPSZ_4: EMIT(ldr, opnd_create_reg(scratch_gpr_w), mem); break; + case OPSZ_8: EMIT(ldr, opnd_create_reg(scratch_gpr), mem); break; + default: DR_ASSERT_MSG(false, "Invalid scatter_gather_info_t data"); + } + } + + /* cpy gather_dst_reg.element_size, scratch_pred/m, scratch_gpr */ + EMIT(cpy_sve_pred, + opnd_create_reg_element_vector(sg_info->gather_dst_reg, + sg_info->element_size), + opnd_create_predicate_reg(scratch_pred, true), + opnd_create_reg(reg_resize_to_opsz(scratch_gpr, sg_info->element_size))); + } else { + DR_ASSERT_MSG(sg_info->is_load, "Stores are not yet supported"); + } + } + + instrlist_meta_preinsert(bb, sg_instr, end_label); + +#undef EMIT +} + +/* Spill a scratch predicate or vector register. + * TODO i#3844: drreg does not support spilling predicate regs yet, so we do it + * ourselves. + * When that support is available, this function can be replaced with a drreg API call. + */ +reg_id_t +reserve_sve_register(void *drcontext, instrlist_t *bb, instr_t *where, + reg_id_t scratch_gpr, reg_id_t min_register, reg_id_t max_register, + size_t slot_offset, opnd_size_t reg_size) +{ + /* Search the instruction for an unused register we will use as a temp. 
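+ * A scatter/gather instruction only references a small number of registers, so the searched range is assumed to always contain at least one unused register; the assert below checks this.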
*/ + reg_id_t reg; + for (reg = min_register; reg <= max_register; ++reg) { + if (!instr_uses_reg(where, reg)) + break; + } + DR_ASSERT(!instr_uses_reg(where, reg)); + + drmgr_insert_read_tls_field(drcontext, drx_scatter_gather_tls_idx, bb, where, + scratch_gpr); + + /* ldr scratch_gpr, [scratch_gpr, #slot_offset] */ + instrlist_meta_preinsert( + bb, where, + INSTR_CREATE_ldr(drcontext, opnd_create_reg(scratch_gpr), + OPND_CREATE_MEMPTR(scratch_gpr, slot_offset))); + + /* str reg, [scratch_gpr] */ + instrlist_meta_preinsert( + bb, where, + INSTR_CREATE_str(drcontext, + opnd_create_base_disp(scratch_gpr, DR_REG_NULL, 0, 0, reg_size), + opnd_create_reg(reg))); + + return reg; +} + +reg_id_t +reserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where, + reg_id_t scratch_gpr, spill_slot_state_t *slot_state) +{ + DR_ASSERT(slot_state->pred_slots[0] == DR_REG_NULL); + + /* Some instructions require the predicate to be in the range p0 - p7. This includes + * LASTB which we use to extract elements from the vector register. + */ + const reg_id_t reg = + reserve_sve_register(drcontext, bb, where, scratch_gpr, DR_REG_P0, DR_REG_P7, + offsetof(per_thread_t, scratch_pred_spill_slot), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)); + + slot_state->pred_slots[0] = reg; + return reg; +} + +/* Restore the scratch predicate reg. + * TODO i#3844: drreg does not support spilling predicate regs yet, so we do it + * ourselves. + * When that support is available, this function can be replaced with a drreg API call. + */ +void +unreserve_sve_register(void *drcontext, instrlist_t *bb, instr_t *where, + reg_id_t scratch_gpr, reg_id_t reg, size_t slot_offset, + opnd_size_t reg_size) +{ + drmgr_insert_read_tls_field(drcontext, drx_scatter_gather_tls_idx, bb, where, + scratch_gpr); + + /* ldr scratch_gpr, [scratch_gpr, #slot_offset] */ + instrlist_meta_preinsert( + bb, where, + INSTR_CREATE_ldr(drcontext, opnd_create_reg(scratch_gpr), + OPND_CREATE_MEMPTR(scratch_gpr, slot_offset))); + + /* ldr reg, [scratch_gpr] */ + instrlist_meta_preinsert( + bb, where, + INSTR_CREATE_ldr( + drcontext, opnd_create_reg(reg), + opnd_create_base_disp(scratch_gpr, DR_REG_NULL, 0, 0, reg_size))); +} + +void +unreserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where, + reg_id_t scratch_gpr, reg_id_t scratch_pred, + spill_slot_state_t *slot_state) +{ + DR_ASSERT(slot_state->pred_slots[0] == scratch_pred); + slot_state->pred_slots[0] = DR_REG_NULL; + + unreserve_sve_register(drcontext, bb, where, scratch_gpr, scratch_pred, + offsetof(per_thread_t, scratch_pred_spill_slot), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)); +} /***************************************************************************************** * drx_expand_scatter_gather() */ bool drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, OUT bool *expanded) { - /* TODO i#3837: add support for AArch64. */ if (expanded != NULL) *expanded = false; - return drmgr_current_bb_phase(drcontext) == DRMGR_PHASE_APP2APP; + + if (drmgr_current_bb_phase(drcontext) != DRMGR_PHASE_APP2APP) + return false; + + instr_t *sg_instr = NULL; + if (!scatter_gather_split_bb(drcontext, bb, &sg_instr)) { + /* bb did not begin with a scatter/gather instruction. If there were any + * scatter/gather instructions that were not at the beginning, they have been + * split out of the bb and we will be called again later to handle them.
+ */ + return true; + } + DR_ASSERT(sg_instr != NULL); + + scatter_gather_info_t sg_info; + bool res = false; + get_scatter_gather_info(sg_instr, &sg_info); + + if (!(sg_info.is_load && reg_is_z(sg_info.index_reg) && + sg_info.faulting_behavior == DRX_NORMAL_FAULTING)) + return true; + + /* We want to avoid spill slot conflicts with later instrumentation passes. */ + drreg_status_t res_bb_props = + drreg_set_bb_properties(drcontext, DRREG_HANDLE_MULTI_PHASE_SLOT_RESERVATIONS); + DR_ASSERT(res_bb_props == DRREG_SUCCESS); + + /* Tell drx_event_restore_state() that an expansion has occurred. */ + drx_mark_scatter_gather_expanded(); + + reg_id_t scratch_gpr = DR_REG_INVALID; + drvector_t allowed; + drreg_init_and_fill_vector(&allowed, true); + + /* We need the scratch registers and the base register's application value to be + * available at the same time, so do not allow the base register to be used as a + * scratch register. + */ + drreg_set_vector_entry(&allowed, sg_info.base_reg, false); + + if (drreg_reserve_aflags(drcontext, bb, sg_instr) != DRREG_SUCCESS) + goto drx_expand_scatter_gather_exit; + if (drreg_reserve_register(drcontext, bb, sg_instr, &allowed, &scratch_gpr) != + DRREG_SUCCESS) + goto drx_expand_scatter_gather_exit; + + spill_slot_state_t spill_slot_state; + init_spill_slot_state(&spill_slot_state); + + const reg_id_t scratch_pred = + reserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, &spill_slot_state); + + const app_pc orig_app_pc = instr_get_app_pc(sg_instr); + + emulated_instr_t emulated_instr; + emulated_instr.size = sizeof(emulated_instr); + emulated_instr.pc = instr_get_app_pc(sg_instr); + emulated_instr.instr = sg_instr; + /* Tools should instrument the data operations in the sequence. */ + emulated_instr.flags = DR_EMULATE_INSTR_ONLY; + drmgr_insert_emulation_start(drcontext, bb, sg_instr, &emulated_instr); + + if (sg_info.is_load && reg_is_z(sg_info.index_reg)) { + /* scalar+vector */ + expand_scalar_plus_vector(drcontext, bb, sg_instr, &sg_info, scratch_gpr, + scratch_pred, orig_app_pc); + } else { + /* TODO i#5036 + * Add support for: + * Other scatter gather variants: + * scalar + vector st1* + * vector + immediate ld1/st1* + * Predicated contiguous variants: + * scalar + immediate ld1/st1* + * scalar + scalar ld1/st1* + * First fault and non-faulting variants: + * ldff1*, ldnf1* + * Multi-register variants: + * ld2*, ld3*, ld4*, + * st2*, st3*, st4* + */ + goto drx_expand_scatter_gather_exit; + } + + drmgr_insert_emulation_end(drcontext, bb, sg_instr); + + unreserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, scratch_pred, + &spill_slot_state); + if (drreg_unreserve_register(drcontext, bb, sg_instr, scratch_gpr) != DRREG_SUCCESS) { + DR_ASSERT_MSG(false, "drreg_unreserve_register should not fail"); + goto drx_expand_scatter_gather_exit; + } + if (drreg_unreserve_aflags(drcontext, bb, sg_instr) != DRREG_SUCCESS) + goto drx_expand_scatter_gather_exit; + +#if VERBOSE + dr_fprintf(STDERR, "\tVector length = %u bytes\n", proc_get_vector_length_bytes()); + dr_print_instr(drcontext, STDERR, sg_instr, "\tThe instruction\n"); +#endif + + /* Remove and destroy the original scatter/gather.
*/ + instrlist_remove(bb, sg_instr); +#if VERBOSE + dr_fprintf(STDERR, "\twas expanded to the following sequence:\n"); + for (instr_t *instr = instrlist_first(bb); instr != NULL; + instr = instr_get_next(instr)) { + dr_print_instr(drcontext, STDERR, instr, ""); + } +#endif + + if (expanded != NULL) + *expanded = true; + res = true; + +drx_expand_scatter_gather_exit: + drvector_delete(&allowed); + return res; +} + +bool +drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, + instr_t *sg_inst) +{ + DR_ASSERT(instr_is_gather(sg_inst) || instr_is_scatter(sg_inst)); + /* TODO i#5365, i#5036: Restore the scratch predicate register. + * We need to add support for handling SVE state during + * signals first. + */ + DR_ASSERT_MSG(false, "NYI i#5365 i#5036"); + return false; } diff --git a/ext/drx/scatter_gather_shared.c b/ext/drx/scatter_gather_shared.c new file mode 100644 index 00000000000..ba16ef55cf1 --- /dev/null +++ b/ext/drx/scatter_gather_shared.c @@ -0,0 +1,165 @@ +/* ********************************************************** + * Copyright (c) 2013-2023 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* DynamoRio eXtension utilities */ + +#include "dr_api.h" +#include "drmgr.h" +#include "drx.h" + +#include "scatter_gather_shared.h" + +int drx_scatter_gather_tls_idx; + +/* + * Split a basic block at the first scatter/gather app instruction found. + * + * If the first app instruction in bb is a scatter/gather instruction, all following + * instructions will be removed so that bb just contains the scatter/gather instruction. + * + * If the first app instruction in bb is not a scatter/gather instruction, all + * instructions up until the first scatter/gather instruction will be left. The + * scatter/gather instruction and any following instructions will be removed from bb and + * the function will return false. 
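+ * (The split-off tail will be presented as a new bb, so the caller gets another chance to expand the scatter/gather instruction there.)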
+ * + * If there are no scatter/gather instructions in bb, it will be unchanged and the + * function will return false. + */ +bool +scatter_gather_split_bb(void *drcontext, instrlist_t *bb, OUT instr_t **sg_instr) +{ + instr_t *instr, *next_instr, *first_app = NULL; + bool delete_rest = false; + bool first_app_is_scatter_gather = false; + + for (instr = instrlist_first(bb); instr != NULL; instr = next_instr) { + next_instr = instr_get_next(instr); + if (delete_rest) { + instrlist_remove(bb, instr); + instr_destroy(drcontext, instr); + } else if (instr_is_app(instr)) { + if (first_app == NULL) + first_app = instr; + if (instr_is_gather(instr) || instr_is_scatter(instr)) { + delete_rest = true; + if (instr == first_app) { + first_app_is_scatter_gather = true; + } else { + instrlist_remove(bb, instr); + instr_destroy(drcontext, instr); + } + } + } + } + + if (first_app_is_scatter_gather && (sg_instr != NULL)) { + *sg_instr = first_app; + } + + return first_app_is_scatter_gather; +} + +/* These architecture specific functions are defined in scatter_gather_${ARCH_NAME}.c */ +void +drx_scatter_gather_thread_init(void *drcontext); + +void +drx_scatter_gather_thread_exit(void *drcontext); + +bool +drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, + instr_t *sg_inst); + +int drx_scatter_gather_expanded = 0; + +void +drx_mark_scatter_gather_expanded(void) +{ + dr_atomic_store32(&drx_scatter_gather_expanded, 1); +} + +static bool +drx_event_restore_state(void *drcontext, bool restore_memory, + dr_restore_state_info_t *info) +{ + instr_t inst; + bool success = true; + if (info->fragment_info.cache_start_pc == NULL) + return true; /* fault not in cache */ + if (dr_atomic_load32(&drx_scatter_gather_expanded) == 0) { + /* Nothing to do if no one has ever called drx_expand_scatter_gather(). */ + return true; + } + if (!info->fragment_info.app_code_consistent) { + /* Can't verify application code. + * XXX i#2985: is it better to keep searching? + */ + return true; + } + instr_init(drcontext, &inst); + byte *pc = decode(drcontext, dr_fragment_app_pc(info->fragment_info.tag), &inst); + if (pc != NULL) { + if (instr_is_gather(&inst) || instr_is_scatter(&inst)) { + success = success && drx_scatter_gather_restore_state(drcontext, info, &inst); + } + } + instr_free(drcontext, &inst); + return success; +} + +bool +drx_scatter_gather_init() +{ + drmgr_priority_t fault_priority = { sizeof(fault_priority), + DRMGR_PRIORITY_NAME_DRX_FAULT, NULL, NULL, + DRMGR_PRIORITY_FAULT_DRX }; + + if (!drmgr_register_restore_state_ex_event_ex(drx_event_restore_state, + &fault_priority)) + return false; + + drx_scatter_gather_tls_idx = drmgr_register_tls_field(); + if (drx_scatter_gather_tls_idx == -1) + return false; + + if (!drmgr_register_thread_init_event(drx_scatter_gather_thread_init) || + !drmgr_register_thread_exit_event(drx_scatter_gather_thread_exit)) + return false; + + return true; +} + +void +drx_scatter_gather_exit() +{ + drmgr_unregister_tls_field(drx_scatter_gather_tls_idx); +} diff --git a/ext/drx/scatter_gather_shared.h b/ext/drx/scatter_gather_shared.h new file mode 100644 index 00000000000..2b25422b083 --- /dev/null +++ b/ext/drx/scatter_gather_shared.h @@ -0,0 +1,107 @@ +/* ********************************************************** + * Copyright (c) 2013-2023 Google, Inc. All rights reserved. + * Copyright (c) 2023 Arm Limited. All rights reserved.
+ * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* DynamoRio eXtension utilities */ + +#include "dr_api.h" + +extern int drx_scatter_gather_tls_idx; + +/* Make each scatter or gather instruction be in their own basic block. + */ +bool +scatter_gather_split_bb(void *drcontext, instrlist_t *bb, OUT instr_t **sg_instr); + +/* Tell drx_event_restore_state() that an expansion has occurred. */ +void +drx_mark_scatter_gather_expanded(void); + +typedef struct _scatter_gather_info_t { +#if defined(X86) + bool is_evex; +#endif + bool is_load; + +#if defined(AARCH64) + /* The vector element size for all vector registers used by the instruction. + * This applies to: + * gather_dst_reg/scatter_src_reg for all + * scatter/gather/predicated-contiguous-access instructions, + * base_reg for vector+immediate scatter/gather instructions, + * index_reg for scalar+vector scatter/gather instructions. + */ + opnd_size_t element_size; +#elif defined(X86) + opnd_size_t scalar_index_size; +#endif + + opnd_size_t scalar_value_size; + opnd_size_t scatter_gather_size; + reg_id_t mask_reg; + reg_id_t base_reg; + reg_id_t index_reg; + union { + reg_id_t gather_dst_reg; + reg_id_t scatter_src_reg; + }; + int disp; +#if defined(X86) + int scale; +#elif defined(AARCH64) + dr_extend_type_t extend; + uint extend_amount; + uint reg_count; /* Number of registers accessed. If >1 + * gather_dst_reg/scatter_src_reg is the first register. 
+ */ + bool scaled; + bool is_scalar_value_signed; + enum { + DRX_NORMAL_FAULTING, + DRX_FIRST_FAULTING, + DRX_NON_FAULTING, + } faulting_behavior; +#endif +} scatter_gather_info_t; + +/* These architecture specific functions are defined in scatter_gather_${ARCH_NAME}.c + * and used by functions in scatter_gather_shared.c + */ +void +drx_scatter_gather_thread_init(void *drcontext); + +void +drx_scatter_gather_thread_exit(void *drcontext); + +bool +drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, + instr_t *sg_inst); diff --git a/ext/drx/scatter_gather_x86.c b/ext/drx/scatter_gather_x86.c index ea52e643fba..ea84166cf65 100644 --- a/ext/drx/scatter_gather_x86.c +++ b/ext/drx/scatter_gather_x86.c @@ -38,6 +38,7 @@ #include "drx.h" #include "hashtable.h" #include "../ext_utils.h" +#include "scatter_gather_shared.h" #include <stddef.h> /* for offsetof */ #include @@ -89,23 +90,17 @@ #define VERBOSE 0 -static int tls_idx; typedef struct _per_thread_t { void *scratch_mm_spill_slot; void *scratch_mm_spill_slot_aligned; } per_thread_t; static per_thread_t init_pt; -static int drx_scatter_gather_expanded; - -static bool -drx_event_restore_state(void *drcontext, bool restore_memory, - dr_restore_state_info_t *info); - static per_thread_t * get_tls_data(void *drcontext) { - per_thread_t *pt = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx); + per_thread_t *pt = + (per_thread_t *)drmgr_get_tls_field(drcontext, drx_scatter_gather_tls_idx); /* Support use during init (i#2910). */ if (pt == NULL) return &init_pt; @@ -134,8 +129,8 @@ get_mov_scratch_mm_opcode_and_size(int *opcode_out, opnd_size_t *opnd_size_out) *opnd_size_out = opnd_size; } -static void -drx_thread_init(void *drcontext) +void +drx_scatter_gather_thread_init(void *drcontext) { per_thread_t *pt = (per_thread_t *)dr_thread_alloc(drcontext, sizeof(*pt)); opnd_size_t mm_opsz; @@ -144,13 +139,14 @@ dr_thread_alloc(drcontext, opnd_size_in_bytes(mm_opsz) + (MM_ALIGNMENT - 1)); pt->scratch_mm_spill_slot_aligned = (void *)ALIGN_FORWARD(pt->scratch_mm_spill_slot, MM_ALIGNMENT); - drmgr_set_tls_field(drcontext, tls_idx, (void *)pt); + drmgr_set_tls_field(drcontext, drx_scatter_gather_tls_idx, (void *)pt); } -static void -drx_thread_exit(void *drcontext) +void +drx_scatter_gather_thread_exit(void *drcontext) { - per_thread_t *pt = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx); + per_thread_t *pt = + (per_thread_t *)drmgr_get_tls_field(drcontext, drx_scatter_gather_tls_idx); opnd_size_t mm_opsz; get_mov_scratch_mm_opcode_and_size(NULL, &mm_opsz); dr_thread_free(drcontext, pt->scratch_mm_spill_slot, @@ -158,49 +154,6 @@ dr_thread_free(drcontext, pt, sizeof(*pt)); } -bool -drx_scatter_gather_init() -{ - drmgr_priority_t fault_priority = { sizeof(fault_priority), - DRMGR_PRIORITY_NAME_DRX_FAULT, NULL, NULL, - DRMGR_PRIORITY_FAULT_DRX }; - - if (!drmgr_register_restore_state_ex_event_ex(drx_event_restore_state, - &fault_priority)) - return false; - tls_idx = drmgr_register_tls_field(); - if (tls_idx == -1) - return false; - if (!drmgr_register_thread_init_event(drx_thread_init) || - !drmgr_register_thread_exit_event(drx_thread_exit)) - return false; - - return true; -} - -void -drx_scatter_gather_exit() -{ - drmgr_unregister_tls_field(tls_idx); -} - -typedef struct _scatter_gather_info_t { - bool is_evex; - bool is_load; - opnd_size_t scalar_index_size; - opnd_size_t scalar_value_size; - opnd_size_t scatter_gather_size; - reg_id_t mask_reg; - reg_id_t
base_reg; - reg_id_t index_reg; - union { - reg_id_t gather_dst_reg; - reg_id_t scatter_src_reg; - }; - int disp; - int scale; -} scatter_gather_info_t; - static void get_scatter_gather_info(instr_t *instr, scatter_gather_info_t *sg_info) { @@ -981,49 +934,30 @@ expand_gather_load_scalar_value(void *drcontext, instrlist_t *bb, instr_t *sg_in bool drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, OUT bool *expanded) { - instr_t *instr, *next_instr, *first_app = NULL; - bool delete_rest = false; - if (expanded != NULL) *expanded = false; if (drmgr_current_bb_phase(drcontext) != DRMGR_PHASE_APP2APP) { return false; } - /* Make each scatter or gather instruction be in their own basic block. - * TODO i#3837: cross-platform code like the following bb splitting can be shared - * with other architectures in the future. - */ - for (instr = instrlist_first(bb); instr != NULL; instr = next_instr) { - next_instr = instr_get_next(instr); - if (delete_rest) { - instrlist_remove(bb, instr); - instr_destroy(drcontext, instr); - } else if (instr_is_app(instr)) { - if (first_app == NULL) - first_app = instr; - if (instr_is_gather(instr) || instr_is_scatter(instr)) { - delete_rest = true; - if (instr != first_app) { - instrlist_remove(bb, instr); - instr_destroy(drcontext, instr); - } - } - } - } - if (first_app == NULL) - return true; - if (!instr_is_gather(first_app) && !instr_is_scatter(first_app)) + instr_t *sg_instr = NULL; + if (!scatter_gather_split_bb(drcontext, bb, &sg_instr)) { + /* bb did not begin with a scatter/gather instruction. If there were any + * scatter/gather instructions that were not at the beginning, they have been + * split out of the bb and we will be called again later to handle them. + */ return true; + } + DR_ASSERT(sg_instr != NULL); /* We want to avoid spill slot conflicts with later instrumentation passes. */ drreg_status_t res_bb_props = drreg_set_bb_properties(drcontext, DRREG_HANDLE_MULTI_PHASE_SLOT_RESERVATIONS); DR_ASSERT(res_bb_props == DRREG_SUCCESS); - dr_atomic_store32(&drx_scatter_gather_expanded, 1); + /* Tell drx_event_restore_state() that an expansion has occurred. */ + drx_mark_scatter_gather_expanded(); - instr_t *sg_instr = first_app; scatter_gather_info_t sg_info; bool res = false; /* XXX: we may want to make this function public, as it may be useful to clients. */ @@ -1083,7 +1017,8 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, OUT bool *expanded) get_mov_scratch_mm_opcode_and_size(&mov_scratch_mm_opcode, &mov_scratch_mm_opnd_sz); scratch_mm = reg_resize_to_opsz(scratch_xmm, mov_scratch_mm_opnd_sz); - drmgr_insert_read_tls_field(drcontext, tls_idx, bb, sg_instr, scratch_reg0); + drmgr_insert_read_tls_field(drcontext, drx_scatter_gather_tls_idx, bb, sg_instr, + scratch_reg0); instrlist_meta_preinsert( bb, sg_instr, INSTR_CREATE_mov_ld( @@ -1221,7 +1156,8 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, OUT bool *expanded) orig_app_pc)); } /* Restore the scratch xmm. 
*/ - drmgr_insert_read_tls_field(drcontext, tls_idx, bb, sg_instr, scratch_reg0); + drmgr_insert_read_tls_field(drcontext, drx_scatter_gather_tls_idx, bb, sg_instr, + scratch_reg0); instrlist_meta_preinsert( bb, sg_instr, INSTR_CREATE_mov_ld( @@ -1269,6 +1205,7 @@ instrlist_remove(bb, sg_instr); #if VERBOSE dr_fprintf(STDERR, "\twas expanded to the following sequence:\n"); + instr_t *instr; for (instr = instrlist_first(bb); instr != NULL; instr = instr_get_next(instr)) { dr_print_instr(drcontext, STDERR, instr, ""); } @@ -2397,43 +2334,20 @@ drx_restore_state_for_avx2_gather(drcontext, info, drx_avx2_gather_sequence_state_machine); } -static bool -drx_event_restore_state(void *drcontext, bool restore_memory, - dr_restore_state_info_t *info) +bool +drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, + instr_t *sg_inst) { - instr_t inst; - bool success = true; - if (info->fragment_info.cache_start_pc == NULL) - return true; /* fault not in cache */ - if (dr_atomic_load32(&drx_scatter_gather_expanded) == 0) { - /* Nothing to do if nobody had never called expand_scatter_gather() before. */ - return true; - } - if (!info->fragment_info.app_code_consistent) { - /* Can't verify application code. - * XXX i#2985: is it better to keep searching? - */ - return true; - } - instr_init(drcontext, &inst); - byte *pc = decode(drcontext, dr_fragment_app_pc(info->fragment_info.tag), &inst); - if (pc != NULL) { - scatter_gather_info_t sg_info; - if (instr_is_gather(&inst)) { - get_scatter_gather_info(&inst, &sg_info); - if (sg_info.is_evex) { - success = success && - drx_restore_state_for_avx512_gather(drcontext, info, &sg_info); - } else { - success = success && - drx_restore_state_for_avx2_gather(drcontext, info, &sg_info); - } - } else if (instr_is_scatter(&inst)) { - get_scatter_gather_info(&inst, &sg_info); - success = success && - drx_restore_state_for_avx512_scatter(drcontext, info, &sg_info); + scatter_gather_info_t sg_info; + get_scatter_gather_info(sg_inst, &sg_info); + + if (sg_info.is_load) { + if (sg_info.is_evex) { + return drx_restore_state_for_avx512_gather(drcontext, info, &sg_info); + } else { + return drx_restore_state_for_avx2_gather(drcontext, info, &sg_info); } + } else { + return drx_restore_state_for_avx512_scatter(drcontext, info, &sg_info); } - instr_free(drcontext, &inst); - return success; } diff --git a/make/utils.cmake b/make/utils.cmake index 981ddacda41..e82c683169e 100644 --- a/make/utils.cmake +++ b/make/utils.cmake @@ -313,3 +313,31 @@ if (UNIX) endfunction (set_preferred_base_start_and_end) endif (UNIX) + +function (check_sve_processor_and_compiler_support out) + include(CheckCSourceRuns) + set(sve_prog "#include <stdint.h> + int main() { + uint64_t vl = 0; + asm(\"rdvl %[dest], 1\" : [dest] \"=r\" (vl) : :); + (void) vl; + return 0; + }") + set(CMAKE_REQUIRED_FLAGS ${CFLAGS_SVE}) + if (CMAKE_CROSSCOMPILING) + # If we are cross-compiling, check_c_source_runs() can't run the executable on the + # host to find out whether the target processor supports SVE, so we assume it + # doesn't. + set(proc_found_sve_EXITCODE 1 CACHE STRING + "Set to 0 if target processor/emulator supports SVE to enable SVE tests" + FORCE) + endif () + check_c_source_runs("${sve_prog}" proc_found_sve) + if (proc_found_sve) + message(STATUS "Compiler and processor support SVE.") + else () + message(STATUS "WARNING: Compiler or processor does not support SVE.
" + "Skipping tests") + endif () + set(${out} ${proc_found_sve} PARENT_SCOPE) +endfunction (check_sve_processor_and_compiler_support) diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index cf744ee32d2..f88e8052e14 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -1,7 +1,7 @@ # ********************************************************** # Copyright (c) 2010-2023 Google, Inc. All rights reserved. # Copyright (c) 2009-2010 VMware, Inc. All rights reserved. -# Copyright (c) 2016-2022 ARM Limited. All rights reserved. +# Copyright (c) 2016-2023 ARM Limited. All rights reserved. # ********************************************************** # Redistribution and use in source and binary forms, with or without @@ -630,11 +630,14 @@ endfunction (setup_test_client_dll_basics) # and calls this one w/ "" "" "" b/c the PARENT_SCOPE won't make # it all the way back then: would need to chain. function(tobuild_ci test source client_ops dr_ops exe_ops) - string(REGEX REPLACE "\\.(c|cpp)$" ".dll.\\1" client_source "${source}") - string(REGEX REPLACE "\\.runall$" ".dll.c" client_source "${client_source}") - if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${client_source}") - string(REGEX REPLACE "\\.c$" ".cpp" client_source "${client_source}") - endif () + if (NOT DEFINED ${test}_client_source) + string(REGEX REPLACE "\\.(c|cpp)$" ".dll.\\1" client_source "${source}") + string(REGEX REPLACE "\\.runall$" ".dll.c" client_source "${client_source}") + if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${client_source}") + string(REGEX REPLACE "\\.c$" ".cpp" client_source "${client_source}") + endif () + set(${test}_client_source ${client_source}) + endif() if (NOT "${source}" MATCHES "\\.runall$") # Many client tests don't care about the app that's run. To save time @@ -661,7 +664,7 @@ function(tobuild_ci test source client_ops dr_ops exe_ops) endif () endif () - add_library(${test}.dll SHARED ${client_source}) + add_library(${test}.dll SHARED ${${test}_client_source}) if (NOT DEFINED ${test}_no_reg_compat AND NOT "${source}" MATCHES "cpp\\.cpp") # to avoid changing all the REG_ constants we ask for compatibility @@ -1044,6 +1047,8 @@ function(template2expect outexpect template runops key) set(rundefs "${rundefs} -D__AVX__ -D__AVX512F__") elseif (DEFINED ${key}_runavx) set(rundefs "${rundefs} -D__AVX__") + elseif (DEFINED ${key}_runsve) + set(rundefs "${rundefs} -D__ARM_FEATURE_SVE") endif () if (DEFINED ${key}_test_sample_client) @@ -1716,6 +1721,15 @@ macro(set_avx_flags target) endif () endmacro(set_avx_flags) +macro(set_sve_flags target) + if (proc_supports_sve) + if (TARGET ${target}) # Support calling on non-exe target. + append_property_string(TARGET ${target} COMPILE_FLAGS "${CFLAGS_SVE}") + endif () + set(${target}_runsve 1) + endif () +endmacro(set_sve_flags) + ########################################################################### # We'll want the latest CTest (2.6.4) so we can use the -W parameter @@ -2819,31 +2833,43 @@ if (NOT RISCV64) # TODO i#3544: Port tests to RISC-V 64 use_DynamoRIO_extension(client.tls.dll drmgr) endif (NOT RISCV64) -if (X86) +if (X86 OR AARCH64) if (NOT MACOS) # XXX i#2985: The test's asm doesn't build with MacOS's clang. 
- tobuild_ci(client.drx-scattergather client-interface/drx-scattergather.c "" "" "") + set(client.drx-scattergather_client_source "client-interface/drx-scattergather.dll.c") + if (X86) + tobuild_ci(client.drx-scattergather "client-interface/drx-scattergather-x86.c" + "" "" "") + set_avx_flags(client.drx-scattergather) + elseif (AARCH64) + tobuild_ci(client.drx-scattergather "client-interface/drx-scattergather-aarch64.cpp" + "" "" "") + set_sve_flags(client.drx-scattergather) + endif() target_include_directories(client.drx-scattergather PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/client-interface) use_DynamoRIO_extension(client.drx-scattergather.dll drmgr) use_DynamoRIO_extension(client.drx-scattergather.dll drx) use_DynamoRIO_extension(client.drx-scattergather.dll drreg) - set_avx_flags(client.drx-scattergather) # Our scattergather drbbdup test uses the same app as the base test. set(client.drx-scattergather-bbdup_realtest client.drx-scattergather) - set(client.drx-scattergather-bbdup_expectbase "drx-scattergather") + set(client.drx-scattergather-bbdup_expectbase "drx-scattergather-${ARCH_NAME}") tobuild_ci(client.drx-scattergather-bbdup client-interface/drx-scattergather-bbdup.c "" "" "") + if (X86) + set_avx_flags(client.drx-scattergather-bbdup) + elseif (AARCH64) + set_sve_flags(client.drx-scattergather-bbdup) + endif() target_include_directories(client.drx-scattergather PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/client-interface) use_DynamoRIO_extension(client.drx-scattergather-bbdup.dll drmgr) use_DynamoRIO_extension(client.drx-scattergather-bbdup.dll drx) use_DynamoRIO_extension(client.drx-scattergather-bbdup.dll drreg) use_DynamoRIO_extension(client.drx-scattergather-bbdup.dll drbbdup) - set_avx_flags(client.drx-scattergather-bbdup) - if (X64 AND UNIX) + if (X86 AND X64 AND UNIX) if (proc_supports_avx) add_exe(allasm_scattergather ${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/allasm_scattergather.asm @@ -2855,7 +2881,7 @@ if (X86) ${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/allasm_repstr.asm "-early_inject" "") append_pure_asm_app_link_flags(allasm_repstr) - endif (X64 AND UNIX) + endif (X86 AND X64 AND UNIX) endif() endif () @@ -3344,14 +3370,21 @@ if (BUILD_SAMPLES) client-interface/memval-test.c "" "" "") torunonly_ci(sample.${sample}_scattergather client.drx-scattergather ${sample} - client-interface/drx-scattergather.c "" "" "") + client-interface/drx-scattergather-x86.c "" "" "") set(sample.${sample}_scattergather_test_sample_client 1) if (proc_supports_avx512) set(sample.${sample}_scattergather_runavx512 1) elseif (proc_supports_avx) set(sample.${sample}_scattergather_runavx 1) endif () - endif (X86 AND UNIX) + elseif (AARCH64) + torunonly_ci(sample.${sample}_scattergather client.drx-scattergather ${sample} + client-interface/drx-scattergather-aarch64.cpp "" "" "") + set(sample.${sample}_scattergather_test_sample_client 1) + if (proc_supports_sve) + set(sample.${sample}_scattergather_runsve 1) + endif () + endif () elseif (sample STREQUAL "opcode_count") # We do not do a simple sanity test for opcode_count, but a normal test # that checks the sample's output. 
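The sample-client scatter/gather tests above are only registered when the proc_supports_sve configure-time probe succeeded. The equivalent runtime check is a prctl(2) query, the same call the new test's get_vl_bytes() helper makes below; a sketch of that probe (Linux-only):

#include <cstdio>
#include <sys/prctl.h>

int
main()
{
    /* PR_SVE_GET_VL fails with EINVAL on kernels or cores without SVE. */
    const int ret = prctl(PR_SVE_GET_VL);
    if (ret < 0) {
        std::puts("SVE unavailable");
        return 1;
    }
    std::printf("SVE VL: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
    return 0;
}
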
@@ -3667,14 +3700,16 @@ if (BUILD_CLIENTS) set(tool.drcachesim.invariants_timeout 180) endif () - if (NOT MACOS AND X86) - torunonly_drcachesim(scattergather client.drx-scattergather + if ((NOT MACOS) AND (X86 OR AARCH64)) + torunonly_drcachesim(scattergather-${ARCH_NAME} client.drx-scattergather "-simulator_type basic_counts" "") - unset(tool.drcachesim.scattergather_rawtemp) # use preprocessor + unset(tool.drcachesim.scattergather-${ARCH_NAME}_rawtemp) # use preprocessor if (proc_supports_avx512) - set(tool.drcachesim.scattergather_runavx512 1) + set(tool.drcachesim.scattergather-${ARCH_NAME}_runavx512 1) elseif (proc_supports_avx) - set(tool.drcachesim.scattergather_runavx 1) + set(tool.drcachesim.scattergather-${ARCH_NAME}_runavx 1) + elseif (proc_supports_sve) + set(tool.drcachesim.scattergather-${ARCH_NAME}_runsve 1) endif () endif () diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.cpp b/suite/tests/client-interface/drx-scattergather-aarch64.cpp new file mode 100644 index 00000000000..e8ba28afbc3 --- /dev/null +++ b/suite/tests/client-interface/drx-scattergather-aarch64.cpp @@ -0,0 +1,1161 @@ +/* ********************************************************** + * Copyright (c) 2023 Arm Limited. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Arm Limited nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL ARM LIMITED OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tools.h" + +namespace { + +/* + * Tests are specified assuming 128-bit vectors. If we run on hardware with a + * higher VL then vector values are made up to the correct size by duplicating + * the first 128-bits. 
+ */ +constexpr size_t TEST_VL_BYTES = 16; + +constexpr size_t NUM_Z_REGS = 32; +constexpr size_t NUM_P_REGS = 16; + +using vector_reg_value128_t = std::array<uint8_t, TEST_VL_BYTES>; +using predicate_reg_value128_t = uint16_t; + +constexpr vector_reg_value128_t UNINITIALIZED_VECTOR { + 0xAD, 0xDE, 0xAD, 0xDE, 0xAD, 0xDE, 0xAD, 0xDE, + 0xAD, 0xDE, 0xAD, 0xDE, 0xAD, 0xDE, 0xAD, 0xDE, +}; +constexpr predicate_reg_value128_t UNINITIALIZED_PREDICATE = 0xDEAD; + +enum class element_size_t { + BYTE = 1, + HALF = 2, + SINGLE = 4, + DOUBLE = 8, +}; + +/* Exhaustive lists of valid 128-bit vl predicate register values for single or double + * sized elements. + */ +const std::map<element_size_t, std::vector<predicate_reg_value128_t>> ALL_PREDICATES { + { element_size_t::SINGLE, + { 0x0000, 0x0001, 0x0010, 0x0011, 0x0100, 0x0101, 0x0110, 0x0111, 0x1000, 0x1001, + 0x1010, 0x1011, 0x1100, 0x1101, 0x1110, 0x1111 } }, + { element_size_t::DOUBLE, { 0x0000, 0x0001, 0x0100, 0x0101 } }, +}; + +enum test_result_t { + FAIL, + PASS, +}; + +bool +element_is_active(size_t element, predicate_reg_value128_t mask, + element_size_t element_size) +{ + const auto element_size_bytes = static_cast<size_t>(element_size); + const auto element_flag = 1 << (element_size_bytes * element); + return TESTALL(element_flag, mask); +} + +/* + * Create a copy of the data vector with all the elements that are inactive in the mask + * set to 0. + */ +vector_reg_value128_t +apply_predicate_mask(vector_reg_value128_t data, predicate_reg_value128_t mask, + element_size_t element_size) +{ + const auto element_size_bytes = static_cast<size_t>(element_size); + const auto num_elements = data.size() / element_size_bytes; + for (size_t i = 0; i < num_elements; i++) { + if (!element_is_active(i, mask, element_size)) { + // Element is inactive, set it to 0. + memset(&data[element_size_bytes * i], 0, element_size_bytes); + } + } + + return data; +} + +size_t +get_vl_bytes() +{ + static const auto vl_bytes = []() { + const int returned_value = prctl(PR_SVE_GET_VL); + if (returned_value < 0) { + perror("prctl(PR_SVE_GET_VL) failed"); + exit(1); + } + + return static_cast<size_t>(returned_value & PR_SVE_VL_LEN_MASK); + }(); + return vl_bytes; +} + +struct scalable_reg_value_t { + const uint8_t *data; + size_t size; + + bool + operator==(const scalable_reg_value_t &other) const + { + return (other.size == size) && (memcmp(data, other.data, size) == 0); + } + + bool + operator!=(const scalable_reg_value_t &other) const + { + return !(*this == other); + } +}; + +void +print_vector(const scalable_reg_value_t &value) +{ + print("0x"); + for (size_t i = 0; i < value.size; i++) { + print("%02x", value.data[i]); + } +} + +/* + * Print a predicate register value as a binary number. Each bit is printed with a space + * in between so that the bit will line up vertically with the corresponding byte of a + * vector register printed on an adjacent line. 
+ * vec: 0x12345678 + * pred: 0b 0 1 0 1 + */ +void +print_predicate(const scalable_reg_value_t &value) +{ + print("0b"); + for (size_t byte_i = 0; byte_i < value.size; byte_i++) { + for (unsigned bit = 0; bit < 8; bit++) { + if (TESTALL(1 << bit, value.data[byte_i])) + print(" 1"); + else + print(" 0"); + } + } +} + +struct sve_register_file_t { + std::vector z; + std::vector p; + + sve_register_file_t() + { + const auto vl_bytes = get_vl_bytes(); + const auto pl_bytes = vl_bytes / 8; + z.resize(NUM_Z_REGS * vl_bytes); + p.resize(NUM_P_REGS * pl_bytes); + } + + scalable_reg_value_t + get_z_register_value(size_t reg_num) const + { + assert(reg_num < NUM_Z_REGS); + const auto vl_bytes = get_vl_bytes(); + return { &z[vl_bytes * reg_num], vl_bytes }; + } + + void + set_z_register_value(size_t reg_num, vector_reg_value128_t value) + { + const auto vl_bytes = get_vl_bytes(); + const auto reg_offset = vl_bytes * reg_num; + for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) { + const auto slice_offset = reg_offset + (TEST_VL_BYTES * i); + memcpy(&z[slice_offset], value.data(), TEST_VL_BYTES); + } + } + + scalable_reg_value_t + get_p_register_value(size_t reg_num) const + { + assert(reg_num < NUM_P_REGS); + const auto pl_bytes = get_vl_bytes() / 8; + return { &p[pl_bytes * reg_num], pl_bytes }; + } + + void + set_p_register_value(size_t reg_num, predicate_reg_value128_t value) + { + const auto pl_bytes = get_vl_bytes() / 8; + const auto reg_offset = pl_bytes * reg_num; + for (size_t i = 0; i < pl_bytes / sizeof(value); i++) { + const auto slice_offset = reg_offset + (sizeof(value) * i); + memcpy(&p[slice_offset], &value, sizeof(value)); + } + } +}; + +struct test_register_data_t { + sve_register_file_t before; // Values the registers will be set to before the test. + sve_register_file_t after; // Values of the registers after the test instruction. +}; + +struct scalar_plus_vector_test_case_t { + std::string name; // Unique name for this test printed when the test is run. + + struct test_ptrs_t { + const void *base; // Base address used for the test instruction. + const void *z_restore_base; // Base address for initializing Z registers. + const void *p_restore_base; // Base address for initializing P registers. 
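The set_z_register_value() and set_p_register_value() helpers above tile the 128-bit test pattern across the real vector length, which is what lets tests written against VL=128 run unchanged on wider hardware. The tiling in isolation (a sketch with a hypothetical helper name, relying on the architectural guarantee that the VL is a multiple of 128 bits):

#include <array>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t>
tile_test_pattern(const std::array<uint8_t, 16> &pattern, size_t vl_bytes)
{
    std::vector<uint8_t> reg(vl_bytes);
    /* Duplicate the 128-bit pattern until the register image is full. */
    for (size_t i = 0; i < vl_bytes / pattern.size(); i++)
        std::memcpy(&reg[pattern.size() * i], pattern.data(), pattern.size());
    return reg;
}
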
+ void *z_save_base; // Base address to save Z registers to after test + // instruction + void *p_save_base; // Base address to save P registers to after test + // instruction + }; + + using test_func_t = std::function; + test_func_t run_test; + + vector_reg_value128_t reference_data; + vector_reg_value128_t offset_data; + element_size_t element_size; + const void *input_data; + + struct registers_used_t { + unsigned dest_z; + unsigned governing_p; + unsigned index_z; + }; + registers_used_t registers_used; + + template + scalar_plus_vector_test_case_t( + std::string name_, test_func_t func_, registers_used_t registers_used_, + std::array reference_data_, + std::array offsets, + const void *input_data_) + : name(std::move(name_)) + , run_test(std::move(func_)) + , registers_used(registers_used_) + , element_size(static_cast(sizeof(ELEMENT_T))) + , input_data(input_data_) + { + std::memcpy(reference_data.data(), reference_data_.data(), reference_data.size()); + std::memcpy(offset_data.data(), offsets.data(), offset_data.size()); + } + + test_result_t + run_test_case() const + { + test_result_t status = PASS; + const auto test_failed = [&status]() { + if (status == PASS) { + status = FAIL; + print("FAIL\n"); + } + }; + print("%s: ", name.c_str()); + + test_register_data_t register_data; + for (size_t i = 0; i < NUM_Z_REGS; i++) { + register_data.before.set_z_register_value(i, UNINITIALIZED_VECTOR); + } + register_data.before.set_z_register_value(registers_used.index_z, offset_data); + for (size_t i = 0; i < NUM_P_REGS; i++) { + register_data.before.set_p_register_value(i, UNINITIALIZED_PREDICATE); + } + + test_ptrs_t ptrs { + input_data, + register_data.before.z.data(), + register_data.before.p.data(), + register_data.after.z.data(), + register_data.after.p.data(), + }; + const size_t num_elements = + offset_data.size() / static_cast(element_size); + + const auto vl_bytes = get_vl_bytes(); + std::vector expected_output_data; + expected_output_data.resize(vl_bytes); + + const auto &predicates = ALL_PREDICATES.at(element_size); + for (const auto &pred : predicates) { + /* TODO i#5036: Test faulting behavior. */ + + const auto expected_output128 = + apply_predicate_mask(reference_data, pred, element_size); + for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) { + memcpy(&expected_output_data[TEST_VL_BYTES * i], + expected_output128.data(), TEST_VL_BYTES); + } + const scalable_reg_value_t expected_output { + expected_output_data.data(), + vl_bytes, + }; + + register_data.before.set_p_register_value(registers_used.governing_p, pred); + + run_test(ptrs); + + const auto output_value = + register_data.after.get_z_register_value(registers_used.dest_z); + + if (output_value != expected_output) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + + // Check that the values of the other Z registers have been preserved. 
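run_test_case() walks every predicate in ALL_PREDICATES and computes the expected destination by zeroing the inactive elements of the reference data. In this encoding the predicate carries one flag bit per vector byte, and an element is active when the bit for its lowest-addressed byte is set; a worked check for 32-bit elements (a sketch restating element_is_active() for that one case):

#include <cassert>
#include <cstdint>

bool
element_is_active_u32(size_t element, uint16_t mask)
{
    /* 4-byte elements: the active bits live at positions 0, 4, 8, 12. */
    return (mask >> (4 * element)) & 1;
}

int
main()
{
    const uint16_t mask = 0x0110; /* Elements 1 and 2 active. */
    assert(!element_is_active_u32(0, mask));
    assert(element_is_active_u32(1, mask));
    assert(element_is_active_u32(2, mask));
    assert(!element_is_active_u32(3, mask));
    return 0;
}
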
+ for (size_t i = 0; i < NUM_Z_REGS; i++) { + if (i == registers_used.dest_z) + continue; + const auto before = register_data.before.get_z_register_value(i); + const auto after = register_data.after.get_z_register_value(i); + if (before != after) { + test_failed(); + print("z%u has been corrupted:\n", i); + print("before: "); + print_vector(before); + print("\nafter: "); + print_vector(after); + print("\n"); + } + } + // Check that the values of the P registers have been preserved. + for (size_t i = 0; i < NUM_P_REGS; i++) { + const auto before = register_data.before.get_p_register_value(i); + const auto after = register_data.after.get_p_register_value(i); + if (before != after) { + test_failed(); + print("p%u has been corrupted:\n", i); + print("before: "); + print_predicate(before); + print("\nafter: "); + print_predicate(after); + print("\n"); + } + } + } + if (status == PASS) + print("PASS\n"); + + return status; +#undef TEST_FAILED + } +}; + +test_result_t +run_tests(const std::vector &tests) +{ + test_result_t overall_status = PASS; + + for (const auto &instr_test : tests) { + if (instr_test.run_test_case() == FAIL) { + overall_status = FAIL; + } + } + + return overall_status; +} + +class input_data_t { +public: + const size_t DATA_SIZE = 9 * 4096; // 9 4KiB pages + input_data_t() + : data(mmap(nullptr, DATA_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) + { + /* + * We set up nine pages of memory to use as input data for load instruction + * tests. The first page contains input data of different sizes and the rest + * of the pages are set up to fault when they are accessed. The tests use + * one of the first 4 regions as their base pointer, so when we want to + * force an instruction to fault (to test faulting behaviour) we can set the + * offset to 4096 which will always land in one of the faulting pages. + * +=====================================================+ + * | Page | Byte off | Region off | | + * +=====================================================+ + * | page 0 | 0 | 0 | 8-bit input data | + * | |----------+------------+--------------------+ + * | | 1024 | 1 | 16-bit input data | + * | |----------+------------+--------------------+ + * | | 2048 | 2 | 32-bit input data | + * | |----------+------------+--------------------+ + * | | 3072 | 3 | 64-bit input data | + * +--------+----------+------------+--------------------+ + * | page 1 | 4096 | 4 | All accesses fault | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * +~~~~~~~~+~~~~~~~~~~+~~~~~~~~~~~~+~~~~~~~~~~~~~~~~~~~~+ + * | pg 2-7 | ... | ... | All accesses fault | + * +~~~~~~~~+~~~~~~~~~~+~~~~~~~~~~~~+~~~~~~~~~~~~~~~~~~~~+ + * | page 8 | 32768 | 32 | All accesses fault | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * | | | | | + * +--------+----------+------------+--------------------+ + * + */ + + // Write 8, 16, 32, and 64-bit input data to the first page. + // Each region contains 40 values. We set the base address to the 9th + // element in the array so we can use 8 negative offsets and 32 positive + // offsets. 
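The constructor backs the input data with one accessible page followed by eight inaccessible ones, so any offset of 4096 or more from a region base faults deterministically; the planned faulting tests depend on that layout. The guard-page idiom in isolation (a sketch using mprotect(2) rather than the MAP_FIXED re-mmap used here; assumes the test's 4 KiB page size):

#include <cstdint>
#include <sys/mman.h>

int
main()
{
    constexpr size_t PAGE = 4096;
    void *data = mmap(nullptr, 2 * PAGE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (data == MAP_FAILED)
        return 1;
    /* Revoke all access to the second page: any load landing there faults. */
    mprotect(static_cast<uint8_t *>(data) + PAGE, PAGE, PROT_NONE);
    /* ... use the first page as input data ... */
    munmap(data, 2 * PAGE);
    return 0;
}
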
+ write_input_data(0, + std::array { + 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, 0x01, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30, 0xff, + }); + write_input_data( + 1, std::array { 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, + 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, + 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, + 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x0030, 0xffff }); + write_input_data(2, + std::array { + 0xfffffff8, 0xfffffff7, 0xfffffff6, 0xfffffff5, 0xfffffff4, + 0xfffffff3, 0xfffffff2, 0xfffffff1, 0x00000000, 0x00000001, + 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000006, + 0x00000007, 0x00000008, 0x00000009, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015, 0x00000016, + 0x00000017, 0x00000018, 0x00000019, 0x00000020, 0x00000021, + 0x00000022, 0x00000023, 0x00000024, 0x00000025, 0x00000026, + 0x00000027, 0x00000028, 0x00000029, 0x00000030, 0xffffffff, + }); + write_input_data(3, + std::array { + 0xfffffffffffffff8, 0xfffffffffffffff7, 0xfffffffffffffff6, + 0xfffffffffffffff5, 0xfffffffffffffff4, 0xfffffffffffffff3, + 0xfffffffffffffff2, 0xfffffffffffffff1, 0x0000000000000000, + 0x0000000000000001, 0x0000000000000002, 0x0000000000000003, + 0x0000000000000004, 0x0000000000000005, 0x0000000000000006, + 0x0000000000000007, 0x0000000000000008, 0x0000000000000009, + 0x0000000000000010, 0x0000000000000011, 0x0000000000000012, + 0x0000000000000013, 0x0000000000000014, 0x0000000000000015, + 0x0000000000000016, 0x0000000000000017, 0x0000000000000018, + 0x0000000000000019, 0x0000000000000020, 0x0000000000000021, + 0x0000000000000022, 0x0000000000000023, 0x0000000000000024, + 0x0000000000000025, 0x0000000000000026, 0x0000000000000027, + 0x0000000000000028, 0x0000000000000029, 0x0000000000000030, + 0xffffffffffffffff, + }); + // Change the permissions of the second page so that any accesses to it will + // fault. + mmap(region_start_addr(4), 8 * 4096, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + } + + ~input_data_t() + { + munmap(data, DATA_SIZE); + } + + const void * + region_start_addr(size_t offset) const + { + return &static_cast(data)[1024 * offset]; + } + + void * + region_start_addr(size_t offset) + { + return &static_cast(data)[1024 * offset]; + } + + const void * + base_addr_for_data_size(element_size_t element_size) const + { + return &static_cast(data)[base_offset_for_data_size(element_size)]; + } + + void * + base_addr_for_data_size(element_size_t element_size) + { + return &static_cast(data)[base_offset_for_data_size(element_size)]; + } + +private: + static size_t + base_offset_for_data_size(element_size_t element_size) + { + size_t offset = 0; + switch (element_size) { + case element_size_t::BYTE: offset = 0; break; + case element_size_t::HALF: offset = 1; break; + case element_size_t::SINGLE: offset = 2; break; + case element_size_t::DOUBLE: offset = 3; break; + } + // The base address is set to the 8th element in the region. 
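base_offset_for_data_size() below biases the base pointer by eight elements into the selected 1 KiB region, so signed test offsets from -8 through 31 stay inside the 40-element arrays written above. The arithmetic for 32-bit data, worked through (a sketch; the helper name is illustrative):

#include <cstddef>
#include <cstdint>

/* Region 2 holds the 32-bit input data; biasing the base by 8 elements
 * allows offsets down to -8 while offsets up to 31 remain in range. */
constexpr size_t
base_offset_for_u32()
{
    return (1024 * 2) + (sizeof(uint32_t) * 8); /* 2048 + 32 = 2080 */
}
static_assert(base_offset_for_u32() == 2080, "base is 8 elements into region 2");
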
+ return (1024 * offset) + (static_cast(element_size) * 8); + } + + template + void + write_input_data(size_t offset, const std::array &input_data) + { + memcpy(region_start_addr(offset), input_data.data(), + input_data.size() * sizeof(T)); + } + + void *data; +}; + +#if defined(__ARM_FEATURE_SVE) +/* + * Expands to a string literal containing assembly code that can be included in + * an asm {} statement using string literal concatenation. + * + * For example SAVE_OR_RESTORE_SINGLE_REGISTER(ldr, z, 5, mem_base_ptr) + * produces: "ldr z5, [%mem_base_ptr], #5, mul vl]\n" The empty string at the + * beginning "" is necessary to stop clang-format treating #op as a preprocessor + * directive. + */ +# define SAVE_OR_RESTORE_SINGLE_REGISTER(op, reg_type, num, base) \ + "" #op " " #reg_type #num ", [%[" #base "], #" #num ", mul vl]\n" + +# define SAVE_OR_RESTORE_Z_REGISTERS(op, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 0, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 1, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 2, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 3, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 4, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 5, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 6, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 7, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 8, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 9, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 10, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 11, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 12, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 13, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 14, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 15, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 16, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 17, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 18, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 19, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 20, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 21, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 22, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 23, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 24, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 25, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 26, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 27, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 28, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 29, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 30, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, z, 31, base) + +# define SAVE_OR_RESTORE_P_REGISTERS(op, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 0, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 1, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 2, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 3, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 4, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 5, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 6, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 7, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 8, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 9, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 10, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 11, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 12, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 13, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 14, base) \ + SAVE_OR_RESTORE_SINGLE_REGISTER(op, p, 15, base) + +# define SAVE_Z_REGISTERS(base) SAVE_OR_RESTORE_Z_REGISTERS(str, base) +# define 
RESTORE_Z_REGISTERS(base) SAVE_OR_RESTORE_Z_REGISTERS(ldr, base) + +# define SAVE_P_REGISTERS(base) SAVE_OR_RESTORE_P_REGISTERS(str, base) +# define RESTORE_P_REGISTERS(base) SAVE_OR_RESTORE_P_REGISTERS(ldr, base) + +// Handy short hand to list all Z registers in an asm {} statment clobber list. +# define ALL_Z_REGS \ + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", \ + "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", \ + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + +// Handy short hand to list all P registers in an asm {} statment clobber list. +# define ALL_P_REGS \ + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", \ + "p13", "p14", "p15" + +test_result_t +test_ld1_scalar_plus_vector() +{ +# define TEST_FUNC(ld_instruction) \ + [](scalar_plus_vector_test_case_t::test_ptrs_t &ptrs) { \ + asm(/* clang-format off */ \ + RESTORE_Z_REGISTERS(z_restore_base) \ + RESTORE_P_REGISTERS(p_restore_base) \ + ld_instruction "\n" \ + SAVE_Z_REGISTERS(z_save_base) \ + SAVE_P_REGISTERS(p_save_base) /* clang-format on */ \ + : \ + : [base] "r"(ptrs.base), [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS, "memory"); \ + } + + input_data_t input_data; + return run_tests({ + /* { + * Test name, + * Function that executes the test instruction, + * Registers used {zt, pg, zm}, + * Expected output data, + * Offset data (value for zm), + * Base pointer (value for Xn), + * }, + */ + // LD1B instructions. + { + "ld1b 32bit unscaled offset uxtw", + TEST_FUNC("ld1b z0.s, p7/z, [%[base], z31.s, uxtw]"), + { /*zt=*/0, /*pg=*/7, /*zm=*/31 }, + std::array { 0x00, 0x01, 0x07, 0x10 }, + std::array { 0, 1, 7, 10 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1b 32bit unscaled offset sxtw", + TEST_FUNC("ld1b z1.s, p6/z, [%[base], z30.s, sxtw]"), + { /*zt=*/1, /*pg=*/6, /*zm=*/30 }, + std::array { 0x00, 0xF1, 0x18, 0x27 }, + std::array { 0, -1, 18, 27 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1b 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1b z2.d, p5/z, [%[base], z29.d, uxtw]"), + { /*zt=*/2, /*pg=*/5, /*zm=*/29 }, + std::array { 0x01, 0x22 }, + std::array { 1, 22 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1b 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1b z3.d, p4/z, [%[base], z28.d, sxtw]"), + { /*zt=*/3, /*pg=*/4, /*zm=*/28 }, + std::array { 0xF2, 0x29 }, + std::array { -2, 29 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1b 64bit unscaled offset", + TEST_FUNC("ld1b z4.d, p3/z, [%[base], z27.d]"), + { /*zt=*/4, /*pg=*/3, /*zm=*/27 }, + std::array { 0x09, 0x28 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1b 64bit unscaled offset Zt==Zm", + TEST_FUNC("ld1b z30.d, p3/z, [%[base], z30.d]"), + { /*zt=*/30, /*pg=*/3, /*zm=*/30 }, + std::array { 0x09, 0x28 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1SB instructions. 
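Each TEST_FUNC above brackets exactly one gather instruction between full register restore and save sequences, so the only architectural effect measured is that instruction's. For the LD1B cases the semantics under test are: for each active element, load one byte from base plus the (zero- or sign-extended) per-element offset and zero-extend it into the element; inactive elements become zero under /z predication. A scalar model of that behaviour (a sketch for 32-bit elements with uxtw offsets):

#include <array>
#include <cstddef>
#include <cstdint>

/* Model ld1b zt.s, pg/z, [base, zm.s, uxtw] for a 128-bit vector. */
std::array<uint32_t, 4>
ld1b_32_uxtw(const uint8_t *base, const std::array<uint32_t, 4> &offsets,
             const std::array<bool, 4> &pred)
{
    std::array<uint32_t, 4> dst {};
    for (size_t i = 0; i < dst.size(); i++) {
        /* Zeroing predication: inactive elements are set to 0. */
        dst[i] = pred[i] ? base[offsets[i]] : 0;
    }
    return dst;
}
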
+ { + "ld1sb 32bit unscaled offset uxtw", + TEST_FUNC("ld1sb z5.s, p2/z, [%[base], z26.s, uxtw]"), + { /*zt=*/5, /*pg=*/2, /*zm=*/26 }, + std::array { 0x00, -1, 0x23, 0x30 }, + std::array { 0, 31, 23, 30 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sb 32bit unscaled offset sxtw", + TEST_FUNC("ld1sb z6.s, p1/z, [%[base], z25.s, sxtw]"), + { /*zt=*/6, /*pg=*/1, /*zm=*/25 }, + std::array { 0x01, -15, 0x11, 0x24 }, + std::array { 1, -1, 11, 24 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sb 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1sb z7.d, p0/z, [%[base], z24.d, uxtw]"), + { /*zt=*/7, /*pg=*/0, /*zm=*/24 }, + std::array { 0x01, -1 }, + std::array { 1, 31 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sb 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1sb z8.d, p1/z, [%[base], z23.d, sxtw]"), + { /*zt=*/8, /*pg=*/1, /*zm=*/23 }, + std::array { -14, 0x29 }, + std::array { -2, 29 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sb 64bit unscaled offset", + TEST_FUNC("ld1sb z9.d, p2/z, [%[base], z22.d]"), + { /*zt=*/9, /*pg=*/2, /*zm=*/22 }, + std::array { -1, 0x09 }, + std::array { 31, 9 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sb 64bit unscaled offset", + TEST_FUNC("ld1sb z17.d, p7/z, [%[base], z17.d]"), + { /*zt=*/17, /*pg=*/7, /*zm=*/17 }, + std::array { -1, 0x09 }, + std::array { 31, 9 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1H instructions. + { + "ld1h 32bit scaled offset uxtw", + TEST_FUNC("ld1h z10.s, p3/z, [%[base], z21.s, uxtw #1]"), + { /*zt=*/10, /*pg=*/3, /*zm=*/21 }, + std::array { 0x01, 0x10, 0x23, 0x26 }, + std::array { 1, 10, 23, 26 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 32bit scaled offset sxtw", + TEST_FUNC("ld1h z11.s, p4/z, [%[base], z20.s, sxtw #1]"), + { /*zt=*/11, /*pg=*/4, /*zm=*/20 }, + std::array { 0xFFF3, 0x07, 0x16, 0x30 }, + std::array { -3, 7, 16, 30 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 32bit unpacked scaled offset uxtw", + TEST_FUNC("ld1h z12.d, p5/z, [%[base], z19.d, uxtw #1]"), + { /*zt=*/12, /*pg=*/5, /*zm=*/19 }, + std::array { 0x08, 0x28 }, + std::array { 8, 28 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 32bit unpacked scaled offset sxtw", + TEST_FUNC("ld1h z13.d, p6/z, [%[base], z18.d, sxtw #1]"), + { /*zt=*/13, /*pg=*/6, /*zm=*/18 }, + std::array { 0xFFF4, 0x24 }, + std::array { -4, 24 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1h z14.d, p7/z, [%[base], z17.d, uxtw]"), + { /*zt=*/14, /*pg=*/7, /*zm=*/17 }, + std::array { 0x0403, 0x2322 }, + std::array { 3, 22 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1h 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1h z15.d, p6/z, [%[base], z16.d, sxtw]"), + { /*zt=*/15, /*pg=*/6, /*zm=*/16 }, + std::array { 0x0100, 0xF4F5 }, + std::array { 0, -5 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1h 32bit unscaled offset uxtw", + TEST_FUNC("ld1h z16.s, p5/z, [%[base], z15.s, uxtw #1]"), + { /*zt=*/16, /*pg=*/5, /*zm=*/15 }, + std::array { 0x01, 0x10, 0x23, 0x30 }, + std::array { 1, 10, 23, 30 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 32bit unscaled offset sxtw", + TEST_FUNC("ld1h z17.s, p4/z, [%[base], 
z14.s, sxtw #1]"), + { /*zt=*/17, /*pg=*/4, /*zm=*/14 }, + std::array { 0x00, 0xFFF6, 0x18, 0x27 }, + std::array { 0, -6, 18, 27 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 64bit scaled offset", + TEST_FUNC("ld1h z18.d, p3/z, [%[base], z13.d, lsl #1]"), + { /*zt=*/18, /*pg=*/3, /*zm=*/13 }, + std::array { 0x03, 0x14 }, + std::array { 3, 14 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1h 64bit unscaled offset", + TEST_FUNC("ld1h z19.d, p2/z, [%[base], z12.d]"), + { /*zt=*/19, /*pg=*/2, /*zm=*/12 }, + std::array { 0x1009, 0x2928 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1h 64bit unscaled offset Zt==Zm", + TEST_FUNC("ld1h z25.d, p5/z, [%[base], z25.d]"), + { /*zt=*/25, /*pg=*/5, /*zm=*/25 }, + std::array { 0x1009, 0x2928 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1SH instructions. + { + "ld1sh 32bit scaled offset uxtw", + TEST_FUNC("ld1sh z20.s, p1/z, [%[base], z11.s, uxtw #1]"), + { /*zt=*/20, /*pg=*/1, /*zm=*/11 }, + std::array { 0x00, 0x07, 0x16, -1 }, + std::array { 0, 7, 16, 31 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 32bit scaled offset sxtw", + TEST_FUNC("ld1sh z21.s, p0/z, [%[base], z10.s, sxtw #1]"), + { /*zt=*/21, /*pg=*/0, /*zm=*/10 }, + std::array { -13, 0x01, 0x10, 0x30 }, + std::array { -3, 1, 10, 30 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 32bit unpacked scaled offset uxtw", + TEST_FUNC("ld1sh z22.d, p1/z, [%[base], z9.d, uxtw #1]"), + { /*zt=*/22, /*pg=*/1, /*zm=*/9 }, + std::array { 0x00, -1 }, + std::array { 0, 31 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 32bit unpacked scaled offset sxtw", + TEST_FUNC("ld1sh z23.d, p2/z, [%[base], z8.d, sxtw #1]"), + { /*zt=*/23, /*pg=*/2, /*zm=*/8 }, + std::array { -12, 0x14 }, + std::array { -4, 14 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1sh z24.d, p3/z, [%[base], z7.d, uxtw]"), + { /*zt=*/24, /*pg=*/3, /*zm=*/7 }, + std::array { 0x0201, -208 }, + std::array { 1, 30 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sh 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1sh z25.d, p4/z, [%[base], z6.d, sxtw]"), + { /*zt=*/25, /*pg=*/4, /*zm=*/6 }, + std::array { -2827, 0x3029 }, + std::array { -5, 29 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sh 32bit unscaled offset uxtw", + TEST_FUNC("ld1sh z26.s, p5/z, [%[base], z5.s, uxtw #1]"), + { /*zt=*/26, /*pg=*/5, /*zm=*/5 }, + std::array { 0x05, 0x15, 0x25, -1 }, + std::array { 5, 15, 25, 31 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 32bit unscaled offset sxtw", + TEST_FUNC("ld1sh z27.s, p6/z, [%[base], z4.s, sxtw #1]"), + { /*zt=*/27, /*pg=*/6, /*zm=*/4 }, + std::array { 0x06, 0x16, -10, 0x26 }, + std::array { 6, 16, -6, 26 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 64bit scaled offset", + TEST_FUNC("ld1sh z28.d, p7/z, [%[base], z3.d, lsl #1]"), + { /*zt=*/28, /*pg=*/7, /*zm=*/3 }, + std::array { 0x09, -1 }, + std::array { 9, 31 }, + input_data.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1sh 64bit unscaled offset", + TEST_FUNC("ld1sh z29.d, p6/z, [%[base], z2.d]"), + { /*zt=*/29, /*pg=*/6, /*zm=*/2 }, + std::array { 0x0403, -208 }, + std::array { 3, 30 
}, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sh 64bit unscaled offset Zt==Zm", + TEST_FUNC("ld1sh z0.d, p0/z, [%[base], z0.d]"), + { /*zt=*/0, /*pg=*/0, /*zm=*/0 }, + std::array { 0x0403, -208 }, + std::array { 3, 30 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1W instructions. + { + "ld1w 32bit scaled offset uxtw", + TEST_FUNC("ld1w z30.s, p5/z, [%[base], z1.s, uxtw #2]"), + { /*zt=*/30, /*pg=*/5, /*zm=*/1 }, + std::array { 0x00, 0x07, 0x17, 0x27 }, + std::array { 0, 7, 17, 27 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1w 32bit scaled offset sxtw", + TEST_FUNC("ld1w z31.s, p4/z, [%[base], z0.s, sxtw #2]"), + { /*zt=*/31, /*pg=*/4, /*zm=*/0 }, + std::array { 0xFFFFFFF7, 0x07, 0x17, 0x27 }, + std::array { -7, 7, 17, 27 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1w 32bit unpacked scaled offset uxtw", + TEST_FUNC("ld1w z0.d, p3/z, [%[base], z1.d, uxtw #2]"), + { /*zt=*/0, /*pg=*/3, /*zm=*/1 }, + std::array { 0x18, 0x28 }, + std::array { 18, 28 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1w 32bit unpacked scaled offset sxtw", + TEST_FUNC("ld1w z2.d, p2/z, [%[base], z3.d, sxtw #2]"), + { /*zt=*/2, /*pg=*/2, /*zm=*/3 }, + std::array { 0xFFFFFFF8, 0x08 }, + std::array { -8, 8 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1w 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1w z4.d, p1/z, [%[base], z5.d, uxtw]"), + { /*zt=*/4, /*pg=*/1, /*zm=*/5 }, + std::array { 0x04030201, 0x25242322 }, + std::array { 1, 22 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1w 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1w z6.d, p0/z, [%[base], z7.d, sxtw]"), + { /*zt=*/6, /*pg=*/0, /*zm=*/7 }, + std::array { 0x020100F1, 0x30292827 }, + std::array { -1, 27 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1w 32bit unscaled offset uxtw", + TEST_FUNC("ld1w z8.s, p1/z, [%[base], z9.s, uxtw]"), + { /*zt=*/8, /*pg=*/1, /*zm=*/9 }, + std::array { 0x03020100, 0x05040302, 0x15141312, 0x25242322 }, + std::array { 0, 2, 12, 22 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1w 32bit unscaled offset sxtw", + TEST_FUNC("ld1w z10.s, p2/z, [%[base], z11.s, sxtw]"), + { /*zt=*/10, /*pg=*/2, /*zm=*/11 }, + std::array { 0x0100F1F2, 0x05040302, 0x15141312, 0x25242322 }, + std::array { -2, 2, 12, 22 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1w 64bit scaled offset", + TEST_FUNC("ld1w z12.d, p3/z, [%[base], z13.d, lsl #2]"), + { /*zt=*/12, /*pg=*/3, /*zm=*/13 }, + std::array { 0x03, 0x14 }, + std::array { 3, 14 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1w 64bit unscaled offset", + TEST_FUNC("ld1w z14.d, p4/z, [%[base], z15.d]"), + { /*zt=*/14, /*pg=*/4, /*zm=*/15 }, + std::array { 0x06050403, 0x17161514 }, + std::array { 3, 14 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1w 64bit unscaled offset Zt==Zm", + TEST_FUNC("ld1w z5.d, p5/z, [%[base], z5.d]"), + { /*zt=*/5, /*pg=*/5, /*zm=*/5 }, + std::array { 0x06050403, 0x17161514 }, + std::array { 3, 14 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1SW instructions. 
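The signed-load cases differ from their unsigned twins only in the widening step: ld1sh sign-extends each 16-bit value into its element, which is why the byte pair 0x30, 0xff at offset 30 appears as -208 in the expected data above, where ld1h would yield 0xFF30. The two widening rules as scalar code (a sketch):

#include <cstdint>

/* ld1sh-style widening: sign-extend a 16-bit memory value to 64 bits. */
int64_t
widen_signed16(uint16_t raw)
{
    return static_cast<int16_t>(raw); /* e.g. 0xFF30 -> -208 */
}

/* ld1h-style widening: zero-extend instead. */
uint64_t
widen_unsigned16(uint16_t raw)
{
    return raw; /* 0xFF30 -> 65328 */
}
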
+ { + "ld1sw 32bit unpacked scaled offset uxtw", + TEST_FUNC("ld1sw z16.d, p5/z, [%[base], z17.d, uxtw #2]"), + { /*zt=*/16, /*pg=*/5, /*zm=*/17 }, + std::array { -1, 0x10 }, + std::array { 31, 10 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1sw 32bit unpacked scaled offset sxtw", + TEST_FUNC("ld1sw z18.d, p6/z, [%[base], z19.d, sxtw #2]"), + { /*zt=*/18, /*pg=*/6, /*zm=*/19 }, + std::array { -8, 0x16 }, + std::array { -8, 16 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1sw 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1sw z20.d, p7/z, [%[base], z21.d, uxtw]"), + { /*zt=*/20, /*pg=*/7, /*zm=*/21 }, + std::array { 0x04030201, -13620952 }, + std::array { 1, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sw 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1sw z22.d, p6/z, [%[base], z23.d, sxtw]"), + { /*zt=*/22, /*pg=*/6, /*zm=*/23 }, + std::array { 0x11100908, -168364040 }, + std::array { 8, -8 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sw 64bit scaled offset", + TEST_FUNC("ld1sw z24.d, p5/z, [%[base], z25.d, lsl #2]"), + { /*zt=*/24, /*pg=*/5, /*zm=*/25 }, + std::array { -1, 0x28 }, + std::array { 31, 28 }, + input_data.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1sw 64bit unscaled offset", + TEST_FUNC("ld1sw z26.d, p4/z, [%[base], z27.d]"), + { /*zt=*/26, /*pg=*/4, /*zm=*/27 }, + std::array { 0x12111009, -13620952 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1sw 64bit unscaled offset Zt==Zm", + TEST_FUNC("ld1sw z10.d, p5/z, [%[base], z10.d]"), + { /*zt=*/10, /*pg=*/5, /*zm=*/10 }, + std::array { 0x12111009, -13620952 }, + std::array { 9, 28 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + // LD1D + { + "ld1d 32bit unpacked scaled offset uxtw", + TEST_FUNC("ld1d z28.d, p3/z, [%[base], z29.d, uxtw #3]"), + { /*zt=*/28, /*pg=*/3, /*zm=*/29 }, + std::array { 0x15, 0x25 }, + std::array { 15, 25 }, + input_data.base_addr_for_data_size(element_size_t::DOUBLE), + }, + { + "ld1d 32bit unpacked scaled offset sxtw", + TEST_FUNC("ld1d z30.d, p2/z, [%[base], z31.d, sxtw #3]"), + { /*zt=*/30, /*pg=*/2, /*zm=*/31 }, + std::array { 0x08, 0xFFFFFFFFFFFFFFF3 }, + std::array { 8, -3 }, + input_data.base_addr_for_data_size(element_size_t::DOUBLE), + }, + { + "ld1d 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ld1d z31.d, p1/z, [%[base], z30.d, uxtw]"), + { /*zt=*/31, /*pg=*/1, /*zm=*/30 }, + std::array { 0x2019181716151413, 0x3029282726252423 }, + std::array { 13, 23 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1d 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ld1d z29.d, p0/z, [%[base], z28.d, sxtw]"), + { /*zt=*/29, /*pg=*/0, /*zm=*/28 }, + std::array { 0x2120191817161514, 0x03020100F1F2F3F4 }, + std::array { 14, -4 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1d 64bit scaled offset", + TEST_FUNC("ld1d z27.d, p1/z, [%[base], z26.d, lsl #3]"), + { /*zt=*/27, /*pg=*/1, /*zm=*/26 }, + std::array { 0x00, 0x10 }, + std::array { 0, 10 }, + input_data.base_addr_for_data_size(element_size_t::DOUBLE), + }, + { + "ld1d 64bit unscaled offset", + TEST_FUNC("ld1d z25.d, p2/z, [%[base], z24.d]"), + { /*zt=*/25, /*pg=*/2, /*zm=*/24 }, + std::array { 0x020100F1F2F3F4F5, 0x1716151413121110 }, + std::array { -5, 10 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1d 64bit unscaled 
offset Zt==Zm", + TEST_FUNC("ld1d z15.d, p5/z, [%[base], z15.d]"), + { /*zt=*/15, /*pg=*/5, /*zm=*/15 }, + std::array { 0x020100F1F2F3F4F5, 0x1716151413121110 }, + std::array { -5, 10 }, + input_data.base_addr_for_data_size(element_size_t::BYTE), + }, + }); +# undef TEST_FUNC +} +#endif // defined(__ARM_FEATURE_SVE) + +} // namespace + +int +main(int argc, char **argv) +{ +#if defined(__ARM_FEATURE_SVE) + test_ld1_scalar_plus_vector(); +#endif + + return 0; +} diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.templatex b/suite/tests/client-interface/drx-scattergather-aarch64.templatex new file mode 100644 index 00000000000..6c4bb0a2ce4 --- /dev/null +++ b/suite/tests/client-interface/drx-scattergather-aarch64.templatex @@ -0,0 +1,68 @@ +#ifdef __ARM_FEATURE_SVE +ld1b 32bit unscaled offset uxtw: PASS +ld1b 32bit unscaled offset sxtw: PASS +ld1b 32bit unpacked unscaled offset uxtw: PASS +ld1b 32bit unpacked unscaled offset sxtw: PASS +ld1b 64bit unscaled offset: PASS +ld1b 64bit unscaled offset Zt==Zm: PASS +ld1sb 32bit unscaled offset uxtw: PASS +ld1sb 32bit unscaled offset sxtw: PASS +ld1sb 32bit unpacked unscaled offset uxtw: PASS +ld1sb 32bit unpacked unscaled offset sxtw: PASS +ld1sb 64bit unscaled offset: PASS +ld1sb 64bit unscaled offset: PASS +ld1h 32bit scaled offset uxtw: PASS +ld1h 32bit scaled offset sxtw: PASS +ld1h 32bit unpacked scaled offset uxtw: PASS +ld1h 32bit unpacked scaled offset sxtw: PASS +ld1h 32bit unpacked unscaled offset uxtw: PASS +ld1h 32bit unpacked unscaled offset sxtw: PASS +ld1h 32bit unscaled offset uxtw: PASS +ld1h 32bit unscaled offset sxtw: PASS +ld1h 64bit scaled offset: PASS +ld1h 64bit unscaled offset: PASS +ld1h 64bit unscaled offset Zt==Zm: PASS +ld1sh 32bit scaled offset uxtw: PASS +ld1sh 32bit scaled offset sxtw: PASS +ld1sh 32bit unpacked scaled offset uxtw: PASS +ld1sh 32bit unpacked scaled offset sxtw: PASS +ld1sh 32bit unpacked unscaled offset uxtw: PASS +ld1sh 32bit unpacked unscaled offset sxtw: PASS +ld1sh 32bit unscaled offset uxtw: PASS +ld1sh 32bit unscaled offset sxtw: PASS +ld1sh 64bit scaled offset: PASS +ld1sh 64bit unscaled offset: PASS +ld1sh 64bit unscaled offset Zt==Zm: PASS +ld1w 32bit scaled offset uxtw: PASS +ld1w 32bit scaled offset sxtw: PASS +ld1w 32bit unpacked scaled offset uxtw: PASS +ld1w 32bit unpacked scaled offset sxtw: PASS +ld1w 32bit unpacked unscaled offset uxtw: PASS +ld1w 32bit unpacked unscaled offset sxtw: PASS +ld1w 32bit unscaled offset uxtw: PASS +ld1w 32bit unscaled offset sxtw: PASS +ld1w 64bit scaled offset: PASS +ld1w 64bit unscaled offset: PASS +ld1w 64bit unscaled offset Zt==Zm: PASS +ld1sw 32bit unpacked scaled offset uxtw: PASS +ld1sw 32bit unpacked scaled offset sxtw: PASS +ld1sw 32bit unpacked unscaled offset uxtw: PASS +ld1sw 32bit unpacked unscaled offset sxtw: PASS +ld1sw 64bit scaled offset: PASS +ld1sw 64bit unscaled offset: PASS +ld1sw 64bit unscaled offset Zt==Zm: PASS +ld1d 32bit unpacked scaled offset uxtw: PASS +ld1d 32bit unpacked scaled offset sxtw: PASS +ld1d 32bit unpacked unscaled offset uxtw: PASS +ld1d 32bit unpacked unscaled offset sxtw: PASS +ld1d 64bit scaled offset: PASS +ld1d 64bit unscaled offset: PASS +ld1d 64bit unscaled offset Zt==Zm: PASS +#endif /* __ARM_FEATURE_SVE */ +#ifndef TEST_SAMPLE_CLIENT +#ifdef __ARM_FEATURE_SVE +event_exit, 428 scatter/gather instructions +#else +event_exit, 0 scatter/gather instructions +#endif /* __ARM_FEATURE_SVE */ +#endif /* TEST_SAMPLE_CLIENT */ diff --git a/suite/tests/client-interface/drx-scattergather-bbdup.dll.c 
b/suite/tests/client-interface/drx-scattergather-bbdup.dll.c index 237ecf1b2ba..a6c322e40ef 100644 --- a/suite/tests/client-interface/drx-scattergather-bbdup.dll.c +++ b/suite/tests/client-interface/drx-scattergather-bbdup.dll.c @@ -61,6 +61,7 @@ inscount(uint num_instrs) global_sg_count += num_instrs; } +#if defined(X86) /* Global, because the markers will be in a different app2app list after breaking up * scatter/gather into separate basic blocks during expansion. */ @@ -69,6 +70,7 @@ static app_pc mask_update_test_avx512_gather_pc = (app_pc)INT_MAX; static app_pc mask_clobber_test_avx512_scatter_pc = (app_pc)INT_MAX; static app_pc mask_update_test_avx512_scatter_pc = (app_pc)INT_MAX; static app_pc mask_update_test_avx2_gather_pc = (app_pc)INT_MAX; +#endif /* defined(X86) */ static ptr_int_t instru_mode; enum { @@ -257,6 +259,11 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, scatter_gather_present = true; } else if (instr_is_scatter(instr)) { scatter_gather_present = true; +#if defined(X86) + /* TODO i#5036: Port this code to AArch64 to test state restoration of + * clobbered predicate registers (when we have added support for state + * restoration). + */ } else if (instr_is_mov_constant(instr, &val) && val == TEST_AVX512_GATHER_MASK_CLOBBER_MARKER) { instr_t *next_instr = instr_get_next(instr); @@ -325,6 +332,7 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, search_for_next_gather_pc(drcontext, next_instr); } } +#endif /* defined(X86) */ } } bool expansion_ok = drx_expand_scatter_gather(drcontext, bb, &expanded); @@ -336,6 +344,11 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, } CHECK((scatter_gather_present IF_X64(&&expanded)) || (expansion_ok && !expanded), "drx_expand_scatter_gather() bad OUT values"); +#if defined(X86) + /* TODO i#5036: Port this code to AArch64 to test state restoration of clobbered + * predicate registers (when we have added support for state + * restoration). + */ for (instr = instrlist_first(bb); instr != NULL; instr = instr_get_next(instr)) { if (instr_get_opcode(instr) == OP_kandnw && (instr_get_app_pc(instr) == mask_clobber_test_avx512_gather_pc || @@ -381,6 +394,7 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, break; } } +#endif /* defined(X86) */ return DR_EMIT_DEFAULT; } diff --git a/suite/tests/client-interface/drx-scattergather.c b/suite/tests/client-interface/drx-scattergather-x86.c similarity index 100% rename from suite/tests/client-interface/drx-scattergather.c rename to suite/tests/client-interface/drx-scattergather-x86.c diff --git a/suite/tests/client-interface/drx-scattergather.templatex b/suite/tests/client-interface/drx-scattergather-x86.templatex similarity index 100% rename from suite/tests/client-interface/drx-scattergather.templatex rename to suite/tests/client-interface/drx-scattergather-x86.templatex diff --git a/suite/tests/client-interface/drx-scattergather.dll.c b/suite/tests/client-interface/drx-scattergather.dll.c index ce2d8498b05..d3273cb0b34 100644 --- a/suite/tests/client-interface/drx-scattergather.dll.c +++ b/suite/tests/client-interface/drx-scattergather.dll.c @@ -58,6 +58,7 @@ inscount(uint num_instrs) global_sg_count += num_instrs; } +#if defined(X86) /* Global, because the markers will be in a different app2app list after breaking up * scatter/gather into separate basic blocks during expansion. 
*/ @@ -66,6 +67,7 @@ static app_pc mask_update_test_avx512_gather_pc = (app_pc)INT_MAX; static app_pc mask_clobber_test_avx512_scatter_pc = (app_pc)INT_MAX; static app_pc mask_update_test_avx512_scatter_pc = (app_pc)INT_MAX; static app_pc mask_update_test_avx2_gather_pc = (app_pc)INT_MAX; +#endif static dr_emit_flags_t event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *instr, @@ -182,6 +184,11 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, scatter_gather_present = true; } else if (instr_is_scatter(instr)) { scatter_gather_present = true; +#if defined(X86) + /* TODO i#5036: Port this code to AArch64 to test state restoration of + * clobbered predicate registers (when we have added support for state + * restoration). + */ } else if (instr_is_mov_constant(instr, &val) && val == TEST_AVX512_GATHER_MASK_CLOBBER_MARKER) { instr_t *next_instr = instr_get_next(instr); @@ -250,6 +257,7 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, search_for_next_gather_pc(drcontext, next_instr); } } +#endif /* defined(X86) */ } } bool expansion_ok = drx_expand_scatter_gather(drcontext, bb, &expanded); @@ -261,6 +269,11 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, } CHECK((scatter_gather_present IF_X64(&&expanded)) || (expansion_ok && !expanded), "drx_expand_scatter_gather() bad OUT values"); +#if defined(X86) + /* TODO i#5036: Port this code to AArch64 to test state restoration of clobbered + * predicate registers (when we have added support for state + * restoration). + */ for (instr = instrlist_first(bb); instr != NULL; instr = instr_get_next(instr)) { if (instr_get_opcode(instr) == OP_kandnw && (instr_get_app_pc(instr) == mask_clobber_test_avx512_gather_pc || @@ -306,6 +319,7 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, break; } } +#endif /* defined(X86) */ *user_data = (uint *)dr_thread_alloc(drcontext, sizeof(uint)); return DR_EMIT_DEFAULT; }
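Both .dll.c clients follow the pattern these diffs extend to AArch64: call drx_expand_scatter_gather() during the app2app phase so the rest of the instrumentation only ever sees the resulting scalar memory operations. A minimal client skeleton reduced from the tests (a sketch; init error checking trimmed):

#include "dr_api.h"
#include "drmgr.h"
#include "drx.h"

static dr_emit_flags_t
event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
                 bool translating)
{
    bool expanded;
    /* Rewrites any scatter/gather in bb into an equivalent scalar sequence;
     * 'expanded' reports whether anything was changed. */
    if (!drx_expand_scatter_gather(drcontext, bb, &expanded))
        DR_ASSERT_MSG(false, "drx_expand_scatter_gather() failed");
    return DR_EMIT_DEFAULT;
}

DR_EXPORT void
dr_client_main(client_id_t id, int argc, const char *argv[])
{
    drmgr_init();
    drx_init();
    drmgr_register_bb_app2app_event(event_bb_app2app, NULL);
}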