Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i#5036 A64 scatter/gather, part 1: Expand scalar+vector loads #6267

Merged
merged 33 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f005f94
i#5844 AArch64 drx_expand_scatter_gather() part 1
jackgallagher-arm Jun 8, 2023
ab5128c
Fix formatting
jackgallagher-arm Aug 16, 2023
7f939de
Try to fix AArch64 cross compile
jackgallagher-arm Aug 16, 2023
67a04e9
Revert "Try to fix AArch64 cross compile"
jackgallagher-arm Aug 16, 2023
91c4544
Try to fix AArch64 cross compile again
jackgallagher-arm Aug 16, 2023
148e802
Update issue numbers
jackgallagher-arm Aug 17, 2023
471a95a
Remove doxygen markup where doxygen is not run
jackgallagher-arm Aug 17, 2023
ede4ee2
Fix copy/paste error
jackgallagher-arm Aug 17, 2023
f602c61
Use TESTALL for bit checks
jackgallagher-arm Aug 17, 2023
7988174
Move EMIT macro inside function
jackgallagher-arm Aug 17, 2023
78a9776
Fix pnext comment
jackgallagher-arm Aug 18, 2023
baecaa8
Use INSTR_CREATE_not_sve_pred_b alias macro
jackgallagher-arm Aug 18, 2023
28c6fb0
Rename elsz to element_size in comments
jackgallagher-arm Aug 18, 2023
488c256
Fix typo
jackgallagher-arm Aug 18, 2023
dc70497
Only call prctl once
jackgallagher-arm Aug 18, 2023
d1d6bc4
Rename drx_thread* -> drx_scatter_gather_thread*
jackgallagher-arm Aug 18, 2023
d4d679f
Share more code between x86 and AArch64
jackgallagher-arm Aug 21, 2023
d64b646
Clarify scatter_gather_split_bb()
jackgallagher-arm Aug 21, 2023
989bc6c
Add assert
jackgallagher-arm Aug 21, 2023
bf43b0b
Merge remote-tracking branch 'github/master' into i5844-aarch64-expand-scatter-gather-part1
jackgallagher-arm Aug 21, 2023
217b897
Update test template
jackgallagher-arm Aug 21, 2023
d964b23
Unroll the expanded loop
jackgallagher-arm Aug 21, 2023
efe2c1c
Fix formatting
jackgallagher-arm Aug 21, 2023
c2df888
Add state restoration TODO comments
jackgallagher-arm Aug 22, 2023
14f4c23
Check that spill slots are used correctly
jackgallagher-arm Aug 22, 2023
abdf288
Add comment
jackgallagher-arm Aug 22, 2023
5e7ad0a
Fix formatting
jackgallagher-arm Aug 22, 2023
be6d533
Merge branch 'master' into i5844-aarch64-expand-scatter-gather-part1
jackgallagher-arm Aug 22, 2023
980b803
Merge branch 'master' into i5844-aarch64-expand-scatter-gather-part1
jackgallagher-arm Aug 23, 2023
f8f1fcd
Fix typos
jackgallagher-arm Aug 24, 2023
edc3212
Add comment about loop unrolling
jackgallagher-arm Aug 24, 2023
6549079
Fix comment whitespace
jackgallagher-arm Aug 24, 2023
58169d3
Add arg name comments
jackgallagher-arm Aug 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,12 @@ if (X86 AND UNIX)
check_avx512_processor_and_compiler_support(proc_supports_avx512)
jackgallagher-arm marked this conversation as resolved.
Show resolved Hide resolved
jackgallagher-arm marked this conversation as resolved.
Show resolved Hide resolved
endif ()

# SVE support defaults to OFF and is only probed for AArch64 UNIX builds.
set(proc_supports_sve OFF)
if (AARCH64 AND UNIX)
  # Compiler flags requesting the Armv8-A base architecture with the SVE extension.
  set(CFLAGS_SVE "-march=armv8-a+sve")
  # NOTE(review): presumably writes its result into proc_supports_sve, mirroring the
  # avx512 probe used for X86 above -- confirm against the function's definition.
  check_sve_processor_and_compiler_support(proc_supports_sve)
endif ()

# Ensure that _AMD64_ or _X86_ are defined on Microsoft Windows, as otherwise
# um/winnt.h provided since Windows 10.0.22000 will error.
if (NOT UNIX)
Expand Down
102 changes: 102 additions & 0 deletions clients/drcachesim/tests/scattergather-aarch64.templatex
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#ifdef __ARM_FEATURE_SVE
ld1b 32bit unscaled offset uxtw: PASS
ld1b 32bit unscaled offset sxtw: PASS
ld1b 32bit unpacked unscaled offset uxtw: PASS
ld1b 32bit unpacked unscaled offset sxtw: PASS
ld1b 64bit unscaled offset: PASS
ld1b 64bit unscaled offset Zt==Zm: PASS
ld1sb 32bit unscaled offset uxtw: PASS
ld1sb 32bit unscaled offset sxtw: PASS
ld1sb 32bit unpacked unscaled offset uxtw: PASS
ld1sb 32bit unpacked unscaled offset sxtw: PASS
ld1sb 64bit unscaled offset: PASS
ld1sb 64bit unscaled offset: PASS
ld1h 32bit scaled offset uxtw: PASS
ld1h 32bit scaled offset sxtw: PASS
ld1h 32bit unpacked scaled offset uxtw: PASS
ld1h 32bit unpacked scaled offset sxtw: PASS
ld1h 32bit unpacked unscaled offset uxtw: PASS
ld1h 32bit unpacked unscaled offset sxtw: PASS
ld1h 32bit unscaled offset uxtw: PASS
ld1h 32bit unscaled offset sxtw: PASS
ld1h 64bit scaled offset: PASS
ld1h 64bit unscaled offset: PASS
ld1h 64bit unscaled offset Zt==Zm: PASS
ld1sh 32bit scaled offset uxtw: PASS
ld1sh 32bit scaled offset sxtw: PASS
ld1sh 32bit unpacked scaled offset uxtw: PASS
ld1sh 32bit unpacked scaled offset sxtw: PASS
ld1sh 32bit unpacked unscaled offset uxtw: PASS
ld1sh 32bit unpacked unscaled offset sxtw: PASS
ld1sh 32bit unscaled offset uxtw: PASS
ld1sh 32bit unscaled offset sxtw: PASS
ld1sh 64bit scaled offset: PASS
ld1sh 64bit unscaled offset: PASS
ld1sh 64bit unscaled offset Zt==Zm: PASS
ld1w 32bit scaled offset uxtw: PASS
ld1w 32bit scaled offset sxtw: PASS
ld1w 32bit unpacked scaled offset uxtw: PASS
ld1w 32bit unpacked scaled offset sxtw: PASS
ld1w 32bit unpacked unscaled offset uxtw: PASS
ld1w 32bit unpacked unscaled offset sxtw: PASS
ld1w 32bit unscaled offset uxtw: PASS
ld1w 32bit unscaled offset sxtw: PASS
ld1w 64bit scaled offset: PASS
ld1w 64bit unscaled offset: PASS
ld1w 64bit unscaled offset Zt==Zm: PASS
ld1sw 32bit unpacked scaled offset uxtw: PASS
ld1sw 32bit unpacked scaled offset sxtw: PASS
ld1sw 32bit unpacked unscaled offset uxtw: PASS
ld1sw 32bit unpacked unscaled offset sxtw: PASS
ld1sw 64bit scaled offset: PASS
ld1sw 64bit unscaled offset: PASS
ld1sw 64bit unscaled offset Zt==Zm: PASS
ld1d 32bit unpacked scaled offset uxtw: PASS
ld1d 32bit unpacked scaled offset sxtw: PASS
ld1d 32bit unpacked unscaled offset uxtw: PASS
ld1d 32bit unpacked unscaled offset sxtw: PASS
ld1d 64bit scaled offset: PASS
ld1d 64bit unscaled offset: PASS
ld1d 64bit unscaled offset Zt==Zm: PASS
#endif /* __ARM_FEATURE_SVE */
---- <application exited with code 0> ----
Basic counts tool results:
jackgallagher-arm marked this conversation as resolved.
Show resolved Hide resolved
Total counts:
.* total \(fetched\) instructions
.* total unique \(fetched\) instructions
.* total non-fetched instructions
.* total prefetches
.* total data loads
.* total data stores
.* total icache flushes
.* total dcache flushes
1 total threads
.* total scheduling markers
.* total transfer markers
jackgallagher-arm marked this conversation as resolved.
Show resolved Hide resolved
.* total function id markers
.* total function return address markers
.* total function argument markers
.* total function return value markers
.* total physical address \+ virtual address marker pairs
.* total physical address unavailable markers
.* total other markers
.* total encodings
Thread .* counts:
.* \(fetched\) instructions
.* unique \(fetched\) instructions
.* non-fetched instructions
.* prefetches
.* data loads
.* data stores
.* icache flushes
.* dcache flushes
.* scheduling markers
.* transfer markers
.* function id markers
.* function return address markers
.* function argument markers
.* function return value markers
.* physical address \+ virtual address marker pairs
.* physical address unavailable markers
.* other markers
.* encodings
73 changes: 69 additions & 4 deletions core/ir/aarch64/instr.c
Original file line number Diff line number Diff line change
Expand Up @@ -588,17 +588,82 @@ DR_API
bool
instr_is_scatter(instr_t *instr)
{
    /* Returns true iff the opcode of instr is one of the st1*, st2*, st3*, st4* or
     * stnt1* store opcodes listed below, and false for every other opcode.
     *
     * The decision is made on the opcode alone: no operand of instr is examined.
     * NOTE(review): callers that must tell apart different addressing forms sharing
     * one of these opcodes presumably have to inspect the operands themselves --
     * confirm.
     */
    switch (instr_get_opcode(instr)) {
    case OP_st1b:
    case OP_st1h:
    case OP_st1w:
    case OP_st1d:
    case OP_st2b:
    case OP_st2h:
    case OP_st2w:
    case OP_st2d:
    case OP_st3b:
    case OP_st3h:
    case OP_st3w:
    case OP_st3d:
    case OP_st4b:
    case OP_st4h:
    case OP_st4w:
    case OP_st4d:
    case OP_stnt1b:
    case OP_stnt1h:
    case OP_stnt1w:
    case OP_stnt1d: return true;
    }
    return false;
}

DR_API
bool
instr_is_gather(instr_t *instr)
{
    /* Returns true iff the opcode of instr is one of the load opcodes listed below
     * (ld1*, ld1ro*, ld1rq*, ldff1*, ldnf1*, ldnt1*, ld2*, ld3*, ld4*), and false
     * for every other opcode.
     *
     * The decision is made on the opcode alone: no operand of instr is examined.
     * NOTE(review): callers that must tell apart different addressing forms sharing
     * one of these opcodes presumably have to inspect the operands themselves --
     * confirm.
     */
    switch (instr_get_opcode(instr)) {
    case OP_ld1b:
    case OP_ld1h:
    case OP_ld1w:
    case OP_ld1d:
    case OP_ld1sb:
    case OP_ld1sh:
    case OP_ld1sw:
    case OP_ld1rob:
    case OP_ld1rqb:
    case OP_ld1rqh:
    case OP_ld1rqw:
    case OP_ld1rqd:
    case OP_ldff1b:
    case OP_ldff1h:
    case OP_ldff1w:
    case OP_ldff1d:
    case OP_ldff1sb:
    case OP_ldff1sh:
    case OP_ldff1sw:
    case OP_ldnf1b:
    case OP_ldnf1h:
    case OP_ldnf1w:
    case OP_ldnf1d:
    case OP_ldnf1sb:
    case OP_ldnf1sh:
    case OP_ldnf1sw:
    case OP_ldnt1b:
    case OP_ldnt1h:
    case OP_ldnt1w:
    case OP_ldnt1d:
    case OP_ldnt1sb:
    case OP_ldnt1sh:
    case OP_ldnt1sw:
    case OP_ld2b:
    case OP_ld2h:
    case OP_ld2w:
    case OP_ld2d:
    case OP_ld3b:
    case OP_ld3h:
    case OP_ld3w:
    case OP_ld3d:
    case OP_ld4b:
    case OP_ld4h:
    case OP_ld4w:
    case OP_ld4d: return true;
    }
    return false;
}

Expand Down
3 changes: 3 additions & 0 deletions core/ir/aarch64/instr_create_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -634,10 +634,13 @@
/* Load-instruction creation macros.  Each expands to a single call to
 * instr_create_1dst_1src() with the matching OP_ constant, except
 * INSTR_CREATE_ldp, which has two destinations and so calls
 * instr_create_2dst_1src().  Arguments are forwarded in the order given.
 */
#define INSTR_CREATE_ldp(dc, rt1, rt2, mem) \
    instr_create_2dst_1src((dc), OP_ldp, (rt1), (rt2), (mem))
#define INSTR_CREATE_ldr(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldr, (Rd), (mem))
#define INSTR_CREATE_ldrsw(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldrsw, (Rd), (mem))
#define INSTR_CREATE_ldrb(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldrb, (Rd), (mem))
#define INSTR_CREATE_ldrsb(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldrsb, (Rd), (mem))
#define INSTR_CREATE_ldrh(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldrh, (Rd), (mem))
#define INSTR_CREATE_ldrsh(dc, Rd, mem) \
    instr_create_1dst_1src((dc), OP_ldrsh, (Rd), (mem))
#define INSTR_CREATE_ldur(dc, rt, mem) \
    instr_create_1dst_1src((dc), OP_ldur, (rt), (mem))
#define INSTR_CREATE_ldar(dc, Rt, mem) \
    instr_create_1dst_1src((dc), OP_ldar, (Rt), (mem))
#define INSTR_CREATE_ldarb(dc, Rt, mem) \
Expand Down
4 changes: 4 additions & 0 deletions ext/drx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ set(srcs
drx.c
drx_buf.c
scatter_gather_${ARCH_NAME}.c
scatter_gather_shared.c
# add more here
)

Expand Down Expand Up @@ -71,6 +72,9 @@ macro(configure_drx_target target)
if (WIN32)
target_link_libraries(${target} ntdll_imports)
endif ()
target_include_directories(${target}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR})
endmacro()

configure_drx_target(drx)
Expand Down
4 changes: 2 additions & 2 deletions ext/drx/drx.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@
# define IF_WINDOWS_ELSE(x, y) (y)
#endif

/* Define PLATFORM_SUPPORTS_SCATTER_GATHER when building for X86 or AArch64. */
#if defined(X86) || defined(AARCH64)
/* TODO i#3837: Complete AArch64 support. */
# define PLATFORM_SUPPORTS_SCATTER_GATHER
#endif

Expand Down
Loading