Skip to content

Commit

Permalink
Merge pull request #3617 from bylaws/arm64ec-dispatcher
Browse files Browse the repository at this point in the history
FEXCore: ARM64EC x64 entry/exit support
  • Loading branch information
Sonicadvance1 authored May 8, 2024
2 parents 1fde5d7 + 61cd835 commit 2cae2f2
Show file tree
Hide file tree
Showing 20 changed files with 113 additions and 49 deletions.
4 changes: 4 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ constexpr auto REG_AF = FEXCore::ARMEmitter::Reg::r24;
// Vector temporaries
constexpr auto VTMP1 = FEXCore::ARMEmitter::VReg::v16;
constexpr auto VTMP2 = FEXCore::ARMEmitter::VReg::v17;

// Entry/Exit ABI
// Registers used to pass state between ARM64EC code and the dispatcher's
// EC entry/exit paths.
// Carries the target PC: the dispatcher's EC entry stores this register into
// State.rip, and the EC exit paths load the target PC into it before
// branching to ExitFunctionEC.
constexpr auto EC_CALL_CHECKER_PC_REG = FEXCore::ARMEmitter::XReg::x9;
// Base pointer that the dispatcher's EC entry paths load the thread state and
// the emulator stack limit from (CPU area — presumably the per-thread
// CHPE CPU area; confirm against the caller).
constexpr auto EC_ENTRY_CPUAREA_REG = FEXCore::ARMEmitter::XReg::x17;
#endif

// Predicate register temporaries (used when AVX support is enabled)
Expand Down
63 changes: 59 additions & 4 deletions FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ void Dispatcher::EmitDispatcher() {
ARMEmitter::ForwardLabel l_CTX;
ARMEmitter::SingleUseForwardLabel l_Sleep;
#ifdef _M_ARM_64EC
// The offsets of these structure fields are not provided by the standard Windows headers, so define them here
static constexpr size_t TEBCPUAreaOffset = 0x1788;
static constexpr size_t CPUAreaInSyscallCallbackOffset = 0x1;
static constexpr size_t CPUAreaEmulatorStackLimitOffset = 0x8;
static constexpr size_t CPUAreaEmulatorDataOffset = 0x30;
ARMEmitter::SingleUseForwardLabel ExitEC;
#endif
ARMEmitter::SingleUseForwardLabel l_CompileBlock;
Expand All @@ -82,20 +87,45 @@ void Dispatcher::EmitDispatcher() {
AbsoluteLoopTopAddressFillSRA = GetCursorAddress<uint64_t>();

FillStaticRegs();
ARMEmitter::BiDirectionalLabel LoopTop {};

#ifdef _M_ARM_64EC
b(&LoopTop);

AbsoluteLoopTopAddressEnterECFillSRA = GetCursorAddress<uint64_t>();
ldr(STATE, EC_ENTRY_CPUAREA_REG, CPUAreaEmulatorDataOffset);
FillStaticRegs();

// Enter JIT
b(&LoopTop);

AbsoluteLoopTopAddressEnterEC = GetCursorAddress<uint64_t>();
// Load ThreadState and write the target PC there
ldr(STATE, EC_ENTRY_CPUAREA_REG, CPUAreaEmulatorDataOffset);
str(EC_CALL_CHECKER_PC_REG, STATE_PTR(CpuStateFrame, State.rip));

// Swap stacks to the emulator stack
ldr(TMP1, EC_ENTRY_CPUAREA_REG, CPUAreaEmulatorStackLimitOffset);
add(ARMEmitter::Size::i64Bit, StaticRegisters[X86State::REG_RSP], ARMEmitter::Reg::rsp, 0);
add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, TMP1, 0);

if (EmitterCTX->HostFeatures.SupportsSVE) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}

// Enter JIT
#endif

// We want to ensure that we are 16 byte aligned at the top of this loop
Align16B();
ARMEmitter::BiDirectionalLabel FullLookup {};
ARMEmitter::BiDirectionalLabel CallBlock {};
ARMEmitter::BackwardLabel LoopTop {};

Bind(&LoopTop);
AbsoluteLoopTopAddress = GetCursorAddress<uint64_t>();

// Load in our RIP
// Don't modify TMP3, since it must still contain our RIP if the block doesn't exist
// IMPORTANT: Pointers.Common.ExitFunctionEC callsites/implementations need to be
// adjusted accordingly if this changes.
auto RipReg = TMP3;
ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip));

Expand Down Expand Up @@ -177,7 +207,8 @@ void Dispatcher::EmitDispatcher() {
#ifdef _M_ARM_64EC
{
Bind(&ExitEC);
// Target PC is already loaded into TMP3 at the start of the dispatcher
add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, StaticRegisters[X86State::REG_RSP], 0);
mov(EC_CALL_CHECKER_PC_REG, RipReg);
ldr(TMP2, STATE_PTR(CpuStateFrame, Pointers.Common.ExitFunctionEC));
br(TMP2);
}
Expand All @@ -204,6 +235,12 @@ void Dispatcher::EmitDispatcher() {
add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1);
str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));

#ifdef _M_ARM_64EC
ldr(ARMEmitter::XReg::x0, ARMEmitter::XReg::x18, TEBCPUAreaOffset);
LoadConstant(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, 1);
strb(ARMEmitter::WReg::w1, ARMEmitter::XReg::x0, CPUAreaInSyscallCallbackOffset);
#endif

mov(ARMEmitter::XReg::x0, STATE);
mov(ARMEmitter::XReg::x1, ARMEmitter::XReg::lr);

Expand All @@ -220,6 +257,11 @@ void Dispatcher::EmitDispatcher() {

FillStaticRegs();

#ifdef _M_ARM_64EC
ldr(TMP2, ARMEmitter::XReg::x18, TEBCPUAreaOffset);
strb(ARMEmitter::WReg::zr, TMP2, CPUAreaInSyscallCallbackOffset);
#endif

ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1);
str(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
Expand All @@ -245,6 +287,12 @@ void Dispatcher::EmitDispatcher() {
add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1);
str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));

#ifdef _M_ARM_64EC
ldr(ARMEmitter::XReg::x0, ARMEmitter::XReg::x18, TEBCPUAreaOffset);
LoadConstant(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, 1);
strb(ARMEmitter::WReg::w1, ARMEmitter::XReg::x0, CPUAreaInSyscallCallbackOffset);
#endif

ldr(ARMEmitter::XReg::x0, &l_CTX);
mov(ARMEmitter::XReg::x1, STATE);
// x2 contains guest RIP
Expand All @@ -259,6 +307,11 @@ void Dispatcher::EmitDispatcher() {

FillStaticRegs();

#ifdef _M_ARM_64EC
ldr(TMP1, ARMEmitter::XReg::x18, TEBCPUAreaOffset);
strb(ARMEmitter::WReg::zr, TMP1, CPUAreaInSyscallCallbackOffset);
#endif

ldr(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1);
str(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
Expand Down Expand Up @@ -494,6 +547,8 @@ void Dispatcher::InitThreadPointers(FEXCore::Core::InternalThreadState* Thread)

Common.DispatcherLoopTop = AbsoluteLoopTopAddress;
Common.DispatcherLoopTopFillSRA = AbsoluteLoopTopAddressFillSRA;
Common.DispatcherLoopTopEnterEC = AbsoluteLoopTopAddressEnterEC;
Common.DispatcherLoopTopEnterECFillSRA = AbsoluteLoopTopAddressEnterECFillSRA;
Common.ExitFunctionLinker = ExitFunctionLinkerAddress;
Common.ThreadStopHandlerSpillSRA = ThreadStopHandlerAddressSpillSRA;
Common.ThreadPauseHandlerSpillSRA = ThreadPauseHandlerAddressSpillSRA;
Expand Down
2 changes: 2 additions & 0 deletions FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class Dispatcher final : public Arm64Emitter {
uint64_t ThreadStopHandlerAddressSpillSRA {};
uint64_t AbsoluteLoopTopAddress {};
uint64_t AbsoluteLoopTopAddressFillSRA {};
uint64_t AbsoluteLoopTopAddressEnterEC {};
uint64_t AbsoluteLoopTopAddressEnterECFillSRA {};
uint64_t ThreadPauseHandlerAddress {};
uint64_t ThreadPauseHandlerAddressSpillSRA {};
uint64_t ExitFunctionLinkerAddress {};
Expand Down
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ DEF_OP(ExitFunction) {
if (IsInlineConstant(Op->NewRIP, &NewRIP) || IsInlineEntrypointOffset(Op->NewRIP, &NewRIP)) {
#ifdef _M_ARM_64EC
if (RtlIsEcCode(NewRIP)) {
LoadConstant(ARMEmitter::Size::i64Bit, TMP3, NewRIP);
add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, StaticRegisters[X86State::REG_RSP], 0);
LoadConstant(ARMEmitter::Size::i64Bit, EC_CALL_CHECKER_PC_REG, NewRIP);
ldr(TMP2, STATE_PTR(CpuStateFrame, Pointers.Common.ExitFunctionEC));
br(TMP2);
} else {
Expand Down
2 changes: 2 additions & 0 deletions FEXCore/include/FEXCore/Core/CoreState.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ struct JITPointers {
* @{ */
uint64_t DispatcherLoopTop {};
uint64_t DispatcherLoopTopFillSRA {};
uint64_t DispatcherLoopTopEnterEC {};
uint64_t DispatcherLoopTopEnterECFillSRA {};
uint64_t ExitFunctionLinker {};
uint64_t ThreadStopHandlerSpillSRA {};
uint64_t ThreadPauseHandlerSpillSRA {};
Expand Down
4 changes: 2 additions & 2 deletions unittests/InstructionCountCI/Crypto/H0F3A.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"0x66 0x0f 0x3a 0xdf"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2160]",
"ldr q2, [x28, #2176]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
Expand All @@ -68,7 +68,7 @@
"0x66 0x0f 0x3a 0xdf"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2160]",
"ldr q2, [x28, #2176]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
Expand Down
2 changes: 1 addition & 1 deletion unittests/InstructionCountCI/FlagM/Secondary.json
Original file line number Diff line number Diff line change
Expand Up @@ -1612,7 +1612,7 @@
"Comment": "0x0f 0xd7",
"ExpectedArm64ASM": [
"ldr d2, [x28, #768]",
"ldr d3, [x28, #2272]",
"ldr d3, [x28, #2288]",
"cmlt v2.16b, v2.16b, #0",
"and v2.16b, v2.16b, v3.16b",
"addp v2.16b, v2.16b, v2.16b",
Expand Down
2 changes: 1 addition & 1 deletion unittests/InstructionCountCI/FlagM/Secondary_OpSize.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"ExpectedInstructionCount": 7,
"Comment": "0x66 0x0f 0xd7",
"ExpectedArm64ASM": [
"ldr q2, [x28, #2272]",
"ldr q2, [x28, #2288]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",
Expand Down
2 changes: 1 addition & 1 deletion unittests/InstructionCountCI/FlagM/VEX_map1.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
"Map 1 0b01 0xd7 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2272]",
"ldr q2, [x28, #2288]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",
Expand Down
16 changes: 8 additions & 8 deletions unittests/InstructionCountCI/FlagM/x87.json
Original file line number Diff line number Diff line change
Expand Up @@ -4516,7 +4516,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2304]",
"ldr q2, [x28, #2320]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand All @@ -4536,7 +4536,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2320]",
"ldr q2, [x28, #2336]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand All @@ -4556,7 +4556,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2336]",
"ldr q2, [x28, #2352]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand All @@ -4576,7 +4576,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2352]",
"ldr q2, [x28, #2368]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand All @@ -4596,7 +4596,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2368]",
"ldr q2, [x28, #2384]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand All @@ -4616,7 +4616,7 @@
"orr w21, w22, w21",
"strb w21, [x28, #1026]",
"strb w20, [x28, #747]",
"ldr q2, [x28, #2384]",
"ldr q2, [x28, #2400]",
"add x0, x28, x20, lsl #4",
"str q2, [x0, #768]"
]
Expand Down Expand Up @@ -4780,7 +4780,7 @@
"eor v2.16b, v2.16b, v2.16b",
"mov v2.d[0], x0",
"mov v2.h[4], w1",
"ldr q3, [x28, #2304]",
"ldr q3, [x28, #2320]",
"mov w21, #0x0",
"strb w21, [x28, #746]",
"add x0, x28, x20, lsl #4",
Expand Down Expand Up @@ -5062,7 +5062,7 @@
"ldr q2, [x0, #768]",
"add x0, x28, x21, lsl #4",
"ldr q3, [x0, #768]",
"ldr q4, [x28, #2304]",
"ldr q4, [x28, #2320]",
"mrs x0, nzcv",
"str w0, [x28, #728]",
"stp x4, x5, [x28, #8]",
Expand Down
2 changes: 1 addition & 1 deletion unittests/InstructionCountCI/H0F38.json
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@
"0x66 0x0f 0x38 0x41"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2048]",
"ldr q2, [x28, #2064]",
"zip1 v3.8h, v2.8h, v17.8h",
"zip2 v2.8h, v2.8h, v17.8h",
"umin v2.4s, v3.4s, v2.4s",
Expand Down
12 changes: 6 additions & 6 deletions unittests/InstructionCountCI/H0F3A.json
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2176]",
"ldr q2, [x28, #2192]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand All @@ -325,7 +325,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2192]",
"ldr q2, [x28, #2208]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand All @@ -344,7 +344,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2208]",
"ldr q2, [x28, #2224]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand All @@ -364,7 +364,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2224]",
"ldr q2, [x28, #2240]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand All @@ -383,7 +383,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2240]",
"ldr q2, [x28, #2256]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand All @@ -393,7 +393,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2256]",
"ldr q2, [x28, #2272]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
Expand Down
8 changes: 4 additions & 4 deletions unittests/InstructionCountCI/PrimaryGroup.json
Original file line number Diff line number Diff line change
Expand Up @@ -2856,7 +2856,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2432]",
"ldr x3, [x28, #2448]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
Expand All @@ -2867,7 +2867,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2448]",
"ldr x3, [x28, #2464]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
Expand Down Expand Up @@ -2928,7 +2928,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2440]",
"ldr x3, [x28, #2456]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
Expand All @@ -2941,7 +2941,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2456]",
"ldr x3, [x28, #2472]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
Expand Down
Loading

0 comments on commit 2cae2f2

Please sign in to comment.