Skip to content

Commit

Permalink
[NativeAOT] Save full ARM64 SIMD arg registers in UniversalTransition (
Browse files Browse the repository at this point in the history
…#74888)

* Save full ARM64 SIMD arg registers in UniversalTransition

* remove unused SAVE/RESTORE argument macros
  • Loading branch information
VSadov authored Sep 1, 2022
1 parent ab2d195 commit 5e7cfbf
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 89 deletions.
14 changes: 7 additions & 7 deletions src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1105,13 +1105,13 @@ struct UniversalTransitionStackFrame
// Conservative GC reporting must be applied to everything between the base of the
// ReturnBlock and the top of the StackPassedArgs.
private:
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0C0 (0x08 bytes) (fp)
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0B8 (0x08 bytes) (lr)
uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-0B0 (0x40 bytes) (d0-d7)
uintptr_t m_returnBlock[4]; // ChildSP+050 CallerSP-070 (0x40 bytes)
uintptr_t m_intArgRegs[9]; // ChildSP+070 CallerSP-050 (0x48 bytes) (x0-x8)
uintptr_t m_alignmentPad; // ChildSP+0B8 CallerSP-008 (0x08 bytes)
uintptr_t m_stackPassedArgs[1]; // ChildSP+0C0 CallerSP+000 (unknown size)
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-100 (0x08 bytes) (fp)
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0F8 (0x08 bytes) (lr)
Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0F0 (0x80 bytes) (q0-q7)
uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-070 (0x20 bytes)
uintptr_t m_intArgRegs[9]; // ChildSP+0B0 CallerSP-050 (0x48 bytes) (x0-x8)
uintptr_t m_alignmentPad; // ChildSP+0F8 CallerSP-008 (0x08 bytes)
uintptr_t m_stackPassedArgs[1]; // ChildSP+100 CallerSP+000 (unknown size)

public:
PTR_UIntNative get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
//
// In the absence of trashing, such bugs can become undetectable if the code that
// dispatches the call happens to never touch the impacted argument register (e.g., xmm3 on
// amd64 or d5 on arm32). In such a case, the original enregistered argument will flow
// amd64 or q5 on arm64). In such a case, the original enregistered argument will flow
// unmodified into the eventual callee, obscuring the fact that the dispatcher failed to
// propagate the transition frame copy of this register.
//
Expand Down
42 changes: 21 additions & 21 deletions src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define RETURN_BLOCK_SIZE (32)

#define COUNT_FLOAT_ARG_REGISTERS (8)
#define FLOAT_REGISTER_SIZE (8)
#define FLOAT_REGISTER_SIZE (16)
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)

#define PUSHED_LR_SIZE (8)
Expand All @@ -50,7 +50,7 @@
//
// RhpUniversalTransition
//
// At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
// At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
//
// In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
// xip0 will contain the managed function that is to be called by this transition function
Expand All @@ -63,16 +63,16 @@
//
// Frame layout is:
//
// {StackPassedArgs} ChildSP+0C0 CallerSP+000
// {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
// {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
// {StackPassedArgs} ChildSP+100 CallerSP+000
// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
// {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
// layout of all pieces of the frame that lie at or above the pushed floating point registers.
// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
// {PushedLR} ChildSP+008 CallerSP-0B8
// {PushedFP} ChildSP+000 CallerSP-0C0
// {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
// {PushedLR} ChildSP+008 CallerSP-0F8
// {PushedFP} ChildSP+000 CallerSP-100
//
// NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
// must be updated as well.
Expand All @@ -95,10 +95,10 @@
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -STACK_SIZE // ;; Push down stack pointer and store FP and LR

// Floating point registers
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

// Space for return buffer data (RETURN_BLOCK_SIZE = 0x20 bytes)

Expand All @@ -112,10 +112,10 @@
#ifdef TRASH_SAVED_ARGUMENT_REGISTERS
PREPARE_EXTERNAL_VAR RhpFpTrashValues, x1

ldp d0,d1, [x1, 0x0]
ldp d2,d3, [x1, 0x10]
ldp d4,d5, [x1, 0x20]
ldp d6,d7, [x1, 0x30]
ldp q0,q1, [x1, 0x0]
ldp q2,q3, [x1, 0x20]
ldp q4,q5, [x1, 0x40]
ldp q6,q7, [x1, 0x60]

PREPARE_EXTERNAL_VAR RhpIntegerTrashValues, x1

Expand All @@ -139,10 +139,10 @@
mov x12, x0

// Restore floating point registers
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

// Restore the argument registers
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]
Expand Down
34 changes: 17 additions & 17 deletions src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define RETURN_BLOCK_SIZE (32)

#define COUNT_FLOAT_ARG_REGISTERS (8)
#define FLOAT_REGISTER_SIZE (8)
#define FLOAT_REGISTER_SIZE (16)
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)

#define PUSHED_LR_SIZE (8)
Expand Down Expand Up @@ -51,7 +51,7 @@
;;
;; RhpUniversalTransition
;;
;; At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
;; At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
;;
;; In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
;; xip0 will contain the managed function that is to be called by this transition function
Expand All @@ -64,16 +64,16 @@
;;
;; Frame layout is:
;;
;; {StackPassedArgs} ChildSP+0C0 CallerSP+000
;; {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
;; {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
;; {StackPassedArgs} ChildSP+100 CallerSP+000
;; {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
;; {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
;; -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
;; in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
;; layout of all pieces of the frame that lie at or above the pushed floating point registers.
;; {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
;; {PushedLR} ChildSP+008 CallerSP-0B8
;; {PushedFP} ChildSP+000 CallerSP-0C0
;; {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
;; {PushedLR} ChildSP+008 CallerSP-0F8
;; {PushedFP} ChildSP+000 CallerSP-100
;;
;; NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
;; must be updated as well.
Expand All @@ -97,10 +97,10 @@
PROLOG_SAVE_REG_PAIR fp, lr, #-STACK_SIZE! ;; Push down stack pointer and store FP and LR

;; Floating point registers
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

;; Space for return buffer data (RETURN_BLOCK_SIZE = 0x20 bytes)

Expand Down Expand Up @@ -130,10 +130,10 @@
mov x12, x0

;; Restore floating point registers
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

;; Restore the argument registers
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]
Expand Down
43 changes: 0 additions & 43 deletions src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -164,49 +164,6 @@ C_FUNC(\Name):
brk #0
.endm

//-----------------------------------------------------------------------------
// The Following sets of SAVE_*_REGISTERS expect the memory to be reserved and
// base address to be passed in $reg
//

// SAVE_ARGUMENT_REGISTERS reg, ofs
//
// Stores the integer argument registers x0-x7 as four consecutive 16-byte pairs
// starting at [reg + ofs].
//
//   reg - base address register of the save area
//   ofs - byte offset from reg at which x0 is stored
//
// Reserve 64 bytes of memory (reg+ofs .. reg+ofs+63) before calling SAVE_ARGUMENT_REGISTERS.
// Note: only x0-x7 are written; x8 is not saved by this macro.
.macro SAVE_ARGUMENT_REGISTERS reg, ofs

stp x0, x1, [\reg, #(\ofs)]
stp x2, x3, [\reg, #(\ofs + 16)]
stp x4, x5, [\reg, #(\ofs + 32)]
stp x6, x7, [\reg, #(\ofs + 48)]

.endm

// SAVE_FLOAT_ARGUMENT_REGISTERS reg, ofs
//
// Stores the floating point argument registers d0-d7 as four consecutive 16-byte pairs
// starting at [reg + ofs].
//
//   reg - base address register of the save area
//   ofs - byte offset from reg at which d0 is stored
//
// Reserve 64 bytes of memory (reg+ofs .. reg+ofs+63) before calling SAVE_FLOAT_ARGUMENT_REGISTERS.
// Note: the d-register views are used, so only the low 64 bits of each SIMD register are
// saved; the upper halves of q0-q7 are not preserved by this macro.
.macro SAVE_FLOAT_ARGUMENT_REGISTERS reg, ofs

stp d0, d1, [\reg, #(\ofs)]
stp d2, d3, [\reg, #(\ofs + 16)]
stp d4, d5, [\reg, #(\ofs + 32)]
stp d6, d7, [\reg, #(\ofs + 48)]

.endm

// RESTORE_ARGUMENT_REGISTERS reg, ofs
//
// Loads the integer argument registers x0-x7 from four consecutive 16-byte pairs
// starting at [reg + ofs] (the same layout SAVE_ARGUMENT_REGISTERS writes).
//
//   reg - base address register of the save area
//   ofs - byte offset from reg at which x0 was stored
//
// Note: x8 is not reloaded by this macro.
.macro RESTORE_ARGUMENT_REGISTERS reg, ofs

ldp x0, x1, [\reg, #(\ofs)]
ldp x2, x3, [\reg, #(\ofs + 16)]
ldp x4, x5, [\reg, #(\ofs + 32)]
ldp x6, x7, [\reg, #(\ofs + 48)]

.endm

// RESTORE_FLOAT_ARGUMENT_REGISTERS reg, ofs
//
// Loads the floating point argument registers d0-d7 from four consecutive 16-byte pairs
// starting at [reg + ofs] (the same layout SAVE_FLOAT_ARGUMENT_REGISTERS writes).
//
//   reg - base address register of the save area
//   ofs - byte offset from reg at which d0 was stored
//
// Note: only the 64-bit d-register views are reloaded, so this macro does not restore
// the original upper 64 bits of q0-q7.
.macro RESTORE_FLOAT_ARGUMENT_REGISTERS reg, ofs

ldp d0, d1, [\reg, #(\ofs)]
ldp d2, d3, [\reg, #(\ofs + 16)]
ldp d4, d5, [\reg, #(\ofs + 32)]
ldp d6, d7, [\reg, #(\ofs + 48)]

.endm

.macro EPILOG_BRANCH_REG reg

br \reg
Expand Down

0 comments on commit 5e7cfbf

Please sign in to comment.