Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NativeAOT] Save full ARM64 SIMD arg registers in UniversalTransition #74888

Merged
merged 2 commits into from
Sep 1, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1105,13 +1105,13 @@ struct UniversalTransitionStackFrame
// Conservative GC reporting must be applied to everything between the base of the
// ReturnBlock and the top of the StackPassedArgs.
private:
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0C0 (0x08 bytes) (fp)
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0B8 (0x08 bytes) (lr)
uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-0B0 (0x40 bytes) (d0-d7)
uintptr_t m_returnBlock[4]; // ChildSP+050 CallerSP-070 (0x40 bytes)
uintptr_t m_intArgRegs[9]; // ChildSP+070 CallerSP-050 (0x48 bytes) (x0-x8)
uintptr_t m_alignmentPad; // ChildSP+0B8 CallerSP-008 (0x08 bytes)
uintptr_t m_stackPassedArgs[1]; // ChildSP+0C0 CallerSP+000 (unknown size)
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-100 (0x08 bytes) (fp)
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0F8 (0x08 bytes) (lr)
Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0F0 (0x80 bytes) (q0-q7)
uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-070 (0x20 bytes)
uintptr_t m_intArgRegs[9]; // ChildSP+0B0 CallerSP-050 (0x48 bytes) (x0-x8)
uintptr_t m_alignmentPad; // ChildSP+0F8 CallerSP-008 (0x08 bytes)
uintptr_t m_stackPassedArgs[1]; // ChildSP+100 CallerSP+000 (unknown size)

public:
PTR_UIntNative get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
//
// In the absence of trashing, such bugs can become undetectable if the code that
// dispatches the call happens to never touch the impacted argument register (e.g., xmm3 on
// amd64 or d5 on arm32). In such a case, the original enregistered argument will flow
// amd64 or q5 on arm64). In such a case, the original enregistered argument will flow
// unmodified into the eventual callee, obscuring the fact that the dispatcher failed to
// propagate the transition frame copy of this register.
//
Expand Down
42 changes: 21 additions & 21 deletions src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define RETURN_BLOCK_SIZE (32)

#define COUNT_FLOAT_ARG_REGISTERS (8)
#define FLOAT_REGISTER_SIZE (8)
#define FLOAT_REGISTER_SIZE (16)
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)

#define PUSHED_LR_SIZE (8)
Expand All @@ -50,7 +50,7 @@
//
// RhpUniversalTransition
//
// At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
// At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
//
// In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
// xip0 will contain the managed function that is to be called by this transition function
Expand All @@ -63,16 +63,16 @@
//
// Frame layout is:
//
// {StackPassedArgs} ChildSP+0C0 CallerSP+000
// {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
// {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
// {StackPassedArgs} ChildSP+100 CallerSP+000
// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
// {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
// layout of all pieces of the frame that lie at or above the pushed floating point registers.
// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
// {PushedLR} ChildSP+008 CallerSP-0B8
// {PushedFP} ChildSP+000 CallerSP-0C0
// {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
// {PushedLR} ChildSP+008 CallerSP-0F8
// {PushedFP} ChildSP+000 CallerSP-100
//
// NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
// must be updated as well.
Expand All @@ -95,10 +95,10 @@
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -STACK_SIZE // ;; Push down stack pointer and store FP and LR

// Floating point registers
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

// Space for return buffer data (0x20 bytes, RETURN_BLOCK_SIZE)

Expand All @@ -112,10 +112,10 @@
#ifdef TRASH_SAVED_ARGUMENT_REGISTERS
PREPARE_EXTERNAL_VAR RhpFpTrashValues, x1

ldp d0,d1, [x1, 0x0]
ldp d2,d3, [x1, 0x10]
ldp d4,d5, [x1, 0x20]
ldp d6,d7, [x1, 0x30]
ldp q0,q1, [x1, 0x0]
ldp q2,q3, [x1, 0x20]
ldp q4,q5, [x1, 0x40]
ldp q6,q7, [x1, 0x60]

PREPARE_EXTERNAL_VAR RhpIntegerTrashValues, x1

Expand All @@ -139,10 +139,10 @@
mov x12, x0

// Restore floating point registers
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

// Restore the argument registers
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]
Expand Down
34 changes: 17 additions & 17 deletions src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define RETURN_BLOCK_SIZE (32)

#define COUNT_FLOAT_ARG_REGISTERS (8)
#define FLOAT_REGISTER_SIZE (8)
#define FLOAT_REGISTER_SIZE (16)
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)

#define PUSHED_LR_SIZE (8)
Expand Down Expand Up @@ -51,7 +51,7 @@
;;
;; RhpUniversalTransition
;;
;; At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
;; At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
;;
;; In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
;; xip0 will contain the managed function that is to be called by this transition function
Expand All @@ -64,16 +64,16 @@
;;
;; Frame layout is:
;;
;; {StackPassedArgs} ChildSP+0C0 CallerSP+000
;; {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
;; {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
;; {StackPassedArgs} ChildSP+100 CallerSP+000
;; {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
;; {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
;; -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
;; in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
;; layout of all pieces of the frame that lie at or above the pushed floating point registers.
;; {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
;; {PushedLR} ChildSP+008 CallerSP-0B8
;; {PushedFP} ChildSP+000 CallerSP-0C0
;; {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
;; {PushedLR} ChildSP+008 CallerSP-0F8
;; {PushedFP} ChildSP+000 CallerSP-100
;;
;; NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
;; must be updated as well.
Expand All @@ -97,10 +97,10 @@
PROLOG_SAVE_REG_PAIR fp, lr, #-STACK_SIZE! ;; Push down stack pointer and store FP and LR

;; Floating point registers
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

;; Space for return buffer data (0x20 bytes, RETURN_BLOCK_SIZE)

Expand Down Expand Up @@ -130,10 +130,10 @@
mov x12, x0

;; Restore floating point registers
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]

;; Restore the argument registers
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]
Expand Down