From cfeedc683efcda5872d15dbea452504852a5b095 Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Wed, 31 Aug 2022 13:04:25 -0700 Subject: [PATCH] Save full ARM64 SIMD arg registers in UniversalTransition --- .../nativeaot/Runtime/StackFrameIterator.cpp | 14 +++---- .../Runtime/UniversalTransitionHelpers.cpp | 2 +- .../Runtime/arm64/UniversalTransition.S | 42 +++++++++---------- .../Runtime/arm64/UniversalTransition.asm | 34 +++++++-------- 4 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index 2fcceff6d0dd85..3da12598ba196f 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -1105,13 +1105,13 @@ struct UniversalTransitionStackFrame // Conservative GC reporting must be applied to everything between the base of the // ReturnBlock and the top of the StackPassedArgs. 
private: - uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0C0 (0x08 bytes) (fp) - uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0B8 (0x08 bytes) (lr) - uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-0B0 (0x40 bytes) (d0-d7) - uintptr_t m_returnBlock[4]; // ChildSP+050 CallerSP-070 (0x40 bytes) - uintptr_t m_intArgRegs[9]; // ChildSP+070 CallerSP-050 (0x48 bytes) (x0-x8) - uintptr_t m_alignmentPad; // ChildSP+0B8 CallerSP-008 (0x08 bytes) - uintptr_t m_stackPassedArgs[1]; // ChildSP+0C0 CallerSP+000 (unknown size) + uintptr_t m_pushedFP; // ChildSP+000 CallerSP-100 (0x08 bytes) (fp) + uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0F8 (0x08 bytes) (lr) + Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0F0 (0x80 bytes) (q0-q7) + uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-070 (0x20 bytes) + uintptr_t m_intArgRegs[9]; // ChildSP+0B0 CallerSP-050 (0x48 bytes) (x0-x8) + uintptr_t m_alignmentPad; // ChildSP+0F8 CallerSP-008 (0x08 bytes) + uintptr_t m_stackPassedArgs[1]; // ChildSP+100 CallerSP+000 (unknown size) public: PTR_UIntNative get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); } diff --git a/src/coreclr/nativeaot/Runtime/UniversalTransitionHelpers.cpp b/src/coreclr/nativeaot/Runtime/UniversalTransitionHelpers.cpp index 30a1ff269290ab..649aac21ac8d18 100644 --- a/src/coreclr/nativeaot/Runtime/UniversalTransitionHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/UniversalTransitionHelpers.cpp @@ -21,7 +21,7 @@ // // In the absence of trashing, such bugs can become undetectable if the code that // dispatches the call happens to never touch the impacted argument register (e.g., xmm3 on -// amd64 or d5 on arm32). In such a case, the original enregistered argument will flow +// amd64 or q5 on arm64). In such a case, the original enregistered argument will flow // unmodified into the eventual callee, obscuring the fact that the dispatcher failed to // propagate the transition frame copy of this register. 
// diff --git a/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S index 12fa42365f4c2e..8274b6b1110a6a 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S @@ -23,7 +23,7 @@ #define RETURN_BLOCK_SIZE (32) #define COUNT_FLOAT_ARG_REGISTERS (8) -#define FLOAT_REGISTER_SIZE (8) +#define FLOAT_REGISTER_SIZE (16) #define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE) #define PUSHED_LR_SIZE (8) @@ -50,7 +50,7 @@ // // RhpUniversalTransition // -// At input to this function, x0-8, d0-7 and the stack may contain any number of arguments. +// At input to this function, x0-8, q0-7 and the stack may contain any number of arguments. // // In addition, there are 2 extra arguments passed in the intra-procedure-call scratch register: // xip0 will contain the managed function that is to be called by this transition function @@ -63,16 +63,16 @@ // // Frame layout is: // -// {StackPassedArgs} ChildSP+0C0 CallerSP+000 -// {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008 -// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050 -// {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070 +// {StackPassedArgs} ChildSP+100 CallerSP+000 +// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008 +// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050 +// {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070 // -- The base address of the Return block is the TransitionBlock pointer, the floating point args are // in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact // layout of all pieces of the frame that lie at or above the pushed floating point registers. 
-// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0 -// {PushedLR} ChildSP+008 CallerSP-0B8 -// {PushedFP} ChildSP+000 CallerSP-0C0 +// {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0 +// {PushedLR} ChildSP+008 CallerSP-0F8 +// {PushedFP} ChildSP+000 CallerSP-100 // // NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure // must be updated as well. @@ -95,10 +95,10 @@ PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -STACK_SIZE // ;; Push down stack pointer and store FP and LR // Floating point registers - stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )] - stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)] - stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)] - stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)] + stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )] + stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)] + stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)] + stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)] // Space for return buffer data (0x40 bytes) @@ -112,10 +112,10 @@ #ifdef TRASH_SAVED_ARGUMENT_REGISTERS PREPARE_EXTERNAL_VAR RhpFpTrashValues, x1 - ldp d0,d1, [x1, 0x0] - ldp d2,d3, [x1, 0x10] - ldp d4,d5, [x1, 0x20] - ldp d6,d7, [x1, 0x30] + ldp q0,q1, [x1, 0x0] + ldp q2,q3, [x1, 0x20] + ldp q4,q5, [x1, 0x40] + ldp q6,q7, [x1, 0x60] PREPARE_EXTERNAL_VAR RhpIntegerTrashValues, x1 @@ -139,10 +139,10 @@ mov x12, x0 // Restore floating point registers - ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )] - ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)] - ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)] - ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)] + ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )] + ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)] + ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)] + ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)] // Restore the argument registers ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )] diff --git a/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm b/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm index 6f1fc0953cd985..2e23ea4302a4fc 100644 --- 
a/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm +++ b/src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm @@ -23,7 +23,7 @@ #define RETURN_BLOCK_SIZE (32) #define COUNT_FLOAT_ARG_REGISTERS (8) -#define FLOAT_REGISTER_SIZE (8) +#define FLOAT_REGISTER_SIZE (16) #define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE) #define PUSHED_LR_SIZE (8) @@ -51,7 +51,7 @@ ;; ;; RhpUniversalTransition ;; -;; At input to this function, x0-8, d0-7 and the stack may contain any number of arguments. +;; At input to this function, x0-8, q0-7 and the stack may contain any number of arguments. ;; ;; In addition, there are 2 extra arguments passed in the intra-procedure-call scratch register: ;; xip0 will contain the managed function that is to be called by this transition function @@ -64,16 +64,16 @@ ;; ;; Frame layout is: ;; -;; {StackPassedArgs} ChildSP+0C0 CallerSP+000 -;; {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008 -;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050 -;; {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070 +;; {StackPassedArgs} ChildSP+100 CallerSP+000 +;; {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008 +;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050 +;; {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070 ;; -- The base address of the Return block is the TransitionBlock pointer, the floating point args are ;; in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact ;; layout of all pieces of the frame that lie at or above the pushed floating point registers. 
-;; {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0 -;; {PushedLR} ChildSP+008 CallerSP-0B8 -;; {PushedFP} ChildSP+000 CallerSP-0C0 +;; {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0 +;; {PushedLR} ChildSP+008 CallerSP-0F8 +;; {PushedFP} ChildSP+000 CallerSP-100 ;; ;; NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure ;; must be updated as well. @@ -97,10 +97,10 @@ PROLOG_SAVE_REG_PAIR fp, lr, #-STACK_SIZE! ;; Push down stack pointer and store FP and LR ;; Floating point registers - stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )] - stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)] - stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)] - stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)] + stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )] + stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)] + stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)] + stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)] ;; Space for return buffer data (0x40 bytes) @@ -130,10 +130,10 @@ mov x12, x0 ;; Restore floating point registers - ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )] - ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)] - ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)] - ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)] + ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )] + ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)] + ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)] + ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)] ;; Restore the argument registers ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]