-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Allow sharing call temps within statements #79574
JIT: Allow sharing call temps within statements #79574
Conversation
fgMakeOutgoingArgCopy has a hashset of shared temps that it uses before creating a new temp. However, this hash set is reset only at new statements which can cause regressions when forward sub moves several calls into the same statement. This adds support to allow the temps to be shared also within the same statement. To do this we must take care not to share temps that have overlapping lifetimes.
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch Issue DetailsfgMakeOutgoingArgCopy has a hashset of shared temps that it uses before creating a new temp. However, this hash set is reset only at new statements which can cause regressions when forward sub moves several calls into the same statement. This adds support to allow the temps to be shared also within the same statement. To do this we must take care not to share temps that have overlapping lifetimes. This fixes a regression seen in #79346.
|
The macro expands to multiple loops so "break" does not do what would be expected.
tmp = (unsigned)lclNum; | ||
found = true; | ||
JITDUMP("reusing outgoing struct arg\n"); | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FOREACH_HBV_BIT_SET
expands into 4 nested loops, so this break
is not doing what is expected here.
I have removed the macros and turned it into a callback based function instead to make it harder to make the same mistake in the future.
I first tried turning it into an iterator but it was quite complex and hard to follow when the loops are flattened, so I went with the callback approach instead.
Diffs. I spot checked a few of the regressions and they are all happening when the previous @@ -166,11 +166,11 @@
; V156 tmp131 [V156,T79] ( 2, 1 ) int -> r14 V82._length(offs=0x08) P-INDEP "field V82._length (fldOffset=0x8)"
;* V157 tmp132 [V157 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
;* V158 tmp133 [V158 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
-; V159 tmp134 [V159 ] ( 12, 12 ) struct (16) [rbp+78H] do-not-enreg[XSF] must-init addr-exposed "by-value struct argument"
+; V159 tmp134 [V159 ] ( 18, 18 ) struct (16) [rbp+78H] do-not-enreg[XSF] must-init addr-exposed "by-value struct argument"
; V160 tmp135 [V160 ] ( 18, 18 ) struct (16) [rbp+68H] do-not-enreg[XSF] must-init addr-exposed "by-value struct argument"
; V161 tmp136 [V161,T41] ( 2, 2 ) ref -> rcx single-def "argument with side effect"
; V162 tmp137 [V162,T42] ( 2, 2 ) ref -> rcx single-def "argument with side effect"
-; V163 tmp138 [V163 ] ( 12, 12 ) struct (16) [rbp+58H] do-not-enreg[XSF] must-init addr-exposed "by-value struct argument"
+; V163 tmp138 [V163 ] ( 6, 6 ) struct (16) [rbp+58H] do-not-enreg[XSF] must-init addr-exposed "by-value struct argument"
; V164 tmp139 [V164,T43] ( 2, 2 ) ref -> rcx single-def "argument with side effect"
; V165 tmp140 [V165,T44] ( 2, 2 ) ref -> rcx single-def "argument with side effect"
; V166 GsCookie [V166 ] ( 1, 1 ) long -> [rbp+00H] do-not-enreg[X] addr-exposed "GSSecurityCookie"
@@ -517,13 +517,13 @@ G_M30337_IG26:
call [<unknown method>]
mov ecx, dword ptr [rbp+54H]
call [System.Diagnostics.Debug:Assert(bool)]
- mov bword ptr [rbp+58H], rbx
- mov dword ptr [rbp+60H], r14d
+ mov bword ptr [rbp+78H], rbx
+ mov dword ptr [rbp+80H], r14d
mov rbx, bword ptr [rbp+18H]
mov bword ptr [rbp+68H], rbx
mov r14d, dword ptr [rbp+8CH]
mov dword ptr [rbp+70H], r14d
- lea rcx, [rbp+58H]
+ lea rcx, [rbp+78H]
lea rdx, [rbp+68H]
call [System.Numerics.BigIntegerCalculator:Square(System.ReadOnlySpan`1[uint],System.Span`1[uint])]
mov r12, gword ptr [rbp+40H]
@@ -538,7 +538,7 @@ G_M30337_IG26:
xor r8d, r8d
cmp dword ptr [rcx], ecx
call [<unknown method>]
- ;; size=162 bbWeight=0.50 PerfScore 19.50
+ ;; size=165 bbWeight=0.50 PerfScore 19.50
G_M30337_IG27:
mov ecx, dword ptr [rbp+50H]
call [System.Diagnostics.Debug:Assert(bool)]
@@ -549,15 +549,15 @@ G_M30337_IG27:
not ecx
shr ecx, 31
call [System.Diagnostics.Debug:Assert(bool)]
- mov bword ptr [rbp+58H], r12
+ mov bword ptr [rbp+78H], r12
mov r12d, dword ptr [rbp+88H]
- mov dword ptr [rbp+60H], r12d
- mov bword ptr [rbp+78H], rsi
- mov dword ptr [rbp+80H], r15d
+ mov dword ptr [rbp+80H], r12d
+ mov bword ptr [rbp+58H], rsi
+ mov dword ptr [rbp+60H], r15d
mov bword ptr [rbp+68H], rbx
mov dword ptr [rbp+70H], r14d
- lea rcx, [rbp+58H]
- lea rdx, [rbp+78H]
+ lea rcx, [rbp+78H]
+ lea rdx, [rbp+58H]
lea r8, [rbp+68H]
call [<unknown method>]
cmp r13d, edi
@@ -573,10 +573,10 @@ G_M30337_IG27:
call [System.Diagnostics.Debug:Assert(bool)]
mov bword ptr [rbp+68H], rsi
mov dword ptr [rbp+70H], edi
- mov bword ptr [rbp+58H], rbx
- mov dword ptr [rbp+60H], r14d
+ mov bword ptr [rbp+78H], rbx
+ mov dword ptr [rbp+80H], r14d
lea rcx, [rbp+68H]
- lea rdx, [rbp+58H]
+ lea rdx, [rbp+78H]
call [System.Numerics.BigIntegerCalculator:AddSelf(System.Span`1[uint],System.ReadOnlySpan`1[uint])]
mov rsi, gword ptr [rbp+38H]
test rsi, rsi
@@ -590,7 +590,7 @@ G_M30337_IG27:
xor r8d, r8d
cmp dword ptr [rcx], ecx
call [<unknown method>]
- ;; size=217 bbWeight=0.50 PerfScore 26.62
+ ;; size=220 bbWeight=0.50 PerfScore 26.62
G_M30337_IG28:
mov rcx, 0xD1FFAB1E
cmp qword ptr [rbp], rcx
@@ -621,11 +621,11 @@ G_M30337_IG32:
int3
;; size=6 bbWeight=0 PerfScore 0.00
-; Total bytes of code 1505, prolog size 68, PerfScore 575.98, instruction count 372, allocated bytes for code 1509 (MethodHash=b62d897e) for method System.Numerics.BigIntegerCalculator:Square(System.ReadOnlySpan`1[uint],System.Span`1[uint])
+; Total bytes of code 1511, prolog size 68, PerfScore 576.58, instruction count 372, allocated bytes for code 1515 (MethodHash=b62d897e) for method System.Numerics.BigIntegerCalculator:Square(System.ReadOnlySpan`1[uint],System.Span`1[uint]) Notice that we have more references to the "lower-numbered" local now, but it is allocated in the local frame right around the offset where we stop being able to use the smaller encoding for accesses. |
/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress |
/azp run Fuzzlyn |
Azure Pipelines successfully started running 1 pipeline(s). |
Azure Pipelines successfully started running 2 pipeline(s). |
cc @dotnet/jit-contrib PTAL @EgorBo |
Stress failure is known. The Fuzzlyn failures were #79543. |
Ping @EgorBo |
Going to rerun the CI on this to get fresh diffs. |
Diffs are the same as above, mostly some minor TP and CQ improvements, with a few regressions due to what I mentioned above. |
fgMakeOutgoingArgCopy has a hashset of shared temps that it uses before creating a new temp. However, this hash set is reset only at new statements which can cause regressions when forward sub moves several calls into the same statement.
This adds support to allow the temps to be shared also within the same statement. To do this we must take care not to share temps that have overlapping lifetimes.
This fixes a regression seen in #79346.