-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Add a pass of early liveness and use it for forward sub and last-use copy elision for implicit byrefs #79346
Conversation
Requires more handling for memory liveness
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch Issue DetailsThis runs a pass of liveness right after local morph and uses it for forward sub and to omit copies of structs when passed as implicit byrefs at their last use. Presumably there's other optimization opportunities as well. TP impact quite high but I have lots of ideas to optimize this.
|
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
Again looks like a forward sub results in different execution order, which results in worse register allocation (we need another callee-save). @@ -9,32 +9,34 @@
;
; V00 RetBuf [V00,T01] ( 5, 5 ) byref -> rsi single-def
; V01 arg0 [V01,T00] ( 4, 8 ) byref -> rdx single-def
-; V02 loc0 [V02,T02] ( 4, 4 ) double -> mm0
-; V03 loc1 [V03,T05] ( 3, 3 ) double -> mm1
-; V04 loc2 [V04,T08] ( 2, 2 ) double -> mm6
-; V05 loc3 [V05,T09] ( 2, 2 ) double -> mm7
+; V02 loc0 [V02,T02] ( 4, 4 ) double -> mm6
+; V03 loc1 [V03,T05] ( 3, 3 ) double -> mm7
+; V04 loc2 [V04,T08] ( 2, 2 ) double -> mm9
+;* V05 loc3 [V05 ] ( 0, 0 ) double -> zero-ref
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+00H] "OutgoingArgSpace"
;* V07 tmp1 [V07 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "NewObj constructor temp"
-; V08 tmp2 [V08,T03] ( 2, 4 ) double -> mm7 "Inlining Arg"
+; V08 tmp2 [V08,T03] ( 2, 4 ) double -> mm6 "Inlining Arg"
; V09 tmp3 [V09,T04] ( 2, 4 ) double -> mm0 "Inlining Arg"
; V10 tmp4 [V10,T06] ( 3, 3 ) double -> [rsp+28H] V14.m_real(offs=0x00) P-INDEP "field V01.m_real (fldOffset=0x0)"
-; V11 tmp5 [V11,T10] ( 2, 2 ) double -> mm1 V14.m_imaginary(offs=0x08) P-INDEP "field V01.m_imaginary (fldOffset=0x8)"
-; V12 tmp6 [V12,T11] ( 2, 2 ) double -> mm7 V07.m_real(offs=0x00) P-INDEP "field V07.m_real (fldOffset=0x0)"
-; V13 tmp7 [V13,T12] ( 2, 2 ) double -> mm0 V07.m_imaginary(offs=0x08) P-INDEP "field V07.m_imaginary (fldOffset=0x8)"
+; V11 tmp5 [V11,T09] ( 2, 2 ) double -> mm1 V14.m_imaginary(offs=0x08) P-INDEP "field V01.m_imaginary (fldOffset=0x8)"
+; V12 tmp6 [V12,T10] ( 2, 2 ) double -> mm6 V07.m_real(offs=0x00) P-INDEP "field V07.m_real (fldOffset=0x0)"
+; V13 tmp7 [V13,T11] ( 2, 2 ) double -> mm0 V07.m_imaginary(offs=0x08) P-INDEP "field V07.m_imaginary (fldOffset=0x8)"
;* V14 tmp8 [V14 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
-; V15 cse0 [V15,T07] ( 3, 3 ) double -> mm3 "CSE - aggressive"
+; V15 cse0 [V15,T07] ( 3, 3 ) double -> mm8 "CSE - aggressive"
;
-; Lcl frame size = 80
+; Lcl frame size = 112
G_M28178_IG01: ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
push rsi
- sub rsp, 80
+ sub rsp, 112
vzeroupper
- vmovaps xmmword ptr [rsp+40H], xmm6
- vmovaps xmmword ptr [rsp+30H], xmm7
+ vmovaps xmmword ptr [rsp+60H], xmm6
+ vmovaps xmmword ptr [rsp+50H], xmm7
+ vmovaps xmmword ptr [rsp+40H], xmm8
+ vmovaps xmmword ptr [rsp+30H], xmm9
mov rsi, rcx
; byrRegs +[rsi]
- ;; size=23 bbWeight=1 PerfScore 6.50
+ ;; size=35 bbWeight=1 PerfScore 10.50
G_M28178_IG02: ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000044 {rdx rsi}, byref
; byrRegs +[rdx]
vmovsd xmm0, qword ptr [rdx]
@@ -44,38 +46,41 @@ G_M28178_IG02: ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000044 {r
call <unknown method>
; byrRegs -[rdx]
; gcr arg pop 0
- vmovsd xmm1, qword ptr [reloc @RWD00]
- vdivsd xmm1, xmm1, xmm0
- vsubsd xmm2, xmm0, xmm1
- vmovsd xmm3, qword ptr [reloc @RWD08]
- vmulsd xmm6, xmm2, xmm3
- vaddsd xmm0, xmm0, xmm1
- vmulsd xmm7, xmm0, xmm3
+ vmovaps xmm6, xmm0
+ vmovsd xmm0, qword ptr [reloc @RWD00]
+ vdivsd xmm7, xmm0, xmm6
+ vsubsd xmm0, xmm6, xmm7
+ vmovsd xmm8, qword ptr [reloc @RWD08]
+ vmulsd xmm9, xmm0, xmm8
vmovsd xmm0, qword ptr [rsp+28H]
call <unknown method>
; gcr arg pop 0
- vmulsd xmm7, xmm0, xmm7
+ vaddsd xmm1, xmm6, xmm7
+ vmulsd xmm1, xmm1, xmm8
+ vmulsd xmm6, xmm0, xmm1
vmovsd xmm0, qword ptr [rsp+28H]
call <unknown method>
; gcr arg pop 0
- vmulsd xmm0, xmm0, xmm6
- vmovsd qword ptr [rsi], xmm7
+ vmulsd xmm0, xmm0, xmm9
+ vmovsd qword ptr [rsi], xmm6
vmovsd qword ptr [rsi+08H], xmm0
mov rax, rsi
; byrRegs +[rax]
- ;; size=102 bbWeight=1 PerfScore 58.50
+ ;; size=109 bbWeight=1 PerfScore 58.75
G_M28178_IG03: ; bbWeight=1, epilog, nogc, extend
- vmovaps xmm6, xmmword ptr [rsp+40H]
- vmovaps xmm7, xmmword ptr [rsp+30H]
- add rsp, 80
+ vmovaps xmm6, xmmword ptr [rsp+60H]
+ vmovaps xmm7, xmmword ptr [rsp+50H]
+ vmovaps xmm8, xmmword ptr [rsp+40H]
+ vmovaps xmm9, xmmword ptr [rsp+30H]
+ add rsp, 112
pop rsi
ret
- ;; size=18 bbWeight=1 PerfScore 9.75
+ ;; size=30 bbWeight=1 PerfScore 17.75
RWD00 dq 3FF0000000000000h ; 1
RWD08 dq 3FE0000000000000h ; 0.5
-; Total bytes of code 143, prolog size 20, PerfScore 89.05, instruction count 32, allocated bytes for code 143 (MethodHash=72b691ed) for method System.Numerics.Complex:Sin(System.Numerics.Complex):System.Numerics.Complex
+; Total bytes of code 174, prolog size 32, PerfScore 104.40, instruction count 37, allocated bytes for code 174 (MethodHash=72b691ed) for method System.Numerics.Complex:Sin(System.Numerics.Complex):System.Numerics.Complex
; ============================================================ Potentially |
@@ -7,32 +7,32 @@
; 0 inlinees with PGO data; 16 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
-; V00 this [V00,T08] ( 6, 6 ) byref -> rcx this single-def
-; V01 loc0 [V01,T24] ( 2, 2 ) long -> rdx
+; V00 this [V00,T00] ( 6, 6 ) byref -> rcx this single-def
+;* V01 loc0 [V01 ] ( 0, 0 ) long -> zero-ref
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+00H] "OutgoingArgSpace"
-; V03 tmp1 [V03,T09] ( 2, 4 ) long -> rdx "impAppendStmt"
-; V04 tmp2 [V04,T10] ( 2, 4 ) long -> rdx "impAppendStmt"
-; V05 tmp3 [V05,T11] ( 2, 4 ) long -> rdx "impAppendStmt"
-; V06 tmp4 [V06,T12] ( 2, 4 ) long -> rdx "Inlining Arg"
-; V07 tmp5 [V07,T13] ( 2, 4 ) long -> r9 "Inlining Arg"
-; V08 tmp6 [V08,T14] ( 2, 4 ) long -> r10 "Inlining Arg"
-; V09 tmp7 [V09,T15] ( 2, 4 ) long -> r10 "Inlining Arg"
-; V10 tmp8 [V10,T00] ( 6, 12 ) long -> rdx "Inlining Arg"
-; V11 tmp9 [V11,T16] ( 2, 4 ) long -> rax "Inlining Arg"
-; V12 tmp10 [V12,T01] ( 6, 12 ) long -> rax "Inlining Arg"
-; V13 tmp11 [V13,T02] ( 6, 12 ) long -> rax "Inlining Arg"
-; V14 tmp12 [V14,T17] ( 2, 4 ) long -> r8 "Inlining Arg"
-; V15 tmp13 [V15,T03] ( 6, 12 ) long -> rax "Inlining Arg"
-; V16 tmp14 [V16,T04] ( 6, 12 ) long -> rax "Inlining Arg"
-; V17 tmp15 [V17,T18] ( 2, 4 ) long -> r9 "Inlining Arg"
-; V18 tmp16 [V18,T05] ( 6, 12 ) long -> rdx "Inlining Arg"
-; V19 tmp17 [V19,T06] ( 6, 12 ) long -> rax "Inlining Arg"
-; V20 tmp18 [V20,T19] ( 2, 4 ) long -> rcx "Inlining Arg"
-; V21 tmp19 [V21,T07] ( 6, 12 ) long -> rdx "Inlining Arg"
-; V22 cse0 [V22,T20] ( 3, 3 ) long -> r8 "CSE - moderate"
-; V23 cse1 [V23,T21] ( 3, 3 ) long -> r9 "CSE - moderate"
-; V24 cse2 [V24,T22] ( 3, 3 ) long -> rcx "CSE - moderate"
-; V25 cse3 [V25,T23] ( 3, 3 ) long -> rax "CSE - moderate"
+; V03 tmp1 [V03,T05] ( 2, 4 ) long -> rdx "impAppendStmt"
+; V04 tmp2 [V04,T06] ( 2, 4 ) long -> rdx "impAppendStmt"
+; V05 tmp3 [V05,T07] ( 2, 4 ) long -> rdx "impAppendStmt"
+; V06 tmp4 [V06,T08] ( 2, 4 ) long -> rdx "Inlining Arg"
+; V07 tmp5 [V07,T09] ( 2, 4 ) long -> r9 "Inlining Arg"
+; V08 tmp6 [V08,T10] ( 2, 4 ) long -> r10 "Inlining Arg"
+; V09 tmp7 [V09,T11] ( 2, 4 ) long -> r10 "Inlining Arg"
+; V10 tmp8 [V10,T01] ( 4, 8 ) long -> rdx "Inlining Arg"
+;* V11 tmp9 [V11 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V12 tmp10 [V12,T12] ( 2, 4 ) long -> rax "Inlining Arg"
+; V13 tmp11 [V13,T02] ( 4, 8 ) long -> rax "Inlining Arg"
+;* V14 tmp12 [V14 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V15 tmp13 [V15,T13] ( 2, 4 ) long -> rdx "Inlining Arg"
+; V16 tmp14 [V16,T03] ( 4, 8 ) long -> rax "Inlining Arg"
+;* V17 tmp15 [V17 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V18 tmp16 [V18,T14] ( 2, 4 ) long -> rdx "Inlining Arg"
+; V19 tmp17 [V19,T04] ( 4, 8 ) long -> rax "Inlining Arg"
+;* V20 tmp18 [V20 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V21 tmp19 [V21,T15] ( 2, 4 ) long -> rdx "Inlining Arg"
+; V22 cse0 [V22,T16] ( 3, 3 ) long -> r8 "CSE - moderate"
+; V23 cse1 [V23,T17] ( 3, 3 ) long -> r9 "CSE - moderate"
+; V24 cse2 [V24,T18] ( 3, 3 ) long -> rcx "CSE - moderate"
+; V25 cse3 [V25,T19] ( 3, 3 ) long -> rax "CSE - moderate"
;
; Lcl frame size = 0
@@ -62,39 +62,42 @@ G_M64746_IG02: ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000002 {r
mov r10, 0xD1FFAB1E
imul rax, r10
xor rdx, rax
- imul rdx, r10
+ imul r10, rdx
mov rax, 0xD1FFAB1E
- add rdx, rax
- mov rax, 0xD1FFAB1E
- imul rax, r8
- rol rax, 31
- imul rax, r10
- xor rax, rdx
- imul rax, r10
+ add rax, r10
mov rdx, 0xD1FFAB1E
- add rax, rdx
+ imul rdx, r8
+ rol rdx, 31
+ mov r8, 0xD1FFAB1E
+ imul rdx, r8
+ xor rax, rdx
+ imul r8, rax
+ mov rax, 0xD1FFAB1E
+ add rax, r8
mov rdx, 0xD1FFAB1E
imul rdx, r9
rol rdx, 31
- imul rdx, r10
+ mov r8, 0xD1FFAB1E
+ imul rdx, r8
xor rax, rdx
- imul rax, r10
- mov rdx, 0xD1FFAB1E
- add rax, rdx
+ imul r8, rax
+ mov rax, 0xD1FFAB1E
+ add rax, r8
mov rdx, 0xD1FFAB1E
imul rdx, rcx
rol rdx, 31
- imul rdx, r10
+ mov rcx, 0xD1FFAB1E
+ imul rdx, rcx
xor rax, rdx
- imul rax, r10
+ imul rax, rcx
mov rdx, 0xD1FFAB1E
add rax, rdx
- ;; size=229 bbWeight=1 PerfScore 42.50
+ ;; size=259 bbWeight=1 PerfScore 43.25
G_M64746_IG03: ; bbWeight=1, epilog, nogc, extend
ret
;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code 230, prolog size 0, PerfScore 66.50, instruction count 49, allocated bytes for code 230 (MethodHash=2b3f0315) for method System.IO.Hashing.XxHash64+State:Converge():ulong:this
+; Total bytes of code 260, prolog size 0, PerfScore 70.25, instruction count 52, allocated bytes for code 260 (MethodHash=2b3f0315) for method System.IO.Hashing.XxHash64+State:Converge():ulong:this Looks like same as above, forward sub resulting in worse register allocation. |
The above is a common theme for a lot of the regressions I am spot checking -- forward subbing a tree changes execution order which leads to new and worse register allocations. In particular forward subbing an argument past a call commonly leads to a spill or new callee preserved registers. @AndyAyersMS what do you think we should do here? You closed #75565, presumably partly because of this issue. But I am partial to merge this as-is (assuming the new debug checks I added look ok) and look further at the benchmarks once we get them. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should merge as is and sort through regressions and perhaps checking strategy as follow-ups.
src/coreclr/jit/fgdiagnostic.cpp
Outdated
assert(stmt->GetRootNode()->gtPrev == nullptr); | ||
} | ||
|
||
fgSequenceLocals(stmt); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like to see debug checks that alter the ambient state, since in release these checks won't be running. But I think this is something we can fix later on.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should only modify state when we actually hit the assert, but nevertheless I agree that it isn't pretty. Reworked it to have its own tree walk that sequences into an array.
I agree. There are perhaps some improvements we can make in LSRA to improve preferencing for tree temps and we now have a bunch of motivating examples to look at... |
/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress, Fuzzlyn, runtime-coreclr outerloop |
Azure Pipelines successfully started running 4 pipeline(s). |
The failures are known. |
@jakobbotsch - let's chat offline about this. |
Improvements on Linux-x64: Improvements on Windows-x64: |
@jakobbotsch @SingleAccretion did instructions for this script ever get written up anywhere? |
I think the main instructions are at https://github.com/SingleAccretion/Dotnet-Runtime.Dev/#pinps1---diff-the-tp-impact-measured-as-the-count-of-retired-instructions. |
This runs a pass of liveness right after local morph and uses it for forward sub and to omit copies of structs when passed as implicit byrefs at their last use.
Fix #76069
Fix #75206
Fix #65025
Fix #9839
This PR introduces the following new JIT invariants:
Statement::LocalsTreeList
. This tree list is kept valid starting from local morph and ending with forward sub. There is no memory impact of this since we reuse theGenTree::gtPrev
andGenTree::gtNext
fields.GTF_VAR_DEATH
and the promoted struct death vars map) is kept valid (sound) starting from early liveness and ending with morph.There are asserts that the tree list is up to date when it is accessed. This is done through a new member
fgNodeThreading
that replaces the preexistingfgStmtListThreaded
and keeps information about what the current kind of node threading is.The benefits are large, -2 MB on win-x64 collections (-0.85% on libraries.pmi that only has optimized contexts). Of course, there are also significant regressions, I have analyzed some of them below.
The improvements primarily come from the omission of copies for implicit byrefs, so the benefits on platforms with fewer implicit byrefs is smaller, but the forward sub change alone is still very impactful (e.g. -300K on linux-x64).
The throughput impact is around 1% in optimized contexts and below 0.1% in unoptimized contexts, the latter due to local morph needing to check if it should be threading nodes (I experimented with templating local morph to avoid any impact, but I didn't think the code changes and final code duplication was worth it). I think the impact is acceptable, especially considering that future work on generalized promotion will also use liveness information. I did a lot of work to get TP impact down to this level, in the beginning the throughput impact was around 5%. For posterity, some of the changes made were (copied from an internal status email):
To fix this I reused the linked local list in forward sub. It gives us a much quicker way to scan the next statement for whether it has a last use local that we would be able to forward sub, before we proceed to do the 2 expensive tree walks. This reduced total TP impact to around 1.3%.
TODO:
gtExtractSideEffList
about qmarks