-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Perf -34%] SciMark2.kernel.benchSparseMult #2271
Comments
I see similar results with and without 32B method alignment.
After 32B method alignment:
Looking at the disassembly, the innermost loop of the benchmark is cloned, and each clone is long enough that it spans multiple 32B chunks (one clone takes 2 chunks and the other takes 3 chunks). Loop alignment won't help in this case because the existing placement of the loop is already appropriate. That makes me wonder if this is another memory-alignment-related issue that depends on the alignment of the arrays created for the benchmark. @adamsitnik, any thoughts?
With loop alignment:
; Assembly listing for method SciMark2.SparseCompRow:matmult(System.Double[],System.Double[],System.Int32[],System.Int32[],System.Double[],int)
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T12] ( 7, 58 ) ref -> rcx class-hnd
; V01 arg1 [V01,T03] ( 9, 354 ) ref -> [rsp+0x78] class-hnd
; V02 arg2 [V02,T11] ( 7, 67 ) ref -> r8 class-hnd
; V03 arg3 [V03,T04] ( 9, 354 ) ref -> r9 class-hnd
; V04 arg4 [V04,T02] ( 7, 400 ) ref -> rax class-hnd
; V05 arg5 [V05,T20] ( 2, 5 ) int -> r10
; V06 loc0 [V06,T13] ( 7, 49 ) int -> rsi
; V07 loc1 [V07,T18] ( 4, 13 ) int -> rdi
; V08 loc2 [V08,T09] ( 10, 148 ) int -> rbx
; V09 loc3 [V09,T21] ( 10, 448 ) double -> mm0
; V10 loc4 [V10,T10] ( 5, 80 ) int -> r14
; V11 loc5 [V11,T05] ( 10, 304 ) int -> r12
; V12 loc6 [V12,T01] ( 20,1088 ) int -> registers
; V13 OutArgs [V13 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V14 tmp1 [V14,T00] ( 9,1152 ) int -> registers "index expr"
; V15 cse0 [V15,T19] ( 3, 6 ) int -> r11 "CSE - aggressive"
; V16 cse1 [V16,T14] ( 3, 48 ) int -> rbx "CSE - aggressive"
; V17 cse2 [V17,T15] ( 3, 48 ) int -> r15 "CSE - aggressive"
; V18 cse3 [V18,T06] ( 3, 192 ) long -> r14 "CSE - aggressive"
; V19 cse4 [V19,T07] ( 3, 192 ) long -> r14 "CSE - aggressive"
; V20 cse5 [V20,T08] ( 3, 192 ) long -> r13 "CSE - aggressive"
; V21 cse6 [V21,T16] ( 3, 48 ) long -> rbp "CSE - aggressive"
; V22 cse7 [V22,T17] ( 3, 48 ) long -> rbp "CSE - aggressive"
;
; Lcl frame size = 40
G_M25427_IG01: ;; offset=0000H
00007ffb`9af076c0 4157 push r15
00007ffb`9af076c2 4156 push r14
00007ffb`9af076c4 4155 push r13
00007ffb`9af076c6 4154 push r12
00007ffb`9af076c8 57 push rdi
00007ffb`9af076c9 56 push rsi
00007ffb`9af076ca 55 push rbp
00007ffb`9af076cb 53 push rbx
00007ffb`9af076cc 4883EC28 sub rsp, 40
00007ffb`9af076d0 C5F877 vzeroupper
00007ffb`9af076d3 4889542478 mov gword ptr [rsp+78H], rdx
00007ffb`9af076d8 488B842490000000 mov rax, gword ptr [rsp+90H]
; =========================== 32B boundary ===========================
00007ffb`9af076e0 448B942498000000 mov r10d, dword ptr [rsp+98H]
;; bbWeight=1 PerfScore 12.25
G_M25427_IG02: ;; offset=0028H
00007ffb`9af076e8 458B5808 mov r11d, dword ptr [r8+8]
00007ffb`9af076ec 418D73FF lea esi, [r11-1]
00007ffb`9af076f0 33FF xor edi, edi
00007ffb`9af076f2 4585D2 test r10d, r10d
00007ffb`9af076f5 0F8E58020000 jle G_M25427_IG18
;; bbWeight=1 PerfScore 4.00
G_M25427_IG03: ;; offset=003BH
00007ffb`9af076fb 33DB xor ebx, ebx
00007ffb`9af076fd 85F6 test esi, esi
00007ffb`9af076ff 0F8E1F010000 jle G_M25427_IG10
; =========================== 32B boundary ===========================
00007ffb`9af07705 4885C9 test rcx, rcx
00007ffb`9af07708 400F95C5 setne bpl
00007ffb`9af0770c 400FB6ED movzx rbp, bpl
00007ffb`9af07710 40F6C501 test bpl, 1
00007ffb`9af07714 0F8495010000 je G_M25427_IG14
;; bbWeight=4 PerfScore 17.00
G_M25427_IG04: ;; offset=005AH
00007ffb`9af0771a 443BDE cmp r11d, esi
00007ffb`9af0771d 400F9DC5 setge bpl
; =========================== 32B boundary ===========================
00007ffb`9af07721 400FB6ED movzx rbp, bpl
00007ffb`9af07725 448BF6 mov r14d, esi
00007ffb`9af07728 41F7D6 not r14d
00007ffb`9af0772b 41C1EE1F shr r14d, 31
00007ffb`9af0772f 4123EE and ebp, r14d
00007ffb`9af07732 397108 cmp dword ptr [rcx+8], esi
00007ffb`9af07735 410F9DC6 setge r14b
00007ffb`9af07739 450FB6F6 movzx r14, r14b
00007ffb`9af0773d 4185EE test ebp, r14d
; =========================== 32B boundary ===========================
00007ffb`9af07740 0F8469010000 je G_M25427_IG14
;; bbWeight=4 PerfScore 35.00
G_M25427_IG05: ;; offset=0086H
00007ffb`9af07746 C5F857C0 vxorps xmm0, xmm0
00007ffb`9af0774a 4863EB movsxd rbp, ebx
00007ffb`9af0774d 458B74A810 mov r14d, dword ptr [r8+4*rbp+16]
00007ffb`9af07752 FFC3 inc ebx
00007ffb`9af07754 4C63E3 movsxd r12, ebx
00007ffb`9af07757 478B64A010 mov r12d, dword ptr [r8+4*r12+16]
00007ffb`9af0775c 458BFE mov r15d, r14d
00007ffb`9af0775f 453BF4 cmp r14d, r12d
; =========================== 32B boundary ===========================
00007ffb`9af07762 0F8DB5000000 jge G_M25427_IG09
00007ffb`9af07768 4D85C9 test r9, r9
00007ffb`9af0776b 410F95C6 setne r14b
00007ffb`9af0776f 450FB6F6 movzx r14, r14b
00007ffb`9af07773 488B542478 mov rdx, gword ptr [rsp+78H]
00007ffb`9af07778 4885D2 test rdx, rdx
00007ffb`9af0777b 410F95C5 setne r13b
00007ffb`9af0777f 450FB6ED movzx r13, r13b
; =========================== 32B boundary ===========================
00007ffb`9af07783 4585F5 test r14d, r13d
00007ffb`9af07786 0F84C1000000 je G_M25427_IG11
;; bbWeight=16 PerfScore 189.33
G_M25427_IG06: ;; offset=00CCH
00007ffb`9af0778c 458BF7 mov r14d, r15d
00007ffb`9af0778f 41F7D6 not r14d
00007ffb`9af07792 41C1EE1F shr r14d, 31
00007ffb`9af07796 458BEC mov r13d, r12d
00007ffb`9af07799 41F7D5 not r13d
00007ffb`9af0779c 41C1ED1F shr r13d, 31
; =========================== 32B boundary ===========================
00007ffb`9af077a0 4523F5 and r14d, r13d
00007ffb`9af077a3 45396108 cmp dword ptr [r9+8], r12d
00007ffb`9af077a7 410F9DC5 setge r13b
00007ffb`9af077ab 450FB6ED movzx r13, r13b
00007ffb`9af077af 4523F5 and r14d, r13d
00007ffb`9af077b2 44396208 cmp dword ptr [rdx+8], r12d
00007ffb`9af077b6 410F9DC5 setge r13b
00007ffb`9af077ba 450FB6ED movzx r13, r13b
00007ffb`9af077be 4585F5 test r14d, r13d
; =========================== 32B boundary ===========================
00007ffb`9af077c1 0F8485000000 je G_M25427_IG11
00007ffb`9af077c7 448B7008 mov r14d, dword ptr [rax+8]
; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) ~~~~~~~~~~~~~~~~~~~~~~
00007ffb`9af077cb align
00007ffb`9af077cb align
00007ffb`9af077cb align
;; bbWeight=16 PerfScore 256.00
G_M25427_IG07: ;; offset=010BH
00007ffb`9af077cb 4D63F7 movsxd r14, r15d
00007ffb`9af077ce 478B6CB110 mov r13d, dword ptr [r9+4*r14+16]
00007ffb`9af077d3 443B6808 cmp r13d, dword ptr [rax+8]
00007ffb`9af077d7 0F8382010000 jae G_M25427_IG22
00007ffb`9af077dd 4D63ED movsxd r13, r13d
; =========================== 32B boundary ===========================
00007ffb`9af077e0 C4A17B104CE810 vmovsd xmm1, qword ptr [rax+8*r13+16]
00007ffb`9af077e7 C4A173594CF210 vmulsd xmm1, xmm1, qword ptr [rdx+8*r14+16]
00007ffb`9af077ee C5F358C0 vaddsd xmm0, xmm1, xmm0
00007ffb`9af077f2 41FFC7 inc r15d
00007ffb`9af077f5 453BFC cmp r15d, r12d
00007ffb`9af077f8 7CD1 jl SHORT G_M25427_IG07
;; bbWeight=64 PerfScore 1152.00
G_M25427_IG08: ;; offset=013AH
00007ffb`9af077fa EB74 jmp SHORT G_M25427_IG12
;; bbWeight=16 PerfScore 32.00
G_M25427_IG09: ;; offset=013CH
00007ffb`9af077fc 488B542478 mov rdx, gword ptr [rsp+78H]
; =========================== 32B boundary ===========================
00007ffb`9af07801 EB6D jmp SHORT G_M25427_IG12
;; bbWeight=8 PerfScore 24.00
G_M25427_IG10: ;; offset=0143H
00007ffb`9af07803 488B542478 mov rdx, gword ptr [rsp+78H]
00007ffb`9af07808 E915010000 jmp G_M25427_IG17
; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) ~~~~~~~~~~~~~~~~~~~~~~
00007ffb`9af0780d align
00007ffb`9af0780d align
00007ffb`9af0780d align
;; bbWeight=2 PerfScore 7.50
G_M25427_IG11: ;; offset=014DH
00007ffb`9af0780d 453B7908 cmp r15d, dword ptr [r9+8]
00007ffb`9af07811 0F8328010000 jae G_M25427_IG22
00007ffb`9af07817 4D63F7 movsxd r14, r15d
00007ffb`9af0781a 478B6CB110 mov r13d, dword ptr [r9+4*r14+16]
00007ffb`9af0781f 443B6808 cmp r13d, dword ptr [rax+8]
; =========================== 32B boundary ===========================
00007ffb`9af07823 0F8316010000 jae G_M25427_IG22
00007ffb`9af07829 4D63ED movsxd r13, r13d
00007ffb`9af0782c C4A17B104CE810 vmovsd xmm1, qword ptr [rax+8*r13+16]
00007ffb`9af07833 443B7A08 cmp r15d, dword ptr [rdx+8]
00007ffb`9af07837 0F8302010000 jae G_M25427_IG22
00007ffb`9af0783d C4A173594CF210 vmulsd xmm1, xmm1, qword ptr [rdx+8*r14+16]
; =========================== 32B boundary ===========================
00007ffb`9af07844 C5F358C0 vaddsd xmm0, xmm1, xmm0
00007ffb`9af07848 41FFC7 inc r15d
00007ffb`9af0784b 453BFC cmp r15d, r12d
00007ffb`9af0784e 7CBD jl SHORT G_M25427_IG11
;; bbWeight=64 PerfScore 1536.00
G_M25427_IG12: ;; offset=0190H
00007ffb`9af07850 C5FB1144E910 vmovsd qword ptr [rcx+8*rbp+16], xmm0
00007ffb`9af07856 3BDE cmp ebx, esi
00007ffb`9af07858 4889542478 mov gword ptr [rsp+78H], rdx
00007ffb`9af0785d 0F8CE3FEFFFF jl G_M25427_IG05
; =========================== 32B boundary ===========================
;; bbWeight=16 PerfScore 44.00
G_M25427_IG13: ;; offset=01A3H
00007ffb`9af07863 488B542478 mov rdx, gword ptr [rsp+78H]
00007ffb`9af07868 E994000000 jmp G_M25427_IG17
;; bbWeight=4 PerfScore 12.00
G_M25427_IG14: ;; offset=01ADH
00007ffb`9af0786d C5F857C0 vxorps xmm0, xmm0
00007ffb`9af07871 4863EB movsxd rbp, ebx
00007ffb`9af07874 458B74A810 mov r14d, dword ptr [r8+4*rbp+16]
00007ffb`9af07879 448D7B01 lea r15d, [rbx+1]
00007ffb`9af0787d 4D63E7 movsxd r12, r15d
; =========================== 32B boundary ===========================
00007ffb`9af07880 478B64A010 mov r12d, dword ptr [r8+4*r12+16]
00007ffb`9af07885 453BF4 cmp r14d, r12d
00007ffb`9af07888 0F8DA8000000 jge G_M25427_IG21
; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize) ~~~~~~~~~~~~~~~~~~~~~~
00007ffb`9af0788e align
00007ffb`9af0788e align
00007ffb`9af0788e align
;; bbWeight=16 PerfScore 117.33
G_M25427_IG15: ;; offset=01CEH
00007ffb`9af0788e 453B7108 cmp r14d, dword ptr [r9+8]
00007ffb`9af07892 0F8385000000 jae G_M25427_IG22
00007ffb`9af07898 4D63EE movsxd r13, r14d
00007ffb`9af0789b 438B54A910 mov edx, dword ptr [r9+4*r13+16]
; =========================== 32B boundary ===========================
00007ffb`9af078a0 3B5008 cmp edx, dword ptr [rax+8]
00007ffb`9af078a3 7378 jae SHORT G_M25427_IG22
00007ffb`9af078a5 4863D2 movsxd rdx, edx
00007ffb`9af078a8 C5FB104CD010 vmovsd xmm1, qword ptr [rax+8*rdx+16]
00007ffb`9af078ae 488B542478 mov rdx, gword ptr [rsp+78H]
00007ffb`9af078b3 443B7208 cmp r14d, dword ptr [rdx+8]
00007ffb`9af078b7 7364 jae SHORT G_M25427_IG22
00007ffb`9af078b9 C4A173594CEA10 vmulsd xmm1, xmm1, qword ptr [rdx+8*r13+16]
; =========================== 32B boundary ===========================
00007ffb`9af078c0 C5F358C0 vaddsd xmm0, xmm1, xmm0
00007ffb`9af078c4 41FFC6 inc r14d
00007ffb`9af078c7 453BF4 cmp r14d, r12d
00007ffb`9af078ca 7C40 jl SHORT G_M25427_IG20
;; bbWeight=64 PerfScore 1600.00
G_M25427_IG16: ;; offset=020CH
00007ffb`9af078cc 3B5908 cmp ebx, dword ptr [rcx+8]
00007ffb`9af078cf 734A jae SHORT G_M25427_IG22
00007ffb`9af078d1 C5FB1144E910 vmovsd qword ptr [rcx+8*rbp+16], xmm0
00007ffb`9af078d7 418BDF mov ebx, r15d
00007ffb`9af078da 3BDE cmp ebx, esi
00007ffb`9af078dc 7C22 jl SHORT G_M25427_IG19
;; bbWeight=16 PerfScore 80.00
G_M25427_IG17: ;; offset=021EH
00007ffb`9af078de FFC7 inc edi
; =========================== 32B boundary ===========================
00007ffb`9af078e0 413BFA cmp edi, r10d
00007ffb`9af078e3 4889542478 mov gword ptr [rsp+78H], rdx
00007ffb`9af078e8 0F8C0DFEFFFF jl G_M25427_IG03
;; bbWeight=4 PerfScore 10.00
G_M25427_IG18: ;; offset=022EH
00007ffb`9af078ee 4883C428 add rsp, 40
00007ffb`9af078f2 5B pop rbx
00007ffb`9af078f3 5D pop rbp
00007ffb`9af078f4 5E pop rsi
00007ffb`9af078f5 5F pop rdi
00007ffb`9af078f6 415C pop r12
00007ffb`9af078f8 415D pop r13
00007ffb`9af078fa 415E pop r14
00007ffb`9af078fc 415F pop r15
00007ffb`9af078fe C3 ret
;; bbWeight=1 PerfScore 5.25
G_M25427_IG19: ;; offset=023FH
00007ffb`9af078ff 4889542478 mov gword ptr [rsp+78H], rdx
; =========================== 32B boundary ===========================
00007ffb`9af07904 E964FFFFFF jmp G_M25427_IG14
;; bbWeight=8 PerfScore 24.00
G_M25427_IG20: ;; offset=0249H
00007ffb`9af07909 4889542478 mov gword ptr [rsp+78H], rdx
00007ffb`9af0790e E97BFFFFFF jmp G_M25427_IG15
;; bbWeight=32 PerfScore 96.00
G_M25427_IG21: ;; offset=0253H
00007ffb`9af07913 488B542478 mov rdx, gword ptr [rsp+78H]
00007ffb`9af07918 EBB2 jmp SHORT G_M25427_IG16
;; bbWeight=8 PerfScore 24.00
G_M25427_IG22: ;; offset=025AH
00007ffb`9af0791a E801CD065F call CORINFO_HELP_RNGCHKFAIL
00007ffb`9af0791f CC int3
; =========================== 32B boundary ===========================
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 608, prolog size 40, PerfScore 5348.57, instruction count 171 (MethodHash=61c29cac) for method SciMark2.SparseCompRow:matmult(System.Double[],System.Double[],System.Int32[],System.Int32[],System.Double[],int)
; ============================================================
* Added --disable-animations option to device startup testing. * Added exits after logger exceptions, and added actual check to ensure the animation values were properly set.
Run Information
Regressions in SciMark2.kernel
Historical Data in Reporting System
Repro
Histogram
SciMark2.kernel.benchSparseMult
Docs
Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository
The text was updated successfully, but these errors were encountered: