Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Perf] Windows/x64: 2 Regressions on 5/31/2024 9:53:21 PM #35582

Closed
performanceautofiler bot opened this issue Jun 4, 2024 · 2 comments
Closed

[Perf] Windows/x64: 2 Regressions on 5/31/2024 9:53:21 PM #35582

performanceautofiler bot opened this issue Jun 4, 2024 · 2 comments

Comments

@performanceautofiler
Copy link

Run Information

Name Value
Architecture x64
OS Windows 10.0.22621
Queue TigerWindows
Baseline 214cbbf3a7b60fd763a1401a853393f44c8cbc7d
Compare 9db548658b943f7b819428385cbd6304dadcb69a
Diff Diff
Configs CompilationMode:tiered, RunKind:micro

Regressions in System.Memory.Span<Char>

Benchmark Baseline Test Test/Base Test Quality Edge Detector Baseline IR Compare IR IR Ratio
10.62 ns 14.95 ns 1.41 0.05 True
6.57 ns 8.74 ns 1.33 0.05 False

graph
graph
Test Report

Repro

General Docs link: https://github.com/dotnet/performance/blob/main/docs/benchmarking-workflow-dotnet-runtime.md

git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f net8.0 --filter 'System.Memory.Span<Char>*'

System.Memory.Span<Char>.BinarySearch(Size: 512)

ETL Files

Histogram

JIT Disasms

System.Memory.Span<Char>.BinarySearch(Size: 33)

ETL Files

Histogram

JIT Disasms

Docs

Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository

@LoopedBard3
Copy link
Member

Diff: dotnet/runtime@2280972...d8c59a4

@jakobbotsch
Copy link
Member

jakobbotsch commented Jun 24, 2024

Seems alignment related, given that only one x64 config appears to have regressed.
The diff seen below looks good as well -- we managed to get rid of a stack load and hoisted a zero-extension out of the loop.

System.Memory.Span<Char>.BinarySearch(Size: 512)

Hot functions:

  • (86.46%) SpanHelpers.BinarySearch (Tier-1)
    • Has diffs
  • (6.13%) System.Memory.Span`1.BinarySearch (Tier-1)
    • No diffs
  • (2.96%) Runnable_0.WorkloadActionUnroll (FullOpt)
    • No diffs
Diffs

[System.Private.CoreLib]SpanHelpers.BinarySearch(!!0&,int32,!!1)

 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T04] (  3,  9.20)   byref  ->  rbx         single-def
-;  V01 arg1         [V01,T05] (  3,  3   )     int  ->  rdx         single-def
-;  V02 arg2         [V02    ] (  3,  9.20)  ushort  ->  [rsp+0x60]  do-not-enreg[X] addr-exposed ld-addr-op single-def
+;  V01 arg1         [V01,T06] (  3,  3   )     int  ->  rdx         single-def
+;  V02 arg2         [V02,T07] (  3,  2.90)  ushort  ->   r8         ld-addr-op single-def
 ;  V03 loc0         [V03,T03] (  5, 16.21)     int  ->  rsi        
 ;  V04 loc1         [V04,T02] (  5, 16.51)     int  ->  rdi        
-;  V05 loc2         [V05,T00] (  5, 19.14)     int  ->  rbp        
-;  V06 loc3         [V06,T01] (  3, 18.66)     int  ->  rcx        
+;  V05 loc2         [V05,T00] (  5, 19.14)     int  ->  r14        
+;  V06 loc3         [V06,T01] (  3, 18.66)     int  ->  rax        
 ;* V07 loc4         [V07    ] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op
 ;  V08 OutArgs      [V08    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V09 tmp1         [V09    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V10 tmp2         [V10    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;  V11 cse0         [V11,T05] (  2,  8.10)     int  ->  rbp         hoist "CSE #01: aggressive"
 ;
-; Lcl frame size = 40
+; Lcl frame size = 32
 
 G_M30959_IG01:
+       push     r14
        push     rdi
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 40
-       mov      dword ptr [rsp+0x60], r8d
+       sub      rsp, 32
        mov      rbx, rcx
-						;; size=16 bbWeight=1 PerfScore 5.50
+						;; size=13 bbWeight=1 PerfScore 5.50
 G_M30959_IG02:
        xor      esi, esi
        lea      edi, [rdx-0x01]
        test     edi, edi
-       jl       SHORT G_M30959_IG10
+       jl       SHORT G_M30959_IG11
 						;; size=9 bbWeight=1 PerfScore 2.00
 G_M30959_IG03:
-       lea      ebp, [rdi+rsi]
-       shr      ebp, 1
-       movzx    rcx, word  ptr [rsp+0x60]
-       movsxd   rax, ebp
-       movzx    rax, word  ptr [rbx+2*rax]
-       sub      ecx, eax
-       je       SHORT G_M30959_IG08
-						;; size=21 bbWeight=7.20 PerfScore 39.60
+       movzx    rbp, r8w
+						;; size=4 bbWeight=0.90 PerfScore 0.23
 G_M30959_IG04:
-       test     ecx, ecx
-       jle      SHORT G_M30959_IG06
-						;; size=4 bbWeight=4.26 PerfScore 5.33
+       lea      r14d, [rdi+rsi]
+       shr      r14d, 1
+       movsxd   rcx, r14d
+       movzx    rcx, word  ptr [rbx+2*rcx]
+       mov      eax, ebp
+       sub      eax, ecx
+       je       SHORT G_M30959_IG09
+						;; size=20 bbWeight=7.20 PerfScore 34.20
 G_M30959_IG05:
-       mov      rcx, 0xD1FFAB1E
-       call     CORINFO_HELP_COUNTPROFILE32
-       lea      esi, [rbp+0x01]
-       jmp      SHORT G_M30959_IG07
-						;; size=20 bbWeight=2.22 PerfScore 8.31
+       test     eax, eax
+       jle      SHORT G_M30959_IG07
+						;; size=4 bbWeight=4.26 PerfScore 5.33
 G_M30959_IG06:
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
-       lea      edi, [rbp-0x01]
-						;; size=18 bbWeight=2.05 PerfScore 3.58
+       lea      esi, [r14+0x01]
+       jmp      SHORT G_M30959_IG08
+						;; size=21 bbWeight=2.22 PerfScore 8.31
 G_M30959_IG07:
-       cmp      esi, edi
-       jle      SHORT G_M30959_IG03
-       jmp      SHORT G_M30959_IG10
-						;; size=6 bbWeight=5.26 PerfScore 17.11
-G_M30959_IG08:
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
-       mov      eax, ebp
-						;; size=17 bbWeight=0.47 PerfScore 0.71
+       lea      edi, [r14-0x01]
+						;; size=19 bbWeight=2.05 PerfScore 3.58
+G_M30959_IG08:
+       cmp      esi, edi
+       jle      SHORT G_M30959_IG04
+       jmp      SHORT G_M30959_IG11
+						;; size=6 bbWeight=5.26 PerfScore 17.11
 G_M30959_IG09:
-       add      rsp, 40
+       mov      rcx, 0xD1FFAB1E
+       call     CORINFO_HELP_COUNTPROFILE32
+       mov      eax, r14d
+						;; size=18 bbWeight=0.47 PerfScore 0.71
+G_M30959_IG10:
+       add      rsp, 32
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r14
        ret      
-						;; size=9 bbWeight=0.47 PerfScore 1.54
-G_M30959_IG10:
+						;; size=11 bbWeight=0.47 PerfScore 1.78
+G_M30959_IG11:
        mov      rcx, 0xD1FFAB1E
        call     CORINFO_HELP_COUNTPROFILE32
        mov      eax, esi
        not      eax
 						;; size=19 bbWeight=0.53 PerfScore 0.92
-G_M30959_IG11:
-       add      rsp, 40
+G_M30959_IG12:
+       add      rsp, 32
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r14
        ret      
-						;; size=9 bbWeight=0.53 PerfScore 1.71
+						;; size=11 bbWeight=0.53 PerfScore 1.97
 
-; Total bytes of code 148, prolog size 16, PerfScore 86.31, instruction count 49, allocated bytes for code 148 (MethodHash=971b8710) for method System.SpanHelpers:BinarySearch[ushort,ushort](byref,int,ushort):int (Instrumented Tier1)
+; Total bytes of code 155, prolog size 13, PerfScore 81.64, instruction count 52, allocated bytes for code 155 (MethodHash=971b8710) for method System.SpanHelpers:BinarySearch[ushort,ushort](byref,int,ushort):int (Instrumented Tier1)
 ; ============================================================
 
 ; Assembly listing for method System.SpanHelpers:BinarySearch[ushort,ushort](System.ReadOnlySpan`1[ushort],ushort):int (Instrumented Tier0)
@@ -214,52 +220,53 @@ G_M27467_IG05:
 ; optimized using Dynamic PGO
 ; rsp based frame
 ; fully interruptible
-; with Dynamic PGO: fgCalledCount is 31548
+; with Dynamic PGO: fgCalledCount is 30036
 ; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T04] (  3, 12.17)   byref  ->  rcx         single-def
-;  V01 arg1         [V01,T05] (  3,  3   )     int  ->  rdx         single-def
-;  V02 arg2         [V02    ] (  3, 12.17)  ushort  ->  [rsp+0x18]  do-not-enreg[X] addr-exposed ld-addr-op single-def
-;  V03 loc0         [V03,T00] (  5, 30.51)     int  ->   r8        
-;  V04 loc1         [V04,T03] (  5, 22.34)     int  ->  rdx        
-;  V05 loc2         [V05,T01] (  5, 30.51)     int  ->  rax        
-;  V06 loc3         [V06,T02] (  3, 29.51)     int  ->  r10        
+;  V00 arg0         [V00,T04] (  3, 12.02)   byref  ->  rcx         single-def
+;  V01 arg1         [V01,T06] (  3,  3   )     int  ->  rdx         single-def
+;  V02 arg2         [V02,T07] (  3,  3   )  ushort  ->   r8         ld-addr-op single-def
+;  V03 loc0         [V03,T00] (  5, 30.06)     int  ->  r10        
+;  V04 loc1         [V04,T03] (  5, 22.04)     int  ->  rdx        
+;  V05 loc2         [V05,T01] (  5, 30.06)     int  ->  rax        
+;  V06 loc3         [V06,T02] (  3, 29.06)     int  ->  r11        
 ;* V07 loc4         [V07    ] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op
 ;# V08 OutArgs      [V08    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V09 tmp1         [V09    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V10 tmp2         [V10    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;  V11 cse0         [V11,T05] (  2, 11.02)     int  ->   r8         hoist "CSE #01: aggressive"
 ;
 ; Lcl frame size = 0
 
 G_M30959_IG01:
-       mov      dword ptr [rsp+0x18], r8d
-						;; size=5 bbWeight=1 PerfScore 1.00
+						;; size=0 bbWeight=1 PerfScore 0.00
 G_M30959_IG02:
-       xor      r8d, r8d
+       xor      r10d, r10d
        dec      edx
        js       SHORT G_M30959_IG08
+       movzx    r8, r8w
        align    [0 bytes for IG03]
-						;; size=7 bbWeight=1 PerfScore 1.50
+						;; size=11 bbWeight=1 PerfScore 1.75
 G_M30959_IG03:
-       lea      eax, [rdx+r8]
+       lea      eax, [rdx+r10]
        shr      eax, 1
-       movzx    r10, word  ptr [rsp+0x18]
        movsxd   r9, eax
        movzx    r9, word  ptr [rcx+2*r9]
-       sub      r10d, r9d
+       mov      r11d, r8d
+       sub      r11d, r9d
        je       SHORT G_M30959_IG06
-						;; size=25 bbWeight=10.17 PerfScore 55.93
+						;; size=22 bbWeight=10.02 PerfScore 47.59
 G_M30959_IG04:
-       test     r10d, r10d
+       test     r11d, r11d
        jle      SHORT G_M30959_IG07
-       lea      r8d, [rax+0x01]
-						;; size=9 bbWeight=9.17 PerfScore 16.05
+       lea      r10d, [rax+0x01]
+						;; size=9 bbWeight=9.02 PerfScore 15.78
 G_M30959_IG05:
-       cmp      r8d, edx
+       cmp      r10d, edx
        jle      SHORT G_M30959_IG03
        jmp      SHORT G_M30959_IG08
-						;; size=7 bbWeight=10.17 PerfScore 33.05
+						;; size=7 bbWeight=10.02 PerfScore 32.56
 G_M30959_IG06:
        ret      
 						;; size=1 bbWeight=1.00 PerfScore 1.00
@@ -268,14 +275,14 @@ G_M30959_IG07:
        jmp      SHORT G_M30959_IG05
 						;; size=5 bbWeight=0 PerfScore 0.00
 G_M30959_IG08:
-       mov      eax, r8d
+       mov      eax, r10d
        not      eax
 						;; size=5 bbWeight=0 PerfScore 0.00
 G_M30959_IG09:
        ret      
 						;; size=1 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 65, prolog size 5, PerfScore 108.53, instruction count 24, allocated bytes for code 65 (MethodHash=971b8710) for method System.SpanHelpers:BinarySearch[ushort,ushort](byref,int,ushort):int (Tier1)
+; Total bytes of code 61, prolog size 0, PerfScore 98.69, instruction count 24, allocated bytes for code 61 (MethodHash=971b8710) for method System.SpanHelpers:BinarySearch[ushort,ushort](byref,int,ushort):int (Tier1)
 ; ============================================================
 
 ; Assembly listing for method System.SpanHelpers:BinarySearch[ushort,ushort](System.ReadOnlySpan`1[ushort],ushort):int (Tier1)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants