-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Perf -30%] System.Memory.Span<Byte>.BinarySearch #39721
Comments
I pulled up the latest graph, and comparing it with the graph in this issue, it looks like we see a regression once in a while, although in the latest graph below the regression persists. On my local machine, I came up with a standalone test to verify it. Stopwatch st = new Stopwatch();
st.Start();
BinarySearchTest<char> test = new BinarySearchTest<char>();
for (int i = 0; i < 6 * 1000 * 1000 * 200; i++)
{
test.BinarySearch();
}
st.Stop();
Console.WriteLine("Time: " + st.ElapsedMilliseconds + " msec."); Base: 16725 msec.
Test: 19845 msec. Base = 4f703e6 I can clearly see the regression on my local machine as well, but I couldn't see why it regressed. The JIT code produced during the entire run is identical. The JIT throughput itself was better in test compared to base, as seen below (left is base and right is test). Drilling down further doesn't tell much. The time is just spent executing the assembly code of BinarySearch. Perhaps one of @GrabYourPitchforks or @adamsitnik should investigate further. |
Have you used the latest 5.0 bits? I've seen the same assembly code and no regression when using https://www.diffchecker.com/OfXOVRte git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f netcoreapp3.1 netcoreapp5.0 --filter 'System.Memory.Span<Byte>.BinarySearch' --bdn-arguments "--disasm true" 3.1BenchmarkDotNet=v0.12.1.1405-nightly, OS=Windows 10.0.18363.959 (1909/November2019Update/19H2) PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 3.1 Arguments=/p:DebugType=portable
.NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT; System.Memory.Span`1[[System.Byte, System.Private.CoreLib]].BinarySearch()
sub rsp,28
mov rdx,[rcx+18]
test rdx,rdx
jne short M00_L00
xor r8d,r8d
xor edx,edx
jmp short M00_L01
M00_L00:
lea r8,[rdx+10]
mov edx,[rdx+8]
M00_L01:
movzx eax,byte ptr [rcx+2C]
mov rcx,r8
mov r8d,eax
call System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
nop
add rsp,28
ret
; Total bytes of code 48 ; System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
mov [rsp+18],r8d
xor eax,eax
dec edx
test edx,edx
jl short M01_L03
M01_L00:
lea r8d,[rdx+rax]
shr r8d,1
movsxd r9,r8d
movzx r9d,byte ptr [rcx+r9]
movzx r10d,byte ptr [rsp+18]
sub r10d,r9d
test r10d,r10d
je short M01_L04
test r10d,r10d
jle short M01_L01
lea eax,[r8+1]
jmp short M01_L02
M01_L01:
lea edx,[r8+0FFFF]
M01_L02:
cmp eax,edx
jle short M01_L00
M01_L03:
not eax
ret
M01_L04:
mov eax,r8d
ret
; Total bytes of code 68 5.0BenchmarkDotNet=v0.12.1.1405-nightly, OS=Windows 10.0.18363.959 (1909/November2019Update/19H2) PowerPlanMode=00000000-0000-0000-0000-000000000000 Runtime=.NET Core 5.0 Arguments=/p:DebugType=portable
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Byte, System.Private.CoreLib]].BinarySearch()
sub rsp,28
mov rdx,[rcx+18]
test rdx,rdx
jne short M00_L00
xor r8d,r8d
xor edx,edx
jmp short M00_L01
M00_L00:
lea r8,[rdx+10]
mov edx,[rdx+8]
M00_L01:
movzx eax,byte ptr [rcx+2C]
mov rcx,r8
mov r8d,eax
call System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
nop
add rsp,28
ret
; Total bytes of code 48 ; System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
mov [rsp+18],r8d
xor eax,eax
dec edx
test edx,edx
jl short M01_L03
M01_L00:
lea r8d,[rdx+rax]
shr r8d,1
movsxd r9,r8d
movzx r9d,byte ptr [rcx+r9]
movzx r10d,byte ptr [rsp+18]
sub r10d,r9d
je short M01_L04
test r10d,r10d
jle short M01_L01
lea eax,[r8+1]
jmp short M01_L02
M01_L01:
lea edx,[r8+0FFFF]
M01_L02:
cmp eax,edx
jle short M01_L00
M01_L03:
not eax
ret
M01_L04:
mov eax,r8d
ret
; Total bytes of code 65 |
The diff is related to #38586 and shouldn't introduce a regression. |
How important is this scenario? I think we can get substantial perf wins for primitive types like |
@kunalspathak you are right, it should not. I've dug a little bit more into that and confirmed that removal of extra .AddDiagnoser(new DisassemblyDiagnoser(new DisassemblyDiagnoserConfig(printInstructionAddresses: true))) py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Memory.Span<Byte>.BinarySearch' --bdn-arguments "--envVars COMPlus_JitAlignLoops:0"
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Byte, System.Private.CoreLib]].BinarySearch()
7FFB797C58B0 sub rsp,28
7FFB797C58B4 mov rdx,[rcx+18]
7FFB797C58B8 test rdx,rdx
7FFB797C58BB jne short M00_L00
7FFB797C58BD xor r8d,r8d
7FFB797C58C0 xor edx,edx
7FFB797C58C2 jmp short M00_L01
M00_L00:
7FFB797C58C4 lea r8,[rdx+10]
7FFB797C58C8 mov edx,[rdx+8]
M00_L01:
7FFB797C58CB movzx eax,byte ptr [rcx+2C]
7FFB797C58CF mov rcx,r8
7FFB797C58D2 mov r8d,eax
7FFB797C58D5 call System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
7FFB797C58DA nop
7FFB797C58DB add rsp,28
7FFB797C58DF ret
; Total bytes of code 48 ; System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
7FFB797C5980 mov [rsp+18],r8d
7FFB797C5985 xor eax,eax
7FFB797C5987 dec edx
7FFB797C5989 test edx,edx
7FFB797C598B jl short M01_L03
M01_L00:
7FFB797C598D lea r8d,[rdx+rax]
7FFB797C5991 shr r8d,1
7FFB797C5994 movsxd r9,r8d
7FFB797C5997 movzx r9d,byte ptr [rcx+r9]
7FFB797C599C movzx r10d,byte ptr [rsp+18]
7FFB797C59A2 sub r10d,r9d
7FFB797C59A5 je short M01_L04
7FFB797C59A7 test r10d,r10d
7FFB797C59AA jle short M01_L01
7FFB797C59AC lea eax,[r8+1]
7FFB797C59B0 jmp short M01_L02
M01_L01:
7FFB797C59B2 lea edx,[r8+0FFFF]
M01_L02:
7FFB797C59B6 cmp eax,edx
7FFB797C59B8 jle short M01_L00
M01_L03:
7FFB797C59BA not eax
7FFB797C59BC ret
M01_L04:
7FFB797C59BD mov eax,r8d
7FFB797C59C0 ret
; Total bytes of code 65 py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Memory.Span<Byte>.BinarySearch' --bdn-arguments "--envVars COMPlus_JitAlignLoops:1"
.NET Core 5.0.0 (CoreCLR 5.0.20.40416, CoreFX 5.0.20.40416), X64 RyuJIT; System.Memory.Span`1[[System.Byte, System.Private.CoreLib]].BinarySearch()
7FFB797D5930 sub rsp,28
7FFB797D5934 mov rdx,[rcx+18]
7FFB797D5938 test rdx,rdx
7FFB797D593B jne short M00_L00
7FFB797D593D xor r8d,r8d
7FFB797D5940 xor edx,edx
7FFB797D5942 jmp short M00_L01
M00_L00:
7FFB797D5944 lea r8,[rdx+10]
7FFB797D5948 mov edx,[rdx+8]
M00_L01:
7FFB797D594B movzx eax,byte ptr [rcx+2C]
7FFB797D594F mov rcx,r8
7FFB797D5952 mov r8d,eax
7FFB797D5955 call System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
7FFB797D595A nop
7FFB797D595B add rsp,28
7FFB797D595F ret
; Total bytes of code 48 ; System.SpanHelpers.BinarySearch[[System.Byte, System.Private.CoreLib],[System.Byte, System.Private.CoreLib]](Byte ByRef, Int32, Byte)
7FFB797D5A00 mov [rsp+18],r8d
7FFB797D5A05 xor eax,eax
7FFB797D5A07 dec edx
7FFB797D5A09 test edx,edx
7FFB797D5A0B jl short M01_L03
7FFB797D5A0D nop dword ptr [rax]
M01_L00:
7FFB797D5A10 lea r8d,[rdx+rax]
7FFB797D5A14 shr r8d,1
7FFB797D5A17 movsxd r9,r8d
7FFB797D5A1A movzx r9d,byte ptr [rcx+r9]
7FFB797D5A1F movzx r10d,byte ptr [rsp+18]
7FFB797D5A25 sub r10d,r9d
7FFB797D5A28 je short M01_L04
7FFB797D5A2A test r10d,r10d
7FFB797D5A2D jle short M01_L01
7FFB797D5A2F lea eax,[r8+1]
7FFB797D5A33 jmp short M01_L02
M01_L01:
7FFB797D5A35 lea edx,[r8+0FFFF]
M01_L02:
7FFB797D5A39 cmp eax,edx
7FFB797D5A3B jle short M01_L00
M01_L03:
7FFB797D5A3D not eax
7FFB797D5A3F ret
M01_L04:
7FFB797D5A40 mov eax,r8d
7FFB797D5A43 ret
; Total bytes of code 68 Since #38586 is something that we definitely want and we have very little control over code alignment, I am closing this issue. |
Linking to #8108. |
Run Information
Regressions in System.Memory.Span
Related Issue on x86 Windows
[Perf -25%] System.Memory.Span.IndexOfAnyThreeValues
Historical Data in Reporting System
Repro
Histogram
System.Memory.Span.BinarySearch(Size: 512)
Docs
Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository
The text was updated successfully, but these errors were encountered: