Suboptimal code generated for ValueTuple.GetHashCode() #66197

hez2010 · 2022-03-04T16:23:15Z

Description

Given this code:

class Program
{
    int M1(int x, int y)
    {
        return HashCode.Combine(x, y);
    }

    int M2(int x, int y)
    {
        return (x, y).GetHashCode();
    }
}

where the implementation of ValueTuple<,>.GetHashCode() is:

public override int GetHashCode()
{
    return HashCode.Combine(Item1?.GetHashCode() ?? 0, Item2?.GetHashCode() ?? 0);
}

The codegen for M1 and M2:

; Method Program:M1(int,int):int:this
G_M53360_IG01:
						;; bbWeight=1    PerfScore 0.00

G_M53360_IG02:
       mov      ecx, edx
       mov      edx, r8d
						;; bbWeight=1    PerfScore 0.50

G_M53360_IG03:
       jmp      System.HashCode:Combine(int,int):int
						;; bbWeight=1    PerfScore 2.00
; Total bytes of code: 10

; Method Program:M2(int,int):int:this
G_M19763_IG01:
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
						;; bbWeight=1    PerfScore 1.50

G_M19763_IG02:
       mov      dword ptr [rsp+20H], edx
       mov      dword ptr [rsp+24H], r8d
       lea      rcx, bword ptr [rsp+20H]
       call     System.ValueTuple`2[Int32,Int32][System.Int32,System.Int32]:GetHashCode():int:this
       nop      
						;; bbWeight=1    PerfScore 3.75

G_M19763_IG03:
       add      rsp, 40
       ret      
						;; bbWeight=1    PerfScore 1.25
; Total bytes of code: 36

I think in this case ValueTuple.GetHashCode should at least be inlined (with T1=T2=int, both Item1 and Item2 can not be null).

But even with DOTNET_JitAggressiveInlining=1 (which forces inlining ValueTuple.GetHashCode), the codegen for M2 is still suboptimal:

; Method Program:M1(int,int):int:this
G_M53360_IG01:
       push     rdi
       push     rsi
       sub      rsp, 40
       mov      esi, edx
       mov      edi, r8d
						;; bbWeight=1    PerfScore 2.75

G_M53360_IG02:
       mov      rcx, 0xD1FFAB1E
       mov      edx, 315
       call     CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
       mov      eax, dword ptr [reloc classVar[0xd1ffab1e]]
       add      eax, 0xD1FFAB1E
       add      eax, 8
       imul     edx, esi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       imul     edx, edi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 15
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 13
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 16
       xor      eax, edx
						;; bbWeight=1    PerfScore 19.50

G_M53360_IG03:
       add      rsp, 40
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=1    PerfScore 2.25
; Total bytes of code: 119

; Method Program:M2(int,int):int:this
G_M19763_IG01:
       push     rdi
       push     rsi
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
						;; bbWeight=1    PerfScore 3.50

G_M19763_IG02:
       mov      dword ptr [rsp+20H], edx
       mov      dword ptr [rsp+24H], r8d
       mov      esi, dword ptr [rsp+20H]
       lea      rcx, bword ptr [rsp+24H]
       mov      edi, dword ptr [rcx]
       mov      rcx, 0xD1FFAB1E
       mov      edx, 315
       call     CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
       mov      eax, dword ptr [reloc classVar[0xd1ffab1e]]
       add      eax, 0xD1FFAB1E
       add      eax, 8
       imul     edx, esi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       imul     edx, edi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 15
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 13
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 16
       xor      eax, edx
						;; bbWeight=1    PerfScore 25.00

G_M19763_IG03:
       add      rsp, 40
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=1    PerfScore 2.25
; Total bytes of code: 141

Configuration

.NET build from main branch.

category:cq
theme:codegen
skill-level:expert
cost:large
impact:medium

The text was updated successfully, but these errors were encountered:

ghost · 2022-03-04T16:23:21Z

Tagging subscribers to this area: @JulieLeeMSFT
See info in area-owners.md if you want to be subscribed.

Issue Details

Description

Given this code:

class Program
{
    int M1(int x, int y)
    {
        return HashCode.Combine(x, y);
    }

    int M2(int x, int y)
    {
        return (x, y).GetHashCode();
    }
}

The codegen for M1 and M2:

; Method Program:M1(int,int):int:this
G_M53360_IG01:
						;; bbWeight=1    PerfScore 0.00

G_M53360_IG02:
       mov      ecx, edx
       mov      edx, r8d
						;; bbWeight=1    PerfScore 0.50

G_M53360_IG03:
       jmp      System.HashCode:Combine(int,int):int
						;; bbWeight=1    PerfScore 2.00
; Total bytes of code: 10

; Method Program:M2(int,int):int:this
G_M19763_IG01:
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
						;; bbWeight=1    PerfScore 1.50

G_M19763_IG02:
       mov      dword ptr [rsp+20H], edx
       mov      dword ptr [rsp+24H], r8d
       lea      rcx, bword ptr [rsp+20H]
       call     System.ValueTuple`2[Int32,Int32][System.Int32,System.Int32]:GetHashCode():int:this
       nop      
						;; bbWeight=1    PerfScore 3.75

G_M19763_IG03:
       add      rsp, 40
       ret      
						;; bbWeight=1    PerfScore 1.25
; Total bytes of code: 36

I think ValueTuple.GetHashCode should at least be inlined.

But even with DOTNET_JitAggressiveInlining=1 (which forces inlining ValueTuple.GetHashCode), the codegen for M2 is still suboptimal:

; Method Program:M1(int,int):int:this
G_M53360_IG01:
       push     rdi
       push     rsi
       sub      rsp, 40
       mov      esi, edx
       mov      edi, r8d
						;; bbWeight=1    PerfScore 2.75

G_M53360_IG02:
       mov      rcx, 0xD1FFAB1E
       mov      edx, 315
       call     CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
       mov      eax, dword ptr [reloc classVar[0xd1ffab1e]]
       add      eax, 0xD1FFAB1E
       add      eax, 8
       imul     edx, esi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       imul     edx, edi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 15
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 13
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 16
       xor      eax, edx
						;; bbWeight=1    PerfScore 19.50

G_M53360_IG03:
       add      rsp, 40
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=1    PerfScore 2.25
; Total bytes of code: 119

; Method Program:M2(int,int):int:this
G_M19763_IG01:
       push     rdi
       push     rsi
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+20H], rax
						;; bbWeight=1    PerfScore 3.50

G_M19763_IG02:
       mov      dword ptr [rsp+20H], edx
       mov      dword ptr [rsp+24H], r8d
       mov      esi, dword ptr [rsp+20H]
       lea      rcx, bword ptr [rsp+24H]
       mov      edi, dword ptr [rcx]
       mov      rcx, 0xD1FFAB1E
       mov      edx, 315
       call     CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
       mov      eax, dword ptr [reloc classVar[0xd1ffab1e]]
       add      eax, 0xD1FFAB1E
       add      eax, 8
       imul     edx, esi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       imul     edx, edi, 0xD1FFAB1E
       add      edx, eax
       rol      edx, 17
       imul     eax, edx, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 15
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 13
       xor      eax, edx
       imul     eax, eax, 0xD1FFAB1E
       mov      edx, eax
       shr      edx, 16
       xor      eax, edx
						;; bbWeight=1    PerfScore 25.00

G_M19763_IG03:
       add      rsp, 40
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=1    PerfScore 2.25
; Total bytes of code: 141

Configuration

.NET build from main branch.

Author:	hez2010
Assignees:	-
Labels:	`tenet-performance`, `area-CodeGen-coreclr`, `untriaged`
Milestone:	-

EgorBo · 2022-03-04T16:48:05Z

The IL of ValueTuple<T1,T2>::GetHashCode is tricky:

weight= 10 : state   3 [ ldarg.0 ]
weight= 17 : state 110 [ ldflda ]
weight= 35 : state 189 [ ldloca.s.normed ]
weight= 55 : state 180 [ initobj ]
weight= 12 : state   7 [ ldloc.0 ]
weight=247 : state 117 [ box ]
weight= 25 : state  45 [ brtrue.s ]
weight= 29 : state 101 [ ldobj ]
weight=  6 : state  11 [ stloc.0 ]
weight= 35 : state 189 [ ldloca.s.normed ]
weight= 12 : state   7 [ ldloc.0 ]
weight=247 : state 117 [ box ]
weight= 25 : state  45 [ brtrue.s ]
weight=-24 : state  39 [ pop ]
weight= 15 : state  23 [ ldc.i4.0 ]
weight= 44 : state  43 [ br.s ]
weight=161 : state 190 [ constrained -> callvirt ]
weight= 10 : state   3 [ ldarg.0 ]
weight= 17 : state 110 [ ldflda ]
weight= 35 : state 189 [ ldloca.s.normed ]
weight= 55 : state 180 [ initobj ]
weight=  9 : state   8 [ ldloc.1 ]
weight=247 : state 117 [ box ]
weight= 25 : state  45 [ brtrue.s ]
weight= 29 : state 101 [ ldobj ]
weight= 34 : state  12 [ stloc.1 ]
weight= 35 : state 189 [ ldloca.s.normed ]
weight=  9 : state   8 [ ldloc.1 ]
weight=247 : state 117 [ box ]
weight= 25 : state  45 [ brtrue.s ]
weight=-24 : state  39 [ pop ]
weight= 15 : state  23 [ ldc.i4.0 ]
weight= 44 : state  43 [ br.s ]
weight=161 : state 190 [ constrained -> callvirt ]
weight= 79 : state  40 [ call ]
weight= 19 : state  42 [ ret ]

multiplier in methods of struct increased to 3.
2 ldfld or stfld over arguments which are structs.  Multiplier increased to 4.
Inline candidate looks like a wrapper method.  Multiplier increased to 5.
Inline candidate is generic and caller is not.  Multiplier increased to 7.
Inline has 4 foldable BOX ops.  Multiplier increased to 10.
Inline candidate callsite is boring.  Multiplier increased to 11.3.
calleeNativeSizeEstimate=2022
callsiteNativeSizeEstimate=85
benefit multiplier=11.3
threshold=960
Native estimate for function size exceeds threshold for inlining 202.2 > 96 (multiplier = 11.3)

inliner managed to figure out that BOX operations are foldable, but the benefit multiplier is still too low.

hez2010 · 2022-03-04T17:16:04Z

Also note that some unnecessary code are generated for M2 even with ValueTuple.GetHashCode inlined:

       mov      qword ptr [rsp+20H], rax
						;; bbWeight=1    PerfScore 3.50

G_M19763_IG02:
       mov      dword ptr [rsp+20H], edx
       mov      dword ptr [rsp+24H], r8d
       mov      esi, dword ptr [rsp+20H]
       lea      rcx, bword ptr [rsp+24H]
       mov      edi, dword ptr [rcx]

JulieLeeMSFT · 2022-03-07T17:52:04Z

@kunalspathak please check if your PR fixes the struct issue.

kunalspathak · 2022-03-11T17:29:47Z

#64130 handles patterns when a struct is returned through "return buffer". This pattern is not handled.

kunalspathak · 2022-03-11T17:30:53Z

In the original version of M2 where GetHashCode() is not inlined, we can avoid zero initializing the struct

mov      qword ptr [rsp+20H], rax

kunalspathak · 2022-07-08T18:18:51Z

This won't happen in .NET 7. Moving to .NET 8.

kunalspathak · 2023-06-05T17:49:13Z

@jakobbotsch - can you double check if you get this pattern with physical promotion?

jakobbotsch · 2023-06-06T09:10:13Z

(With aggressive inlining enabled) This and #87072 are the same underlying problem -- we end up with address exposure because Roslyn leaves the address of a struct live on the IL stack at the end of a basic block.

I will move this to future. Also we would still need the inliner to be more aggressive here, even with a fix for #87072 (cc @EgorBo).

jakobbotsch · 2024-05-31T09:55:31Z

With #102808 the codegen for M1 and M2 are now the same when DOTNET_JitAggressiveInlining=1.

jakobbotsch · 2024-05-31T09:56:08Z

Going to give this to @EgorBo since this is now a question of inlining.

hez2010 added the tenet-performance Performance related issue label Mar 4, 2022

dotnet-issue-labeler bot added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI untriaged New issue has not been triaged by the area owner labels Mar 4, 2022

JulieLeeMSFT assigned kunalspathak Mar 10, 2022

kunalspathak modified the milestones: Future, 7.0.0 Mar 11, 2022

jeffschwMSFT removed the untriaged New issue has not been triaged by the area owner label Mar 28, 2022

kunalspathak modified the milestones: 7.0.0, 8.0.0 Jul 8, 2022

kunalspathak assigned jakobbotsch and unassigned kunalspathak Jun 5, 2023

jakobbotsch modified the milestones: 8.0.0, Future Jun 6, 2023

jakobbotsch assigned EgorBo and unassigned jakobbotsch May 31, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Suboptimal code generated for ValueTuple.GetHashCode() #66197

Suboptimal code generated for ValueTuple.GetHashCode() #66197

hez2010 commented Mar 4, 2022 •

edited by BruceForstall

Loading

ghost commented Mar 4, 2022

Description

Configuration

EgorBo commented Mar 4, 2022

hez2010 commented Mar 4, 2022

JulieLeeMSFT commented Mar 7, 2022

kunalspathak commented Mar 11, 2022

kunalspathak commented Mar 11, 2022

kunalspathak commented Jul 8, 2022

kunalspathak commented Jun 5, 2023

jakobbotsch commented Jun 6, 2023 •

edited

Loading

jakobbotsch commented May 31, 2024

jakobbotsch commented May 31, 2024

Suboptimal code generated for ValueTuple.GetHashCode() #66197

Suboptimal code generated for ValueTuple.GetHashCode() #66197

Comments

hez2010 commented Mar 4, 2022 • edited by BruceForstall Loading

Description

Configuration

ghost commented Mar 4, 2022

Description

Configuration

EgorBo commented Mar 4, 2022

hez2010 commented Mar 4, 2022

JulieLeeMSFT commented Mar 7, 2022

kunalspathak commented Mar 11, 2022

kunalspathak commented Mar 11, 2022

kunalspathak commented Jul 8, 2022

kunalspathak commented Jun 5, 2023

jakobbotsch commented Jun 6, 2023 • edited Loading

jakobbotsch commented May 31, 2024

jakobbotsch commented May 31, 2024

hez2010 commented Mar 4, 2022 •

edited by BruceForstall

Loading

jakobbotsch commented Jun 6, 2023 •

edited

Loading