sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1> #38188

Closed

gnzlbg opened this issue Sep 5, 2018 · 15 comments

Labels: backend:X86, bugzilla (issues migrated from Bugzilla)
gnzlbg commented Sep 5, 2018

Bugzilla Link: 38840
Resolution: FIXED
Resolved on: Apr 28, 2019 04:01
Version: trunk
OS: All
CC: @topperc, @RKSimon, @rotateright
Fixed by commit(s): r359385, r359396

Extended Description

The llvm.experimental.vector.reduce.{and,or,xor} intrinsics produce very sub-optimal machine code on the x86 backend. See it live: https://gcc.godbolt.org/z/qIHi6D

LLVM-IR:

declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>)
declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>)
declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>)
declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>)

define i1 @and128_x2(<2 x i64>) {
  %a = trunc <2 x i64> %0 to <2 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
  ret i1 %b
}
define i1 @and_x4(<4 x i32>) {
  %a = trunc <4 x i32> %0 to <4 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
  ret i1 %b
}
define i1 @and128_x8(<8 x i8>) {
  %a = trunc <8 x i8> %0 to <8 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
  ret i1 %b
}
define i1 @and256_x4(<4 x i64>) {
  %a = trunc <4 x i64> %0 to <4 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
  ret i1 %b
}
define i1 @and_x8(<8 x i32>) {
  %a = trunc <8 x i32> %0 to <8 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
  ret i1 %b
}
define i1 @and256_x32(<32 x i8>) {
  %a = trunc <32 x i8> %0 to <32 x i1>
  %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
  ret i1 %b
}

produces

and128_x2: # @and128_x2
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and_x4: # @and_x4
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movd %xmm0, %eax
retq
and128_x8: # @and128_x8
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and256_x4: # @and256_x4
shufps $136, %xmm1, %xmm0 # xmm0 = xmm0[0,2],xmm1[0,2]
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movd %xmm0, %eax
retq
and256_x8: # @and_x8
pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7]
pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7]
pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and256_x32: # @and256_x32
pand %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrlw $8, %xmm0
pand %xmm1, %xmm0
movd %xmm0, %eax
retq

but these should all lower to a single movmsk instruction:

and128_x2:
movmskpd %xmm0, %eax
retq
and128_x4:
movmskps %xmm0, %eax
retq
and128_x8:
pmovmskb %xmm0, %eax
retq
and256_x4:
vmovmskpd %ymm0, %eax
vzeroupper
retq
and256_x8:
vmovmskps %ymm0, %eax
vzeroupper
retq
and256_x32:
vpmovmskb %ymm0, %eax
vzeroupper
retq

The llvm.experimental.vector.reduce.and intrinsic probably produces very sub-optimal machine code for i1 vectors obtained from <8 x i16>, <16 x i16>, <1 x i128>, <2 x i128>, etc. as well.

The llvm.experimental.vector.reduce.or and llvm.experimental.vector.reduce.xor intrinsics probably produce very sub-optimal machine code for all of these i1 vectors too.

These LLVM intrinsics are critical for efficiently implementing coherent control flow (all-lanes/any-lane checks on vector masks).
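
For illustration, here is a minimal sketch of the kind of "coherent control flow" check these reductions enable: branch to a uniform path only when every lane of a mask is set. The function and label names are made up for this sketch and are not part of the original report:

define void @do_if_all_lanes(<4 x i32> %mask) {
entry:
  %m = icmp ne <4 x i32> %mask, zeroinitializer
  %all = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %m)
  br i1 %all, label %uniform, label %divergent
uniform:
  ; taken only when all lanes agree, so this path can assume a full mask
  ret void
divergent:
  ret void
}
declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>)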

gnzlbg commented Sep 5, 2018

assigned to @RKSimon

topperc commented Sep 5, 2018

I don’t think I understand how any of these can be done with a single movmsk. Movmsk collects the sign bits of the elements into adjacent bits of a GPR. How is that equivalent to an i1 result?

topperc commented Sep 5, 2018

Also, it’s a truncate to vXi1 in the IR, which would only grab the LSB, while movmsk grabs the MSB. So without knowing something about the sign bits, those wouldn’t be interchangeable either.

RKSimon commented Sep 5, 2018

Modified godbolt that uses comparisons: https://gcc.godbolt.org/z/ioTWEX
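
Concretely, "uses comparisons" means producing the <N x i1> mask with an icmp rather than a trunc, so each lane is all-zeros or all-ones and the sign bit carries the lane value, which is the form a MOVMSK-based lowering can use. A minimal sketch, not the exact IR behind the godbolt link:

declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>)

define i1 @and128_x4_cmp(<4 x i32>) {
  %a = icmp ne <4 x i32> %0, zeroinitializer
  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
  ret i1 %b
}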

gnzlbg commented Sep 5, 2018

I don’t think I understand how any of these can be done with a single movmsk. Movmsk collects the sign bits of the elements into adjacent bits of a GPR. How is that equivalent to an i1 result?

Oops, it isn't! The result of movmsk would need to be compared against the appropriate integer value (the one in which the first N bits are set). The result of that comparison is equivalent to the i1 that the vector.reduce.and intrinsic returns.

That is, IIUC, the correct assembly would be:

and128_x4:
movmskps %xmm0, %eax
cmpl $15, %eax
sete %al
ret

which is ~1.25 cycles, and much better than what's currently being produced:

and128_x4:
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movd %xmm0, %eax
retq

which takes ~4 cycles on SKL.

topperc commented Sep 5, 2018

Why not just put a bitcast and icmp in IR? The reduction intrinsics are not really intended for i1 elements.

gnzlbg commented Sep 5, 2018

Why not just put a bitcast and icmp in IR? The reduction intrinsics are not really intended for i1 elements.

I am not sure if the following is equivalent, but if the problem is i1 then I can also use i32:

declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)

define i1 @and256_x8(<8 x i32>) {
  %a = trunc <8 x i32> %0 to <8 x i1>
  %b = sext <8 x i1> %a to <8 x i32>
  %c = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %b)
  %d = trunc i32 %c to i1
  ret i1 %d
}

That code truncates to a vector of i1, and then sign-extends it back to i32, so that movmsk becomes valid IIUC. It still produces:

and256_x8: # @and256_x8
vpslld $31, %ymm0, %ymm0
vpsrad $31, %ymm0, %ymm0
vmovmskps %ymm0, %eax
xorl %ecx, %ecx
cmpl $255, %eax
movl $-1, %eax
cmovnel %ecx, %eax
vzeroupper
retq

gnzlbg commented Sep 5, 2018

Or did you mean, with the icmp, that I should do something like this?

declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)

define i1 @and256_x8(<8 x i32>) {
  %a = trunc <8 x i32> %0 to <8 x i1>
  %b = sext <8 x i1> %a to <8 x i32>
  %c = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %b)
  %d = icmp eq i32 %c, 255
  ret i1 %d
}

and256_x8: # @and256_x8
vpslld $31, %ymm0, %ymm0
vpsrad $31, %ymm0, %ymm0
vmovmskps %ymm0, %eax
xorl %ecx, %ecx
cmpl $255, %eax
movl $-1, %eax
cmovnel %ecx, %eax
cmpl $255, %eax
sete %al
vzeroupper
retq

topperc commented Sep 5, 2018

I meant don’t use the intrinsic at all. Truncate to vXi1. Bitcast to iX. icmp with 2^X-1. Sorry I’m writing from my phone so it’s hard to write IR or test myself at the moment.

gnzlbg commented Sep 5, 2018

Sorry I’m writing from my phone so it’s hard to write IR or test myself at the moment.

Don't worry! And sincere thanks for trying to help me get to the bottom of this!

So let's see if I got this right:

define i1 @and256_x8(<8 x i32>) {
  %a = trunc <8 x i32> %0 to <8 x i1>
  %b = bitcast <8 x i1> %a to i8
  %d = icmp eq i8 %b, 255
  ret i1 %d
}
define i1 @and256_x8_opt(<8 x i32>) {
  %2 = and <8 x i32> %0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %a = icmp ne <8 x i32> %2, zeroinitializer
  %b = bitcast <8 x i1> %a to i8
  %d = icmp eq i8 %b, -1
  ret i1 %d
}

where the second version is the result of passing the first one through opt -O3. They produce the following machine code, which isn't great either AFAICT:

.LCPI0_0:
.byte 0 # 0x0
.byte 1 # 0x1
.byte 4 # 0x4
.byte 5 # 0x5
.byte 8 # 0x8
.byte 9 # 0x9
.byte 12 # 0xc
.byte 13 # 0xd
.byte 8 # 0x8
.byte 9 # 0x9
.byte 12 # 0xc
.byte 13 # 0xd
.byte 12 # 0xc
.byte 13 # 0xd
.byte 14 # 0xe
.byte 15 # 0xf
.byte 16 # 0x10
.byte 17 # 0x11
.byte 20 # 0x14
.byte 21 # 0x15
.byte 24 # 0x18
.byte 25 # 0x19
.byte 28 # 0x1c
.byte 29 # 0x1d
.byte 24 # 0x18
.byte 25 # 0x19
.byte 28 # 0x1c
.byte 29 # 0x1d
.byte 28 # 0x1c
.byte 29 # 0x1d
.byte 30 # 0x1e
.byte 31 # 0x1f
and256_x8: # @and256_x8
vpshufb .LCPI0_0(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
vpermq $232, %ymm0, %ymm0 # ymm0 = ymm0[0,2,2,3]
vpsllw $15, %xmm0, %xmm0
vpsraw $15, %xmm0, %xmm0
vpacksswb %xmm0, %xmm0, %xmm0
vpmovmskb %xmm0, %eax
cmpb $-1, %al
sete %al
vzeroupper
retq
.LCPI1_0:
.long 1 # 0x1
and256_x8_opt: # @and256_x8_opt
vpbroadcastd .LCPI1_0(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1]
vpand %ymm1, %ymm0, %ymm0
vpcmpeqd %ymm1, %ymm0, %ymm0
vmovmskps %ymm0, %eax
cmpb $-1, %al
sete %al
vzeroupper
retq

topperc commented Sep 10, 2018

I'm working on a patch that will turn the O3 form into this:

and256_x8_opt: # @and256_x8_opt
.cfi_startproc
# %bb.0:
    vpslld $31, %ymm0, %ymm0
    vmovmskps %ymm0, %eax
    cmpb $-1, %al
    sete %al
    vzeroupper
    retq

RKSimon commented Apr 27, 2019

Most of this looks good in trunk now; test cases were added at rL359385.

Using @llvm.experimental.vector.reduce.* for and/or now uses MOVMSK pretty efficiently, and the bitcast variants also optimize well.

The only outstanding issue is the @llvm.experimental.vector.reduce.xor.vXi1 reduction, which isn't currently covered by combineHorizontalPredicateResult; a MOVMSK followed by a parity check should work correctly for that.
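
For concreteness, here is a minimal sketch of the kind of IR that hits the missing xor path (illustrative only, not one of the committed test cases). The expected lowering would be a MOVMSK of the compare mask followed by a parity test of the resulting bits (e.g. test + setnp), analogous to the and/or cases:

declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>)

define i1 @xor128_x4(<4 x i32>) {
  %a = icmp ne <4 x i32> %0, zeroinitializer
  %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a)
  ret i1 %b
}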

RKSimon commented Apr 27, 2019

xor bool reduction: https://reviews.llvm.org/D61230

RKSimon commented Apr 28, 2019

xor bool reduction: https://reviews.llvm.org/D61230

Committed at rL359396

calebzulawski mentioned this issue in llvm/llvm-bugzilla-archive#51122

llvmbot transferred this issue from llvm/llvm-bugzilla-archive on Dec 10, 2021
This issue was closed.