Skip to content

Commit

Permalink
[InstCombine] reverse 'trunc X to <N x i1>' canonicalization; 2nd try
Browse files Browse the repository at this point in the history
Re-trying r344082 because it unintentionally included extra diffs.

Original commit message:
icmp ne (and X, 1), 0 --> trunc X to N x i1

Ideally, we'd do the same for scalars, but there will likely be
regressions unless we add more trunc folds as we're doing here
for vectors.

The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {

  %c = fcmp ole <4 x float> %x, %y
  %s = sext <4 x i1> %c to <4 x i32>
  %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %cond = or <4 x i32> %s1, %s2
  %condtr = trunc <4 x i32> %cond to <4 x i1>
  %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
  ret <4 x float> %r

}

Here's a sampling of the vector codegen for that case using
mask+icmp (current behavior) vs. trunc (with this patch):

AVX before:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vandps  LCPI0_0(%rip), %xmm0, %xmm0
vxorps  %xmm1, %xmm1, %xmm1
vpcmpeqd        %xmm1, %xmm0, %xmm0
vblendvps       %xmm0, %xmm3, %xmm2, %xmm0

AVX after:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vblendvps       %xmm0, %xmm2, %xmm3, %xmm0

AVX512f before:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vpbroadcastd    LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
vptestnmd       %zmm1, %zmm0, %k1
vblendmps       %zmm3, %zmm2, %zmm0 {%k1}

AVX512f after:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vpslld  $31, %xmm0, %xmm0
vptestmd        %zmm0, %zmm0, %k1
vblendmps       %zmm2, %zmm3, %zmm0 {%k1}

AArch64 before:

fcmge   v0.4s, v1.4s, v0.4s
zip1    v1.4s, v0.4s, v0.4s
zip2    v0.4s, v0.4s, v0.4s
orr     v0.16b, v1.16b, v0.16b
movi    v1.4s, #1
and     v0.16b, v0.16b, v1.16b
cmeq    v0.4s, v0.4s, #0
bsl     v0.16b, v3.16b, v2.16b

AArch64 after:

fcmge   v0.4s, v1.4s, v0.4s
zip1    v1.4s, v0.4s, v0.4s
zip2    v0.4s, v0.4s, v0.4s
orr     v0.16b, v1.16b, v0.16b
bsl     v0.16b, v2.16b, v3.16b

PowerPC-le before:

xvcmpgesp 34, 35, 34
vspltisw 0, 1
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxlxor 35, 35, 35
xxland 34, 0, 32
vcmpequw 2, 2, 3
xxsel 34, 36, 37, 34

PowerPC-le after:

xvcmpgesp 34, 35, 34
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxsel 34, 37, 36, 0

Differential Revision: https://reviews.llvm.org/D52747

llvm-svn: 344181
  • Loading branch information
rotateright committed Oct 10, 2018
1 parent eff0542 commit 05aadf8
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 209 deletions.
31 changes: 27 additions & 4 deletions llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -706,12 +706,35 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (SimplifyDemandedInstructionBits(CI))
return &CI;

// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);
Src = Builder.CreateAnd(Src, One);
Value *Zero = Constant::getNullValue(Src->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
if (DestTy->isIntegerTy()) {
// Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
// TODO: We canonicalize to more instructions here because we are probably
// lacking equivalent analysis for trunc relative to icmp. There may also
// be codegen concerns. If those trunc limitations were removed, we could
// remove this transform.
Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}

// For vectors, we do not canonicalize all truncs to icmp, so optimize
// patterns that would be covered within visitICmpInst.
Value *X;
const APInt *C;
if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
// trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
m_Deferred(X))))) {
// trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
}

// FIXME: Maybe combine the next two transforms to handle the no cast case
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1609,6 +1609,13 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1) {
// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
// TODO: We canonicalize to the longer form for scalars because we have
// better analysis/folds for icmp, and codegen may be better with icmp.
if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
C1.isNullValue() && match(And->getOperand(1), m_One()))
return new TruncInst(And->getOperand(0), Cmp.getType());

const APInt *C2;
if (!match(And->getOperand(1), m_APInt(C2)))
return nullptr;
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/InstCombine/apint-shift.ll
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,8 @@ define i1 @test16(i84 %X) {

define <2 x i1> @test16vec(<2 x i84> %X) {
; CHECK-LABEL: @test16vec(
; CHECK-NEXT: [[AND:%.*]] = and <2 x i84> %X, <i84 16, i84 16>
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 16>
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shr = ashr <2 x i84> %X, <i84 4, i84 4>
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/Transforms/InstCombine/apint-shl-trunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@ define i1 @test1(i799 %X, i799 %A) {

define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
; CHECK-LABEL: @test0vec(
; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[D]]
;
%B = lshr <2 x i39> %X, %A
Expand Down
20 changes: 8 additions & 12 deletions llvm/test/Transforms/InstCombine/icmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2427,10 +2427,9 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {

define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @icmp_and_or_lshr_vec(
; CHECK-NEXT: [[SHF1:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[Y:%.*]]
; CHECK-NEXT: [[OR2:%.*]] = or <2 x i32> [[SHF1]], <i32 1, i32 1>
; CHECK-NEXT: [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%shf = lshr <2 x i32> %x, %y
Expand All @@ -2445,8 +2444,7 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) {
; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
Expand All @@ -2472,8 +2470,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) {

define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%shf = lshr <2 x i32> %x, <i32 1, i32 1>
Expand All @@ -2486,10 +2484,8 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], <i32 1, i32 1>
; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 3>
; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
%x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
Expand Down
19 changes: 7 additions & 12 deletions llvm/test/Transforms/InstCombine/vector-casts.ll
Original file line number Diff line number Diff line change
@@ -1,47 +1,42 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s

; This turns into a&1 != 0
; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high.
; This pattern does not appear to meet that standard.
; Can't get smaller than this.

define <2 x i1> @trunc(<2 x i64> %a) {
; CHECK-LABEL: @trunc(
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[T]]
;
%t = trunc <2 x i64> %a to <2 x i1>
ret <2 x i1> %t
}

; TODO: This could be just 1 instruction (trunc).
; This is trunc.

define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc(
; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%t = and <2 x i64> %a, <i64 1, i64 1>
%r = icmp ne <2 x i64> %t, zeroinitializer
ret <2 x i1> %r
}

; TODO: This could be just 1 instruction (trunc).
; This is trunc.

define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%t = and <2 x i64> %a, <i64 undef, i64 1>
%r = icmp ne <2 x i64> %t, zeroinitializer
ret <2 x i1> %r
}

; TODO: This could be just 1 instruction (trunc).
; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.

define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(
Expand Down
Loading

0 comments on commit 05aadf8

Please sign in to comment.