[InstCombine] reverse 'trunc X to <N x i1>' canonicalization; 2nd try

Re-trying r344082 because it unintentionally included extra diffs.

Original commit message:
icmp ne (and X, 1), 0 --> trunc X to N x i1

Ideally, we'd do the same for scalars, but there will likely be regressions unless we add more trunc folds as we're doing here for vectors.

The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
  %c = fcmp ole <4 x float> %x, %y
  %s = sext <4 x i1> %c to <4 x i32>
  %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %cond = or <4 x i32> %s1, %s2
  %condtr = trunc <4 x i32> %cond to <4 x i1>
  %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
  ret <4 x float> %r
}

Here's a sampling of the vector codegen for that case using mask+icmp (current behavior) vs. trunc (with this patch):

AVX before:
  vcmpleps %xmm1, %xmm0, %xmm0
  vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps %xmm0, %xmm1, %xmm0
  vandps LCPI0_0(%rip), %xmm0, %xmm0
  vxorps %xmm1, %xmm1, %xmm1
  vpcmpeqd %xmm1, %xmm0, %xmm0
  vblendvps %xmm0, %xmm3, %xmm2, %xmm0

AVX after:
  vcmpleps %xmm1, %xmm0, %xmm0
  vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps %xmm0, %xmm1, %xmm0
  vblendvps %xmm0, %xmm2, %xmm3, %xmm0

AVX512f before:
  vcmpleps %xmm1, %xmm0, %xmm0
  vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps %xmm0, %xmm1, %xmm0
  vpbroadcastd LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
  vptestnmd %zmm1, %zmm0, %k1
  vblendmps %zmm3, %zmm2, %zmm0 {%k1}

AVX512f after:
  vcmpleps %xmm1, %xmm0, %xmm0
  vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps %xmm0, %xmm1, %xmm0
  vpslld $31, %xmm0, %xmm0
  vptestmd %zmm0, %zmm0, %k1
  vblendmps %zmm2, %zmm3, %zmm0 {%k1}

AArch64 before:
  fcmge v0.4s, v1.4s, v0.4s
  zip1 v1.4s, v0.4s, v0.4s
  zip2 v0.4s, v0.4s, v0.4s
  orr v0.16b, v1.16b, v0.16b
  movi v1.4s, #1
  and v0.16b, v0.16b, v1.16b
  cmeq v0.4s, v0.4s, #0
  bsl v0.16b, v3.16b, v2.16b

AArch64 after:
  fcmge v0.4s, v1.4s, v0.4s
  zip1 v1.4s, v0.4s, v0.4s
  zip2 v0.4s, v0.4s, v0.4s
  orr v0.16b, v1.16b, v0.16b
  bsl v0.16b, v2.16b, v3.16b

PowerPC-le before:
  xvcmpgesp 34, 35, 34
  vspltisw 0, 1
  vmrglw 3, 2, 2
  vmrghw 2, 2, 2
  xxlor 0, 35, 34
  xxlxor 35, 35, 35
  xxland 34, 0, 32
  vcmpequw 2, 2, 3
  xxsel 34, 36, 37, 34

PowerPC-le after:
  xvcmpgesp 34, 35, 34
  vmrglw 3, 2, 2
  vmrghw 2, 2, 2
  xxlor 0, 35, 34
  xxsel 34, 37, 36, 0

Differential Revision: https://reviews.llvm.org/D52747
llvm-svn: 344181
draperlaboratory · Oct 10, 2018 · 05aadf8 · 05aadf8
1 parent eff0542
commit 05aadf8
Show file tree

Hide file tree

Showing 7 changed files with 213 additions and 209 deletions.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,12 +706,35 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
-  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(SrcTy, 1);
-    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+    if (DestTy->isIntegerTy()) {
+      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+      // TODO: We canonicalize to more instructions here because we are probably
+      // lacking equivalent analysis for trunc relative to icmp. There may also
+      // be codegen concerns. If those trunc limitations were removed, we could
+      // remove this transform.
+      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+
+    // For vectors, we do not canonicalize all truncs to icmp, so optimize
+    // patterns that would be covered within visitICmpInst.
+    Value *X;
+    const APInt *C;
+    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+                                   m_Deferred(X))))) {
+      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
   }
 
   // FIXME: Maybe combine the next two transforms to handle the no cast case

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1609,6 +1609,13 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
 Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
                                                  BinaryOperator *And,
                                                  const APInt &C1) {
+  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+  // TODO: We canonicalize to the longer form for scalars because we have
+  // better analysis/folds for icmp, and codegen may be better with icmp.
+  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
+      C1.isNullValue() && match(And->getOperand(1), m_One()))
+    return new TruncInst(And->getOperand(0), Cmp.getType());
+
   const APInt *C2;
   if (!match(And->getOperand(1), m_APInt(C2)))
     return nullptr;

diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll
@@ -319,8 +319,8 @@ define i1 @test16(i84 %X) {
 
 define <2 x i1> @test16vec(<2 x i84> %X) {
 ; CHECK-LABEL: @test16vec(
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i84> %X, <i84 16, i84 16>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 16>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %shr = ashr <2 x i84> %X, <i84 4, i84 4>

diff --git a/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll b/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll
@@ -27,9 +27,8 @@ define i1 @test1(i799 %X, i799 %A) {
 
 define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
 ; CHECK-LABEL: @test0vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %B = lshr <2 x i39> %X, %A

diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -2427,10 +2427,9 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
 
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
-; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[Y:%.*]]
-; CHECK-NEXT:    [[OR2:%.*]] = or <2 x i32> [[SHF1]], <i32 1, i32 1>
-; CHECK-NEXT:    [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
+; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, %y
@@ -2445,8 +2444,7 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) {
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
@@ -2472,8 +2470,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) {
 
 define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
-; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, <i32 1, i32 1>
@@ -2486,10 +2484,8 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
-; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], <i32 1, i32 1>
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 3>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization

diff --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll
@@ -1,47 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; This turns into a&1 != 0
-; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. 
-; This pattern does not appear to meet that standard.
+; Can't get smaller than this.
 
 define <2 x i1> @trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[T]]
 ;
   %t = trunc <2 x i64> %a to <2 x i1>
   ret <2 x i1> %t
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; This is trunc.
 
 define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc(
-; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 1, i64 1>
   %r = icmp ne <2 x i64> %t, zeroinitializer
   ret <2 x i1> %r
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; This is trunc.
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
-; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 undef, i64 1>
   %r = icmp ne <2 x i64> %t, zeroinitializer
   ret <2 x i1> %r
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(