Skip to content

Commit

Permalink
[InstSimplify] fold FP rounding intrinsic with rounded operand
Browse files Browse the repository at this point in the history
issue #56775

I rearranged the Thumb2 codegen test to avoid simplifying the chain
of rounding instructions. I'm assuming the intent of the test is
to verify lowering of each of those intrinsics.
  • Loading branch information
rotateright committed Jul 31, 2022
1 parent ba29549 commit 02b3a35
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 127 deletions.
47 changes: 30 additions & 17 deletions llvm/lib/Analysis/InstructionSimplify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5573,6 +5573,24 @@ static bool isIdempotent(Intrinsic::ID ID) {
}
}

/// Return true if the intrinsic rounds a floating-point value to an integral
/// floating-point value (not an integer type).
static bool removesFPFraction(Intrinsic::ID ID) {
  // These are exactly the FP->FP rounding intrinsics: each maps its operand
  // to a value with no fractional part (or propagates inf/nan), so applying
  // any of them to an already-integral FP value is a no-op.
  return ID == Intrinsic::floor || ID == Intrinsic::ceil ||
         ID == Intrinsic::trunc || ID == Intrinsic::rint ||
         ID == Intrinsic::nearbyint || ID == Intrinsic::round ||
         ID == Intrinsic::roundeven;
}

static Value *simplifyRelativeLoad(Constant *Ptr, Constant *Offset,
const DataLayout &DL) {
GlobalValue *PtrSym;
Expand Down Expand Up @@ -5638,6 +5656,18 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
if (II->getIntrinsicID() == IID)
return II;

if (removesFPFraction(IID)) {
// Converting from int or calling a rounding function always results in a
// finite integral number or infinity. For those inputs, rounding functions
// always return the same value, so the (2nd) rounding is eliminated. Ex:
// floor (sitofp x) -> sitofp x
// round (ceil x) -> ceil x
auto *II = dyn_cast<IntrinsicInst>(Op0);
if ((II && removesFPFraction(II->getIntrinsicID())) ||
match(Op0, m_SIToFP(m_Value())) || match(Op0, m_UIToFP(m_Value())))
return Op0;
}

Value *X;
switch (IID) {
case Intrinsic::fabs:
Expand Down Expand Up @@ -5695,23 +5725,6 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), m_Value(X))))
return X;
break;
case Intrinsic::floor:
case Intrinsic::trunc:
case Intrinsic::ceil:
case Intrinsic::round:
case Intrinsic::roundeven:
case Intrinsic::nearbyint:
case Intrinsic::rint: {
// floor (sitofp x) -> sitofp x
// floor (uitofp x) -> uitofp x
//
// Converting from int always results in a finite integral number or
// infinity. For either of those inputs, these rounding functions always
// return the same value, so the rounding can be eliminated.
if (match(Op0, m_SIToFP(m_Value())) || match(Op0, m_UIToFP(m_Value())))
return Op0;
break;
}
case Intrinsic::experimental_vector_reverse:
// experimental.vector.reverse(experimental.vector.reverse(x)) -> x
if (match(Op0,
Expand Down
52 changes: 26 additions & 26 deletions llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
Original file line number Diff line number Diff line change
Expand Up @@ -578,46 +578,46 @@ entry:
define arm_aapcs_vfpcc <8 x half> @ext_fpintrinsics_trunc_half(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: ext_fpintrinsics_trunc_half:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vcvtb.f32.f16 q2, q0
; CHECK-NEXT: vcvtb.f32.f16 q4, q1
; CHECK-NEXT: vabs.f32 q3, q2
; CHECK-NEXT: vcvtt.f32.f16 q0, q0
; CHECK-NEXT: vrintm.f32 q3, q2
; CHECK-NEXT: vrintx.f32 q5, q4
; CHECK-NEXT: vabs.f32 q3, q3
; CHECK-NEXT: vrinta.f32 q4, q4
; CHECK-NEXT: vminnm.f32 q3, q3, q2
; CHECK-NEXT: vrintp.f32 q2, q2
; CHECK-NEXT: vmaxnm.f32 q3, q3, q5
; CHECK-NEXT: vcvtt.f32.f16 q0, q0
; CHECK-NEXT: vfma.f32 q2, q3, q4
; CHECK-NEXT: vrintm.f32 q3, q0
; CHECK-NEXT: vabs.f32 q3, q3
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
; CHECK-NEXT: vmaxnm.f32 q3, q3, q4
; CHECK-NEXT: vfma.f32 q4, q3, q2
; CHECK-NEXT: vabs.f32 q3, q0
; CHECK-NEXT: vminnm.f32 q3, q3, q0
; CHECK-NEXT: vrintp.f32 q2, q4
; CHECK-NEXT: vmaxnm.f32 q3, q3, q1
; CHECK-NEXT: vrintm.f32 q2, q2
; CHECK-NEXT: vfma.f32 q1, q3, q0
; CHECK-NEXT: vrintx.f32 q2, q2
; CHECK-NEXT: vrintp.f32 q0, q1
; CHECK-NEXT: vrinta.f32 q2, q2
; CHECK-NEXT: vrintm.f32 q0, q0
; CHECK-NEXT: vrintx.f32 q4, q1
; CHECK-NEXT: vmaxnm.f32 q3, q3, q4
; CHECK-NEXT: vrinta.f32 q1, q1
; CHECK-NEXT: vrintp.f32 q0, q0
; CHECK-NEXT: vrintz.f32 q2, q2
; CHECK-NEXT: vrintx.f32 q0, q0
; CHECK-NEXT: vrinta.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q3, q1
; CHECK-NEXT: vrintz.f32 q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q0, q2
; CHECK-NEXT: vcvtt.f16.f32 q0, q1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%sa = fpext <8 x half> %a to <8 x float>
%sb = fpext <8 x half> %b to <8 x float>
%abs = call <8 x float> @llvm.fabs.v8f32(<8 x float> %sa)
%floor = call <8 x float> @llvm.floor.v8f32(<8 x float> %sa)
%rint = call <8 x float> @llvm.rint.v8f32(<8 x float> %sb)
%ceil = call <8 x float> @llvm.ceil.v8f32(<8 x float> %sa)
%round = call <8 x float> @llvm.round.v8f32(<8 x float> %sb)
%abs = call <8 x float> @llvm.fabs.v8f32(<8 x float> %floor)
%min = call <8 x float> @llvm.minnum.v8f32(<8 x float> %abs, <8 x float> %sa)
%max = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %min, <8 x float> %sb)
%fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %max, <8 x float> %sa, <8 x float> %sb)
%ceil = call <8 x float> @llvm.ceil.v8f32(<8 x float> %fma)
%floor = call <8 x float> @llvm.floor.v8f32(<8 x float> %ceil)
%rint = call <8 x float> @llvm.rint.v8f32(<8 x float> %floor)
%round = call <8 x float> @llvm.round.v8f32(<8 x float> %rint)
%trunc = call <8 x float> @llvm.trunc.v8f32(<8 x float> %round)
%max = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %min, <8 x float> %rint)
%fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %max, <8 x float> %round, <8 x float> %ceil)
%trunc = call <8 x float> @llvm.trunc.v8f32(<8 x float> %fma)
%t = fptrunc <8 x float> %trunc to <8 x half>
ret <8 x half> %t
}
Expand Down
Loading

0 comments on commit 02b3a35

Please sign in to comment.