diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ebbd05e6d47af..a88b9895ebb7d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -665,9 +665,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
         m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
     return false;
 
-  // TODO: We could handle this with a length-changing shuffle.
   auto *VecTy = cast<FixedVectorType>(I.getType());
-  if (SrcVec->getType() != VecTy)
+  auto *ScalarTy = VecTy->getScalarType();
+  auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
+  if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
     return false;
 
   // Ignore bogus insert/extract index.
@@ -681,8 +682,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
   SmallVector<int> Mask(NumElts);
   std::iota(Mask.begin(), Mask.end(), 0);
   Mask[Index] = Index + NumElts;
-
-  Type *ScalarTy = VecTy->getScalarType();
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
       TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
@@ -697,14 +696,33 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
       TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
 
+  bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
+  // If the source and destination vectors have different lengths, we also
+  // need a length-changing shuffle; account for its cost.
+  SmallVector<int> SrcMask;
+  if (NeedLenChg) {
+    SrcMask.assign(NumElts, PoisonMaskElem);
+    SrcMask[Index] = Index;
+    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  SrcVecTy, SrcMask, CostKind);
+  }
+
   if (NewCost > OldCost)
     return false;
 
-  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
-  // shuffle DestVec, (fneg SrcVec), Mask
+  Value *NewShuf;
+  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
   Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
-  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
-  replaceValue(I, *Shuf);
+  if (NeedLenChg) {
+    // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
+    Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
+    NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
+  } else {
+    // shuffle DestVec, (fneg SrcVec), Mask
+    NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+  }
+
+  replaceValue(I, *NewShuf);
   return true;
 }
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index df5fcdb7beb65..83f94ba46a072 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %r
 }
 
+define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext0_v2f32v4f32(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %e = extractelement <2 x float> %x, i32 0
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 0
+  ret <4 x float> %r
+}
+
 ; Eliminating extract/insert is profitable.
 
 define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %r
 }
 
+define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext2_v2f32v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %e = extractelement <2 x float> %x, i32 2
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 2
+  ret <4 x float> %r
+}
+
 ; Eliminating extract/insert is still profitable. Flags propagate.
 
 define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -46,6 +72,25 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 
+define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
+; SSE-LABEL: @ext1_v2f64v4f64(
+; SSE-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
+; SSE-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
+; SSE-NEXT:    ret <4 x double> [[R]]
+;
+; AVX-LABEL: @ext1_v2f64v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; AVX-NEXT:    ret <4 x double> [[R]]
+;
+  %e = extractelement <2 x double> %x, i32 1
+  %n = fneg nsz double %e
+  %r = insertelement <4 x double> %y, double %n, i32 1
+  ret <4 x double> %r
+}
+
 ; The vector fneg would cost twice as much as the scalar op with SSE,
 ; so we don't transform there (the shuffle would also be more expensive).
 
@@ -67,6 +112,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @ext7_v4f32v8f32(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %e = extractelement <4 x float> %x, i32 3
+  %n = fneg float %e
+  %r = insertelement <8 x float> %y, float %n, i32 7
+  ret <8 x float> %r
+}
+
 ; Same as above with an extra use of the extracted element.
 
 define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
@@ -91,6 +149,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @ext7_v4f32v8f32_use1(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    call void @use(float [[E]])
+; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %e = extractelement <4 x float> %x, i32 3
+  call void @use(float %e)
+  %n = fneg float %e
+  %r = insertelement <8 x float> %y, float %n, i32 3
+  ret <8 x float> %r
+}
+
 ; Negative test - the transform is likely not profitable if the fneg has another use.
 
 define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
@@ -108,6 +181,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @ext7_v4f32v8f32_use2(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT:    call void @use(float [[N]])
+; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %e = extractelement <4 x float> %x, i32 3
+  %n = fneg float %e
+  call void @use(float %n)
+  %r = insertelement <8 x float> %y, float %n, i32 3
+  ret <8 x float> %r
+}
+
 ; Negative test - can't convert variable index to a shuffle.
 
 define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
@@ -123,6 +211,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
   ret <2 x double> %r
 }
 
+define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) {
+; CHECK-LABEL: @ext_index_var_v2f64v4f64(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %e = extractelement <2 x double> %x, i32 %index
+  %n = fneg nsz double %e
+  %r = insertelement <4 x double> %y, double %n, i32 %index
+  ret <4 x double> %r
+}
+
 ; Negative test - require same extract/insert index for simple shuffle.
 ; TODO: We could handle this by adjusting the cost calculation.
 
@@ -139,6 +240,33 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 
+; Negative test - the extract index exceeds the width of the destination vector.
+define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @ext3_v4f64v2f64(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %e = extractelement <4 x double> %x, i32 3
+  %n = fneg nsz double %e
+  %r = insertelement <2 x double> %y, double %n, i32 1
+  ret <2 x double> %r
+}
+
+define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %e = extractelement <2 x double> %x, i32 1
+  %n = fneg nsz double %e
+  %r = insertelement <4 x double> %y, double %n, i32 0
+  ret <4 x double> %r
+}
+
 ; Negative test - avoid changing poison ops
 
 define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
@@ -154,6 +282,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %r
 }
 
+define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext12_v2f32v4f32(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
+; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %e = extractelement <2 x float> %x, i32 6
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 12
+  ret <4 x float> %r
+}
+
 ; This used to crash because we assumed matching a true, unary fneg instruction.
 
 define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) {
@@ -181,3 +322,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
   %r = insertelement <2 x float> %y, float %s, i32 1
   ret <2 x float> %r
 }
+
+define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %e = extractelement <2 x float> %x, i32 1
+  %s = fsub nsz nnan float 0.0, %e
+  %r = insertelement <4 x float> %y, float %s, i32 1
+  ret <4 x float> %r
+}
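
For reference, here is a standalone reproducer for the new length-changing path, distilled from the ext1_v2f64v4f64 test above. The RUN lines are a sketch that mirrors the usual setup of extract-fneg-insert.ll (plain opt with -passes=vector-combine and SSE/AVX attributes); the exact -mattr strings and the @repro name are assumptions for illustration, not copied from the file header:

; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX

; Negate one lane of a narrow source and insert it into a wider destination.
define <4 x double> @repro(<2 x double> %x, <4 x double> %y) {
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 1
  ret <4 x double> %r
}

; With AVX the fold fires: the whole source is negated, widened by a
; single-source shuffle built from SrcMask, then selected into the
; destination with Mask:
;   %1 = fneg nsz <2 x double> %x
;   %2 = shufflevector <2 x double> %1, <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
;   %r = shufflevector <4 x double> %y, <4 x double> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; With SSE the cost model rejects the transform (the vector fneg costs twice
; the scalar op there), so the extract/fneg/insert sequence is left untouched.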