From 26466ce804ac523b398608f17388eb6d605a3f09 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 11 Jul 2024 08:57:33 -0700 Subject: [PATCH] [SLP]Improve/fix extracts calculations for non-power-of-2 elements. Change-Id: I6ea9a21eba83034bb01bb1ab9aabb2b97b0d40c2 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 127 ++++++++++----- .../AMDGPU/add_sub_sat-inseltpoison.ll | 150 +++++++++++++++++- .../SLPVectorizer/AMDGPU/add_sub_sat.ll | 150 +++++++++++++++++- .../SLPVectorizer/AMDGPU/reduction.ll | 23 +-- .../SLPVectorizer/RISCV/math-function.ll | 144 +++++++---------- .../X86/alternate-calls-inseltpoison.ll | 74 +++++++-- .../SLPVectorizer/X86/alternate-calls.ll | 74 +++++++-- .../SLPVectorizer/X86/hadd-inseltpoison.ll | 75 ++------- .../test/Transforms/SLPVectorizer/X86/hadd.ll | 75 ++------- .../SLPVectorizer/X86/scalarazied-result.ll | 4 - 10 files changed, 593 insertions(+), 303 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 71c72ccac9da3c..fae2aabe464409 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -239,6 +239,21 @@ static std::string shortBundleName(ArrayRef VL) { } #endif +/// Returns power-of-2 number of elements in a single register (part), given the +/// total number of elements \p Size and number of registers (parts) \p +/// NumParts. +static unsigned getPartNumElems(unsigned Size, unsigned NumParts) { + return PowerOf2Ceil(divideCeil(Size, NumParts)); +} + +/// Returns correct remaining number of elements, considering total amount \p +/// Size, (power-of-2 number) of elements in a single register \p PartNumElems +/// and current register (part) \p Part. +static unsigned getNumElems(unsigned Size, unsigned PartNumElems, + unsigned Part) { + return std::min(PartNumElems, Size - Part * PartNumElems); +} + /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. static bool allSameBlock(ArrayRef VL) { @@ -7139,14 +7154,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (NumSrcRegs == 0) NumSrcRegs = 1; // FIXME: this must be moved to TTI for better estimation. - unsigned EltsPerVector = PowerOf2Ceil(std::max( - divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts); auto CheckPerRegistersShuffle = - [&](MutableArrayRef Mask) -> std::optional { + [&](MutableArrayRef Mask, + SmallVector Indices) -> std::optional { + if (NumElts <= EltsPerVector) + return std::nullopt; DenseSet RegIndices; // Check that if trying to permute same single/2 input vectors. TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; int FirstRegId = -1; + Indices.assign(1, -1); for (int &I : Mask) { if (I == PoisonMaskElem) continue; @@ -7156,8 +7174,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { RegIndices.insert(RegId); if (RegIndices.size() > 2) return std::nullopt; - if (RegIndices.size() == 2) + if (RegIndices.size() == 2) { ShuffleKind = TTI::SK_PermuteTwoSrc; + if (Indices.size() == 1) + Indices.push_back(-1); + } + if (RegId == FirstRegId) + Indices.front() = I % NumElts; + else + Indices.back() = I % NumElts; I = (I % NumElts) % EltsPerVector + (RegId == FirstRegId ? 0 : EltsPerVector); } @@ -7168,22 +7193,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - for (unsigned Part = 0; Part < NumParts; ++Part) { + for (unsigned Part : seq(NumParts)) { if (!ShuffleKinds[Part]) continue; - ArrayRef MaskSlice = - Mask.slice(Part * EltsPerVector, - (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) - ? Mask.size() % EltsPerVector - : EltsPerVector); + ArrayRef MaskSlice = Mask.slice( + Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part)); SmallVector SubMask(EltsPerVector, PoisonMaskElem); copy(MaskSlice, SubMask.begin()); + SmallVector Indices; std::optional RegShuffleKind = - CheckPerRegistersShuffle(SubMask); + CheckPerRegistersShuffle(SubMask, Indices); if (!RegShuffleKind) { - Cost += ::getShuffleCost( - TTI, *ShuffleKinds[Part], - FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); + if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask( + MaskSlice, std::max(NumElts, MaskSlice.size()))) + Cost += ::getShuffleCost( + TTI, *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || @@ -7193,6 +7219,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { FixedVectorType::get(VL.front()->getType(), EltsPerVector), SubMask); } + for (int Idx : Indices) { + Cost += ::getShuffleCost( + TTI, TTI::SK_ExtractSubvector, + FixedVectorType::get(VL.front()->getType(), NumElts), std::nullopt, + CostKind, Idx, + FixedVectorType::get(VL.front()->getType(), EltsPerVector)); + } } return Cost; } @@ -7220,11 +7253,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InVectors.front().get() == &E1 && InVectors.back().get() == E2) || (!E2 && InVectors.front().get() == &E1)) { - assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + unsigned Limit = getNumElems(Mask.size(), SliceSize, Part); + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), [](int Idx) { return Idx == PoisonMaskElem; }) && "Expected all poisoned elements."); - ArrayRef SubMask = - ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + ArrayRef SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit); copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); return; } @@ -7465,10 +7498,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }); }); SmallPtrSet UniqueBases; - unsigned SliceSize = VL.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { - ArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); - for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + unsigned SliceSize = getPartNumElems(VL.size(), NumParts); + for (unsigned Part : seq(NumParts)) { + unsigned Limit = getNumElems(VL.size(), SliceSize, Part); + ArrayRef SubMask = Mask.slice(Part * SliceSize, Limit); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) { // Ignore non-extractelement scalars. if (isa(V) || (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) @@ -7561,7 +7595,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; - unsigned SliceSize = Mask.size() / NumParts; + unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); const auto *It = find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); unsigned Part = std::distance(Mask.begin(), It) / SliceSize; @@ -7579,7 +7613,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; - unsigned SliceSize = Mask.size() / NumParts; + unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); const auto *It = find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); unsigned Part = std::distance(Mask.begin(), It) / SliceSize; @@ -9339,12 +9373,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl &VL, assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); SmallVector> ShufflesRes(NumParts); Mask.assign(VL.size(), PoisonMaskElem); - unsigned SliceSize = VL.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { + unsigned SliceSize = getPartNumElems(VL.size(), NumParts); + for (unsigned Part : seq(NumParts)) { // Scan list of gathered scalars for extractelements that can be represented // as shuffles. - MutableArrayRef SubVL = - MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + MutableArrayRef SubVL = MutableArrayRef(VL).slice( + Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); SmallVector SubMask; std::optional Res = tryToGatherSingleRegisterExtractElements(SubVL, SubMask); @@ -9730,10 +9764,11 @@ BoUpSLP::isGatherShuffledEntry( "Expected only single user of the gather node."); assert(VL.size() % NumParts == 0 && "Number of scalars must be divisible by NumParts."); - unsigned SliceSize = VL.size() / NumParts; + unsigned SliceSize = getPartNumElems(VL.size(), NumParts); SmallVector> Res; - for (unsigned Part = 0; Part < NumParts; ++Part) { - ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); + for (unsigned Part : seq(NumParts)) { + ArrayRef SubVL = + VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); SmallVectorImpl &SubEntries = Entries.emplace_back(); std::optional SubRes = isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); @@ -10250,11 +10285,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { // into a long virtual vector register, forming the original vector. Value *Vec = nullptr; SmallVector VecMask(Mask.size(), PoisonMaskElem); - unsigned SliceSize = E->Scalars.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { + unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts); + for (unsigned Part : seq(NumParts)) { + unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part); ArrayRef VL = - ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); - MutableArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + ArrayRef(E->Scalars).slice(Part * SliceSize, Limit); + MutableArrayRef SubMask = Mask.slice(Part * SliceSize, Limit); constexpr int MaxBases = 2; SmallVector Bases(MaxBases); #ifndef NDEBUG @@ -10290,7 +10326,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { assert((Part == 0 || all_of(seq(0, Part), [&](unsigned P) { ArrayRef SubMask = - Mask.slice(P * SliceSize, SliceSize); + Mask.slice(P * SliceSize, + getNumElems(Mask.size(), + SliceSize, P)); return all_of(SubMask, [](int Idx) { return Idx == PoisonMaskElem; }); @@ -10663,13 +10701,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Idx == 0) || (Mask.size() == InputVF && ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { - std::iota(std::next(Mask.begin(), I * SliceSize), - std::next(Mask.begin(), (I + 1) * SliceSize), 0); + std::iota( + std::next(Mask.begin(), I * SliceSize), + std::next(Mask.begin(), + I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), + 0); } else { unsigned IVal = *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); - std::fill(std::next(Mask.begin(), I * SliceSize), - std::next(Mask.begin(), (I + 1) * SliceSize), IVal); + std::fill( + std::next(Mask.begin(), I * SliceSize), + std::next(Mask.begin(), + I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), + IVal); } return true; }; @@ -10930,7 +10974,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } if (!GatherShuffles.empty()) { - unsigned SliceSize = E->Scalars.size() / NumParts; + unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts); SmallVector VecMask(Mask.size(), PoisonMaskElem); for (const auto [I, TEs] : enumerate(Entries)) { if (TEs.empty()) { @@ -10940,7 +10984,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } assert((TEs.size() == 1 || TEs.size() == 2) && "Expected shuffle of 1 or 2 entries."); - auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + unsigned Limit = getNumElems(Mask.size(), SliceSize, I); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit); VecMask.assign(VecMask.size(), PoisonMaskElem); copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); if (TEs.size() == 1) { diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 2cd29c52411867..ddb2f6c40bbddf 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -156,6 +156,42 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @uadd_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @uadd_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @uadd_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -181,6 +217,42 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @usub_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @usub_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @usub_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -206,6 +278,42 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @sadd_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @sadd_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @sadd_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -231,6 +339,42 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @ssub_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @ssub_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @ssub_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 4653e47f4a5f36..1414a71321843a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -156,6 +156,42 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @uadd_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @uadd_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @uadd_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -181,6 +217,42 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @usub_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @usub_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @usub_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -206,6 +278,42 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @sadd_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @sadd_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @sadd_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 @@ -231,6 +339,42 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; +; GFX7-LABEL: @ssub_sat_v2i32( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX7-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX7-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX7-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX8-LABEL: @ssub_sat_v2i32( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i32> [[INS_1]] +; +; GFX9-LABEL: @ssub_sat_v2i32( +; GFX9-NEXT: bb: +; GFX9-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GFX9-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GFX9-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GFX9-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GFX9-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GFX9-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GFX9-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 +; GFX9-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GFX9-NEXT: ret <2 x i32> [[INS_1]] +; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 %arg0.1 = extractelement <2 x i32> %arg0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll index a37c870253fc92..567760f6955984 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -56,25 +56,12 @@ define half @reduction_half16(<16 x half> %vec16) { ; ; VI-LABEL: @reduction_half16( ; VI-NEXT: entry: -; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 8 -; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9 -; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10 -; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11 -; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12 -; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13 -; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14 -; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15 -; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> +; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> ; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]]) -; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT8]] -; VI-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT9]], [[ELT10]] -; VI-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT11]], [[ELT12]] -; VI-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT13]], [[ELT14]] -; VI-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] -; VI-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX2]], [[OP_RDX3]] -; VI-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX4]], [[OP_RDX5]] -; VI-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX6]], [[ELT15]] -; VI-NEXT: ret half [[OP_RDX7]] +; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> +; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP2]]) +; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]] +; VI-NEXT: ret half [[OP_RDX]] ; entry: %elt0 = extractelement <16 x half> %vec16, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll index 059e4c38b519bd..9608608a180982 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -173,13 +171,11 @@ define <4 x float> @exp_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -212,13 +208,11 @@ define <4 x float> @int_exp_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -230,13 +224,11 @@ define <4 x float> @int_exp_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -269,13 +261,11 @@ define <4 x float> @log_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -287,13 +277,11 @@ define <4 x float> @log_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -326,13 +314,11 @@ define <4 x float> @int_log_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -344,13 +330,11 @@ define <4 x float> @int_log_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -383,13 +367,11 @@ define <4 x float> @sin_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -401,13 +383,11 @@ define <4 x float> @sin_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -440,13 +420,11 @@ define <4 x float> @int_sin_4x(ptr %a) { ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -458,13 +436,11 @@ define <4 x float> @int_sin_4x(ptr %a) { ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll index 6c21cc1cfc5be8..80099bad4693b8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SKX define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-LABEL: @ceil_floor( @@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; ; AVX-LABEL: @ceil_floor( ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) -; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; AVX-NEXT: ret <8 x float> [[R71]] ; ; AVX2-LABEL: @ceil_floor( @@ -92,6 +90,48 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; AVX2-NEXT: ret <8 x float> [[R71]] +; +; KNL-LABEL: @ceil_floor( +; KNL-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 +; KNL-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; KNL-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; KNL-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; KNL-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; KNL-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; KNL-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; KNL-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 +; KNL-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; KNL-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; KNL-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; KNL-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; KNL-NEXT: ret <8 x float> [[R71]] +; +; SKX-LABEL: @ceil_floor( +; SKX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 +; SKX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; SKX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SKX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SKX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SKX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SKX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SKX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 +; SKX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SKX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; SKX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; SKX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SKX-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll index bc5bcee361168a..b8ed0edfa2d692 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SKX define <8 x float> @ceil_floor(<8 x float> %a) { ; SSE-LABEL: @ceil_floor( @@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; ; AVX-LABEL: @ceil_floor( ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) -; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; AVX-NEXT: ret <8 x float> [[R71]] ; ; AVX2-LABEL: @ceil_floor( @@ -92,6 +90,48 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> ; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> ; AVX2-NEXT: ret <8 x float> [[R71]] +; +; KNL-LABEL: @ceil_floor( +; KNL-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 +; KNL-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; KNL-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; KNL-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; KNL-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; KNL-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; KNL-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; KNL-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; KNL-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 +; KNL-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; KNL-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; KNL-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; KNL-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; KNL-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; KNL-NEXT: ret <8 x float> [[R71]] +; +; SKX-LABEL: @ceil_floor( +; SKX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 +; SKX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 +; SKX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SKX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SKX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SKX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SKX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> +; SKX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SKX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 +; SKX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SKX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3 +; SKX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; SKX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> +; SKX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SKX-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll index 4a9f717918a029..3a2acefb012a08 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,SKX ; ; 128-bit vectors @@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; PR50392 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64_partial_swizzle( -; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; SSE-NEXT: ret <4 x double> [[R03]] -; -; SLM-LABEL: @test_v4f64_partial_swizzle( -; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; SLM-NEXT: ret <4 x double> [[R03]] -; -; AVX1-LABEL: @test_v4f64_partial_swizzle( -; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0 -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX1-NEXT: ret <4 x double> [[R031]] -; -; AVX2-LABEL: @test_v4f64_partial_swizzle( -; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0 -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX2-NEXT: ret <4 x double> [[R031]] -; -; AVX512-LABEL: @test_v4f64_partial_swizzle( -; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; AVX512-NEXT: ret <4 x double> [[R03]] +; CHECK-LABEL: @test_v4f64_partial_swizzle( +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 +; CHECK-NEXT: ret <4 x double> [[R03]] ; %a0 = extractelement <4 x double> %a, i64 0 %a1 = extractelement <4 x double> %a, i64 1 @@ -548,3 +502,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} +; AVX2: {{.*}} +; KNL: {{.*}} +; SKX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index 31e3e6aa0a833d..aaaebb53b808a7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,SKX ; ; 128-bit vectors @@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; PR50392 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64_partial_swizzle( -; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> , <4 x i32> -; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; SSE-NEXT: ret <4 x double> [[R03]] -; -; SLM-LABEL: @test_v4f64_partial_swizzle( -; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> , <4 x i32> -; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; SLM-NEXT: ret <4 x double> [[R03]] -; -; AVX1-LABEL: @test_v4f64_partial_swizzle( -; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> , double [[R0]], i64 0 -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX1-NEXT: ret <4 x double> [[R031]] -; -; AVX2-LABEL: @test_v4f64_partial_swizzle( -; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> , double [[R0]], i64 0 -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX2-NEXT: ret <4 x double> [[R031]] -; -; AVX512-LABEL: @test_v4f64_partial_swizzle( -; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> , <4 x i32> -; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; AVX512-NEXT: ret <4 x double> [[R03]] +; CHECK-LABEL: @test_v4f64_partial_swizzle( +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> , <4 x i32> +; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 +; CHECK-NEXT: ret <4 x double> [[R03]] ; %a0 = extractelement <4 x double> %a, i64 0 %a1 = extractelement <4 x double> %a, i64 1 @@ -548,3 +502,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} +; AVX2: {{.*}} +; KNL: {{.*}} +; SKX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll index 53f17083bd4b30..1d6e191c6f97bf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll @@ -4,10 +4,6 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> zeroinitializer, i64 1 -; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une half [[TMP0]], 0xH0000 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x half> zeroinitializer, i64 1 -; CHECK-NEXT: [[TOBOOL3:%.*]] = fcmp une half [[TMP1]], 0xH0000 ; CHECK-NEXT: ret void ; entry: