[SLP]Improve/fix subvectors in gather/buildvector nodes handling #104144
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-systemz
Author: Alexey Bataev (alexey-bataev)
Changes:
SLP vectorizer has an estimation for gather/buildvector nodes, which contain some scalar loads. SLP vectorizer performs a pretty similar (but large in SLOCs) estimation, which is not always correct. Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. Improves overall vectorization quality and simplifies analysis/estimations.
Patch is 183.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/104144.diff
27 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 81841a8f692870..b32017adcf8ca8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3104,6 +3104,10 @@ class BoUpSLP {
/// The index of this treeEntry in VectorizableTree.
int Idx = -1;
+ /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
+ /// other nodes as a series of insertvector instructions.
+ SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
+
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
@@ -3404,7 +3408,9 @@ class BoUpSLP {
if (!isConstant(V)) {
auto *I = dyn_cast<CastInst>(V);
AllConstsOrCasts &= I && I->getType()->isIntegerTy();
- ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+ if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
+ !UserTreeIdx.UserTE->isGather())
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
if (AllConstsOrCasts)
CastMaxMinBWSizes =
@@ -8361,8 +8367,49 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
- TreeEntry &E = *TE;
+ // The tree may grow here, so iterate over the nodes built before.
+ for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
+ TreeEntry &E = *VectorizableTree[Idx];
+ if (E.isGather()) {
+ ArrayRef<Value *> VL = E.Scalars;
+ const unsigned Sz = getVectorElementSize(VL.front());
+ unsigned MinVF = getMinVF(2 * Sz);
+ if (VL.size() <= 2 ||
+ (E.getOpcode() &&
+ (E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
+ continue;
+ // Try to find vectorizable sequences and transform them into a series of
+ // insertvector instructions.
+ unsigned StartIdx = 0;
+ unsigned End = VL.size();
+ for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ InstructionsState S = getSameOpcode(Slice, *TLI);
+ if (!S.getOpcode() || S.isAltShuffle() ||
+ (S.getOpcode() != Instruction::Load &&
+ any_of(Slice, [&](Value *V) {
+ return !areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ })))
+ continue;
+ if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) {
+ unsigned PrevSize = VectorizableTree.size();
+ buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
+ if (PrevSize + 1 == VectorizableTree.size() &&
+ VectorizableTree[PrevSize]->isGather()) {
+ VectorizableTree.pop_back();
+ continue;
+ }
+ E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
+ if (StartIdx == Cnt)
+ StartIdx = Cnt + VF;
+ if (End == Cnt + VF)
+ End = Cnt;
+ }
+ }
+ }
+ }
switch (E.getOpcode()) {
case Instruction::Load: {
// No need to reorder masked gather loads, just reorder the scalar
@@ -8485,175 +8532,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
auto *VecTy = getWidenedType(ScalarTy, VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL);
- // Improve gather cost for gather of loads, if we can group some of the
- // loads into vector loads.
- InstructionsState S = getSameOpcode(VL, *R.TLI);
- const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
- unsigned MinVF = R.getMinVF(2 * Sz);
- if (VL.size() > 2 &&
- ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
- (InVectors.empty() &&
- any_of(seq<unsigned>(0, VL.size() / MinVF),
- [&](unsigned Idx) {
- ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
- InstructionsState S = getSameOpcode(SubVL, *R.TLI);
- return S.getOpcode() == Instruction::Load &&
- !S.isAltShuffle();
- }))) &&
- !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
- !isSplat(Gathers)) {
- InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
- SetVector<Value *> VectorizedLoads;
- SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
- SmallVector<unsigned> ScatterVectorized;
- unsigned StartIdx = 0;
- unsigned VF = VL.size() / 2;
- for (; VF >= MinVF; VF /= 2) {
- for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
- Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
- InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
- if (SliceS.getOpcode() != Instruction::Load ||
- SliceS.isAltShuffle())
- continue;
- }
- if (!VectorizedLoads.count(Slice.front()) &&
- !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
- LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
- CurrentOrder, PointerOps);
- switch (LS) {
- case LoadsState::Vectorize:
- case LoadsState::ScatterVectorize:
- case LoadsState::StridedVectorize:
- // Mark the vectorized loads so that we don't vectorize them
- // again.
- // TODO: better handling of loads with reorders.
- if (((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
- CurrentOrder.empty()) ||
- (LS == LoadsState::StridedVectorize &&
- isReverseOrder(CurrentOrder)))
- VectorizedStarts.emplace_back(Cnt, LS);
- else
- ScatterVectorized.push_back(Cnt);
- VectorizedLoads.insert(Slice.begin(), Slice.end());
- // If we vectorized initial block, no need to try to vectorize
- // it again.
- if (Cnt == StartIdx)
- StartIdx += VF;
- break;
- case LoadsState::Gather:
- break;
- }
- }
- }
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= VL.size())
- break;
- // Found vectorizable parts - exit.
- if (!VectorizedLoads.empty())
- break;
- }
- if (!VectorizedLoads.empty()) {
- unsigned NumParts = TTI.getNumberOfParts(VecTy);
- bool NeedInsertSubvectorAnalysis =
- !NumParts || (VL.size() / VF) > NumParts;
- // Get the cost for gathered loads.
- for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
- if (VectorizedLoads.contains(VL[I]))
- continue;
- GatherCost +=
- getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
- }
- // Exclude potentially vectorized loads from list of gathered
- // scalars.
- Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
- // The cost for vectorized loads.
- InstructionCost ScalarsCost = 0;
- for (Value *V : VectorizedLoads) {
- auto *LI = cast<LoadInst>(V);
- ScalarsCost +=
- TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
- LI->getAlign(), LI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), LI);
- }
- auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
- for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
- auto *LI = cast<LoadInst>(VL[P.first]);
- Align Alignment = LI->getAlign();
- GatherCost +=
- P.second == LoadsState::Vectorize
- ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI)
- : TTI.getStridedMemoryOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
- // Add external uses costs.
- for (auto [Idx, V] : enumerate(VL.slice(
- P.first, std::min<unsigned>(VL.size() - P.first, VF))))
- if (!R.areAllUsersVectorized(cast<Instruction>(V)))
- GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
- LoadTy, CostKind, Idx);
- // Estimate GEP cost.
- SmallVector<Value *> PointerOps(VF);
- for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
- PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
- Instruction::Load, CostKind, LI->getType(), LoadTy);
- GatherCost += VectorGEPCost - ScalarGEPCost;
- }
- for (unsigned P : ScatterVectorized) {
- auto *LI0 = cast<LoadInst>(VL[P]);
- ArrayRef<Value *> Slice = VL.slice(P, VF);
- Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
- GatherCost += TTI.getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI0->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
- // Estimate GEP cost.
- SmallVector<Value *> PointerOps(VF);
- for (auto [I, V] : enumerate(Slice))
- PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
- OrdersType Order;
- if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
- Order)) {
- // TODO: improve checks if GEPs can be vectorized.
- Value *Ptr0 = PointerOps.front();
- Type *ScalarTy = Ptr0->getType();
- auto *VecTy = getWidenedType(ScalarTy, VF);
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
- CostKind, ScalarTy, VecTy);
- GatherCost += VectorGEPCost - ScalarGEPCost;
- if (!Order.empty()) {
- SmallVector<int> Mask;
- inversePermutation(Order, Mask);
- GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
- VecTy, Mask, CostKind);
- }
- } else {
- GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
- PointerOps.front()->getType());
- }
- }
- if (NeedInsertSubvectorAnalysis) {
- // Add the cost for the subvectors insert.
- SmallVector<int> ShuffleMask(VL.size());
- for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
- for (unsigned Idx : seq<unsigned>(0, E))
- ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
- GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
- ShuffleMask, CostKind, I, LoadTy);
- }
- }
- GatherCost -= ScalarsCost;
- }
- GatherCost = std::min(BaseCost, GatherCost);
- } else if (!Root && isSplat(VL)) {
+ if (!Root && isSplat(VL)) {
// Found the broadcasting of the single scalar, calculate the cost as
// the broadcast.
const auto *It = find_if_not(VL, IsaPred<UndefValue>);
@@ -9401,7 +9280,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
InstructionCost
- finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ finalize(ArrayRef<int> ExtMask,
+ ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
+ unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
if (Action) {
@@ -9419,6 +9300,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
Action(V, CommonMask);
InVectors.front() = V;
}
+ if (!SubVectors.empty()) {
+ const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
+ if (InVectors.size() == 2)
+ Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+ else
+ Cost += createShuffle(Vec, nullptr, CommonMask);
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ for (const auto [E, Idx] : SubVectors) {
+ Cost += ::getShuffleCost(
+ TTI, TTI::SK_InsertSubvector,
+ FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt,
+ CostKind, Idx,
+ FixedVectorType::get(ScalarTy, E->getVectorFactor()));
+ if (!CommonMask.empty()) {
+ std::iota(std::next(CommonMask.begin(), Idx),
+ std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
+ Idx);
+ }
+ }
+ }
+
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
if (CommonMask.empty()) {
assert(InVectors.size() == 1 && "Expected only one vector with no mask");
@@ -10942,8 +10846,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (CanBeUsedAsScalar) {
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
bool KeepScalar = ScalarCost <= ExtraCost;
- if (KeepScalar && ScalarCost != TTI::TCC_Free &&
- ExtraCost - ScalarCost <= TTI::TCC_Basic) {
+ // Try to keep original scalar if the user is the phi node from the same
+ // block as the root phis, currently vectorized. It allows to keep
+ // better ordering info of PHIs, being vectorized currently.
+ bool IsProfitablePHIUser =
+ (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+ VectorizableTree.front()->Scalars.size() > 2)) &&
+ VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ !Inst->hasNUsesOrMore(UsesLimit) &&
+ none_of(Inst->users(),
+ [&](User *U) {
+ auto *PHIUser = dyn_cast<PHINode>(U);
+ return (!PHIUser ||
+ PHIUser->getParent() !=
+ cast<Instruction>(
+ VectorizableTree.front()->getMainOp())
+ ->getParent()) &&
+ !getTreeEntry(U);
+ }) &&
+ count_if(Entry->Scalars, [&](Value *V) {
+ return ValueToExtUses->contains(V);
+ }) <= 2;
+ if (IsProfitablePHIUser) {
+ KeepScalar = true;
+ } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
+ ExtraCost - ScalarCost <= TTI::TCC_Basic) {
unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
return ValueToExtUses->contains(V);
});
@@ -12490,7 +12417,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// \param Action the action (if any) to be performed before final applying of
/// the \p ExtMask mask.
Value *
- finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ finalize(ArrayRef<int> ExtMask,
+ ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
+ unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
SmallVector<int> NewExtMask(ExtMask);
@@ -12524,6 +12453,29 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
Action(Vec, CommonMask);
InVectors.front() = Vec;
}
+ if (!SubVectors.empty()) {
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != PoisonMaskElem)
+ CommonMask[Idx] = Idx;
+ for (const auto [E, Idx] : SubVectors) {
+ Vec = Builder.CreateInsertVector(
+ Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx));
+ if (!CommonMask.empty()) {
+ std::iota(std::next(CommonMask.begin(), Idx),
+ std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
+ Idx);
+ }
+ }
+ InVectors.front() = Vec;
+ }
+
if (!ExtMask.empty()) {
if (CommonMask.empty()) {
CommonMask.assign(ExtMask.begin(), ExtMask.end());
@@ -12602,7 +12554,10 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
: ScalarTy,
Builder, *this);
ShuffleBuilder.add(V, Mask);
- return ShuffleBuilder.finalize(std::nullopt);
+ SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors;
+ for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
+ SubVectors.emplace_back(VectorizableTree[EIdx].get(), Idx);
+ return ShuffleBuilder.finalize(std::nullopt, SubVectors);
};
Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF * getNumElements(VL[0]->getType()) !=
@@ -12685,6 +12640,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
E->ReuseShuffleIndices.end());
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+ // Clear values, to be replaced by insertvector instructions.
+ for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
+ for_each(MutableArrayRef(GatheredScalars)
+ .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
+ [&](Value *&V) { V = PoisonValue::get(V->getType()); });
+ SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors;
+ for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
+ SubVectors.emplace_back(VectorizableTree[EIdx].get(), Idx);
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
SmallVector<int> ReorderMask;
@@ -12822,7 +12785,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
ShuffleBuilder.add(*FrontTE, Mask);
- Res = ShuffleBuilder.finalize(E->getCommonMask());
+ Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
return Res;
}
if (!Resized) {
@@ -13079,10 +13042,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
(IsSingleShuffle && ((IsIdentityShuffle &&
IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
}))
- Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
else
Res = ShuffleBuilder.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
+ E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstant...
[truncated]
|
Created using spr 1.3.5
Ping! |
1 similar comment
Ping! |
Created using spr 1.3.5
LGTM
This change appears to have broken the UBSan runtime build with the following error:
Would it be possible to take a look and revert the change if the issue cannot be addressed quickly? |
There is something wrong with the instcombiner, will check it tomorrow |
I also ran into assert failures due to this, on both aarch64 and i686 in the wild, and a reduced snippet also triggers on x86_64. Reproducible with the following reduced snippet:
union a {
  short b;
};
int c, d, e;
int *f;
unsigned h(int i, int j) {
if (i & ~j)
return i & 1;
return i;
}
void k(short *i) {
int l, m, n;
for (;;) {
int o;
e += f[d];
n += o = c;
m = h(e, 6);
short g = m << 8;
((union a *)&i[0])->b = g;
l = h(o, 6);
g = l << 8;
((union a *)&i[1])->b = g;
((union a *)&i[2])->b = g = n << 8;
((union a *)&i[3])->b = g;
}
}
Compiled with:
Also reproduces with |
Second reproducer:
int a(char *b, int c) {
int d, e, f = d = 0;
for (; d < 3; d++) {
e = 0;
for (; e < 8; e++)
f += -b[e] - b[e + c] >> 31;
b += c;
}
return f;
}
This reproduces with i686 and x86_64, but not on aarch64. |
In this one, SLP is producing the illegal instruction: %51 = call <8 x i16> @llvm.vector.insert.v8i16.v4i32(<8 x i16> %50, <4 x i32> %41, i64 4) |
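For context (not part of the original report): llvm.vector.insert requires the inserted subvector to have the same element type as the destination vector, so inserting a <4 x i32> subvector into an <8 x i16> vector is invalid IR. A minimal sketch of the legal shape, with hypothetical values, looks like this:
; Sketch only: both vectors use the i16 element type, and the insertion index is a
; multiple of the subvector length.
define <8 x i16> @legal_insert(<8 x i16> %dst, <4 x i16> %sub) {
  %res = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> %dst, <4 x i16> %sub, i64 4)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16>, <4 x i16>, i64)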
SLP vectorizer has an estimation for gather/buildvector nodes, which contain some scalar loads. SLP vectorizer performs a pretty similar (but large in SLOCs) estimation, which is not always correct. Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. Improves overall vectorization quality and simplifies analysis/estimations. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #104144
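As a rough illustration of the kind of codegen this enables (a hand-written sketch with hypothetical values, not output from the patch): for a gather/buildvector of eight i32 scalars whose first four elements happen to be consecutive loads, the clustered slice can be vectorized as a single <4 x i32> load and combined back into the wider buildvector with llvm.vector.insert, instead of being costed and emitted element by element.
; Sketch only: %p and %a..%d are hypothetical inputs; the first four lanes come from
; one vector load that is inserted as a subvector, the remaining lanes stay as
; ordinary insertelement instructions.
define <8 x i32> @gather_sketch(ptr %p, i32 %a, i32 %b, i32 %c, i32 %d) {
  %slice = load <4 x i32>, ptr %p, align 4
  %v4 = insertelement <8 x i32> poison, i32 %a, i64 4
  %v5 = insertelement <8 x i32> %v4, i32 %b, i64 5
  %v6 = insertelement <8 x i32> %v5, i32 %c, i64 6
  %v7 = insertelement <8 x i32> %v6, i32 %d, i64 7
  %res = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %v7, <4 x i32> %slice, i64 0)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)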
…ing" (llvm#105780) with "[Vectorize] Fix warnings" It introduced compiler crashes, see llvm#104144. This reverts commit 69332bb and 351f4a5.
@@ -13122,6 +13069,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}

Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
+ for (const auto [EIdx, _] : E->CombinedEntriesWithIndices)
+ (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false);
I noticed issue #105904. This patch breaks the precondition of the Gather node, which must have only a single user in vectorizeOperand
We are hitting an assert error with the last reland: assert.h assertion failed at llvm/lib/IR/Instructions.cpp:1854 in bool isSingleSourceMaskImpl(ArrayRef, int): I >= 0 && I < (NumOpElts * 2) && "Out-of-bounds shuffle mask element" in https://github.com/openxla/xla/blob/main/xla/service/gpu/fusions/triton/triton_fusion_emitter_parametrized_test.cc#L65 I'll see if we can create a simpler reproducer but we may need to revert in the meanwhile. |
Need a reproducer |
Unfortunately it's in a very large generated block and we have not been able to isolate it to an external repro yet. One pointer: dropping lines 12657-12661 from SLPVectorizer.cpp fixes our test. Could you take a look at whether the ranges and the values set there look correct? Update: corrected line number. |
Could you provide full stack trace? Also, large code is not a problem, attach it, I will reduce it myself |
Stack trace:
Looks like invalid mask with values 124 and 126 here:
(The test in question is this: https://github.com/openxla/xla/blob/main/xla/service/gpu/fusions/triton/triton_fusion_emitter_parametrized_test.cc#L65 , if you could run the test, you'll probably be able to collect more useful information. This is significantly out of my normal problem area). I got ~800 Mb of IR generated and I can't validate if I can share all of it. |
Unfortunately, it does not help, need a reproducer |
Reduced as #106655 |
Thanks, fixed |
This CL seems to have changed behavior: https://gist.github.com/fmayer/6030063dfacd2abfb0898a5855949034 This IR here generates a program that returns 0 before, and 1 after this change. It seems related to inline asm instructions. |
Must be fixed in c13bf6d |
I can confirm this fixed it. |
Should not return the original phi vector instruction, need to return actual vectorized value as a result.