-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Add costs for ST3 and ST4 instructions, modelled as store(shuffle). #87934
Conversation
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-x86 Author: David Green (davemgreen) Changes: This tries to add some costs for the shuffle in an ST3/ST4 instruction, which is represented in LLVM IR as store(interleaving shuffle). In order to detect the store, it needs to add a CxtI context instruction to check the users of the shuffle. ST3 and ST4 are added; ST2 should be a zip1 shuffle, which will be added in another patch. It should help fix some of the regressions from #87510. Patch is 49.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87934.diff 22 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fa9392b86c15b9..58c69ac939763a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1291,12 +1291,11 @@ class TargetTransformInfo {
/// passed through \p Args, which helps improve the cost estimation in some
/// cases, like in broadcast loads.
/// NOTE: For subvector extractions Tp represents the source type.
- InstructionCost
- getShuffleCost(ShuffleKind Kind, VectorType *Tp,
- ArrayRef<int> Mask = std::nullopt,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
- int Index = 0, VectorType *SubTp = nullptr,
- ArrayRef<const Value *> Args = std::nullopt) const;
+ InstructionCost getShuffleCost(
+ ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
+ VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr) const;
/// Represents a hint about the context in which a cast is used.
///
@@ -2008,11 +2007,10 @@ class TargetTransformInfo::Concept {
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0;
- virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
- ArrayRef<int> Mask,
- TTI::TargetCostKind CostKind,
- int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args) = 0;
+ virtual InstructionCost
+ getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+ TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+ ArrayRef<const Value *> Args, const Instruction *CxtI) = 0;
virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src, CastContextHint CCH,
TTI::TargetCostKind CostKind,
@@ -2647,8 +2645,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
- ArrayRef<const Value *> Args) override {
- return Impl.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) override {
+ return Impl.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
+ CxtI);
}
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
CastContextHint CCH,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 63c2ef8912b29c..5f17d511d4b835 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -579,10 +579,12 @@ class TargetTransformInfoImplBase {
return InstructionCost::getInvalid();
}
- InstructionCost
- getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
- TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args = std::nullopt) const {
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty,
+ ArrayRef<int> Mask,
+ TTI::TargetCostKind CostKind, int Index,
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr) const {
return 1;
}
@@ -1341,13 +1343,13 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
if (Shuffle->isExtractSubvectorMask(SubIndex))
return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
Mask, CostKind, SubIndex, VecTy,
- Operands);
+ Operands, Shuffle);
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
return TargetTTI->getShuffleCost(
TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
- Operands);
+ Operands, Shuffle);
int ReplicationFactor, VF;
if (Shuffle->isReplicationMask(ReplicationFactor, VF)) {
@@ -1374,7 +1376,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
return TargetTTI->getShuffleCost(
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
- AdjustMask, CostKind, 0, nullptr);
+ AdjustMask, CostKind, 0, nullptr, Shuffle);
}
// Narrowing shuffle - perform shuffle at original wider width and
@@ -1383,13 +1385,13 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
- VecSrcTy, AdjustMask, CostKind, 0, nullptr);
+ VecSrcTy, AdjustMask, CostKind, 0, nullptr, Shuffle);
SmallVector<int, 16> ExtractMask(Mask.size());
std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
- return ShuffleCost + TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector,
- VecSrcTy, ExtractMask,
- CostKind, 0, VecTy);
+ return ShuffleCost + TargetTTI->getShuffleCost(
+ TTI::SK_ExtractSubvector, VecSrcTy,
+ ExtractMask, CostKind, 0, VecTy, Shuffle);
}
if (Shuffle->isIdentity())
@@ -1397,35 +1399,39 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
if (Shuffle->isReverse())
return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, Mask, CostKind,
- 0, nullptr, Operands);
+ 0, nullptr, Operands, Shuffle);
if (Shuffle->isSelect())
return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, Mask, CostKind,
- 0, nullptr, Operands);
+ 0, nullptr, Operands, Shuffle);
if (Shuffle->isTranspose())
return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, Mask,
- CostKind, 0, nullptr, Operands);
+ CostKind, 0, nullptr, Operands,
+ Shuffle);
if (Shuffle->isZeroEltSplat())
return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, Mask,
- CostKind, 0, nullptr, Operands);
+ CostKind, 0, nullptr, Operands,
+ Shuffle);
if (Shuffle->isSingleSource())
return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask,
- CostKind, 0, nullptr, Operands);
+ CostKind, 0, nullptr, Operands,
+ Shuffle);
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
return TargetTTI->getShuffleCost(
TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
- FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands);
+ FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands,
+ Shuffle);
if (Shuffle->isSplice(SubIndex))
return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, Mask, CostKind,
- SubIndex, nullptr, Operands);
+ SubIndex, nullptr, Operands, Shuffle);
return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask,
- CostKind, 0, nullptr, Operands);
+ CostKind, 0, nullptr, Operands, Shuffle);
}
case Instruction::ExtractElement: {
auto *EEI = dyn_cast<ExtractElementInst>(U);
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 42d8f74fd427fb..f0bc1b7e205bee 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1020,7 +1020,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
- ArrayRef<const Value *> Args = std::nullopt) {
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr) {
switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
case TTI::SK_Broadcast:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5f933b4587843c..33c899fe889990 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -916,9 +916,9 @@ InstructionCost TargetTransformInfo::getAltInstrCost(
InstructionCost TargetTransformInfo::getShuffleCost(
ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args) const {
- InstructionCost Cost =
- TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind, Index, SubTp, Args);
+ ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+ InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind,
+ Index, SubTp, Args, CxtI);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ee7137b92445bb..0fe1847ecf945b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3815,18 +3815,30 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
return LegalizationCost * LT.first;
}
-InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
- VectorType *Tp,
- ArrayRef<int> Mask,
- TTI::TargetCostKind CostKind,
- int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args) {
+InstructionCost AArch64TTIImpl::getShuffleCost(
+ TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+ TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+ ArrayRef<const Value *> Args, const Instruction *CxtI) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+
// If we have a Mask, and the LT is being legalized somehow, split the Mask
// into smaller vectors and sum the cost of each shuffle.
if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
+
+ // Check for ST3/ST4 instructions, which are represented in llvm IR as
+ // store(interleaving-shuffle). The shuffle cost could potentially be free,
+ // but we model it with a cost of LT.first so that ST3/ST4 have a higher
+ // cost than just the store.
+ if ((ShuffleVectorInst::isInterleaveMask(
+ Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
+ ShuffleVectorInst::isInterleaveMask(
+ Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)) &&
+ !ShuffleVectorInst::isZeroEltSplatMask(
+ Mask, Tp->getElementCount().getKnownMinValue()))
+ return LT.first;
+
unsigned TpNumElts = Mask.size();
unsigned LTNumElts = LT.second.getVectorNumElements();
unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
@@ -3874,7 +3886,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (NumSources <= 2)
Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
: TTI::SK_PermuteTwoSrc,
- NTp, NMask, CostKind, 0, nullptr, Args);
+ NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
else if (any_of(enumerate(NMask), [&](const auto &ME) {
return ME.value() % LTNumElts == ME.index();
}))
@@ -4055,7 +4067,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// Restore optimal kind.
if (IsExtractSubvector)
Kind = TTI::SK_ExtractSubvector;
- return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
+ CxtI);
}
static bool containsDecreasingPointers(Loop *TheLoop,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e1..dba384481f6a34 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -393,7 +393,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
- ArrayRef<const Value *> Args = std::nullopt);
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr);
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 31077dbc0b2cc4..84320d296a037b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1127,7 +1127,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args) {
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
// Treat extractsubvector as single op permutation.
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index cd8e9fd10bbf21..0dab3a98277943 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -234,7 +234,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
- ArrayRef<const Value *> Args = std::nullopt);
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 3be894ad3bef2c..ee87f7f0e555ef 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1212,7 +1212,8 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
int Index, VectorType *SubTp,
- ArrayRef<const Value *> Args) {
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
// Treat extractsubvector as single op permutation.
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index bb4b321b530091..04b32194f806f6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -220,7 +220,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
- ArrayRef<const Value *> Args = std::nullopt);
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr);
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 458b8717256f24..f47fcff5d60259 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -230,7 +230,8 @@ InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
int Index, Type *SubTp,
- ArrayRef<const Value *> Args) {
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
return 1;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index fdb34f308e641e..9689f2f5bb865c 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -122,7 +122,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
Type *SubTp,
- ArrayRef<const Value *> Args = std::nullopt);
+ ArrayRef<const Value *> Args = std::nullopt,
+ const Instruction *CxtI = nullptr);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type ...
[truncated]
Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)) &&
!ShuffleVectorInst::isZeroEltSplatMask(
Mask, Tp->getElementCount().getKnownMinValue()))
return LT.first;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where do you use CxtI to detect a store?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, yeah! I rewrote the code when moving it into this outer if statement, and must have missed that part of the original.
…huffle). This tries to add some costs for the shuffle in an ST3/ST4 instruction, which is represented in LLVM IR as store(interleaving shuffle). In order to detect the store, it needs to add a CxtI context instruction to check the users of the shuffle. ST3 and ST4 are added; ST2 should be a zip1 shuffle, which will be added in another patch. It should help fix some of the regressions from llvm#87510.
2faddc9
to
fcb91ed
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - my only remaining thought is whether we need the Args argument now we have CxtI?
I guess that you could know the Args that the shuffle will use without knowing/having the exact shuffle instruction already, so it might still be useful to keep.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK - we can revisit this if necessary
Similar to llvm#87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which are represented in LLVM as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs as instcombine will perform certain transforms without considering the cost.
Similar to llvm#87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which are represented in LLVM as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs as instcombine will perform certain transforms without considering the cost.
Similar to llvm#87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which are represented in LLVM as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs as instcombine will perform certain transforms without considering the cost.
Similar to #87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which are represented in LLVM as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs as instcombine will perform certain transforms without considering the cost.
Similar to llvm#87934, this adds costs to the shuffles in a canonical LD3/LD4 pattern, which are represented in LLVM as deinterleaving-shuffle(load). This likely has less effect at the moment than the ST3/ST4 costs as instcombine will perform certain transforms without considering the cost.
This tries to add some costs for the shuffle in an ST3/ST4 instruction, which is represented in LLVM IR as store(interleaving shuffle). In order to detect the store, it needs to add a CxtI context instruction to check the users of the shuffle. ST3 and ST4 are added; ST2 should be a zip1 shuffle, which will be added in another patch.
It should help fix some of the regressions from #87510.