[AArch64][GlobalISel] Split offsets of consecutive stores to aid STP … #66980

Merged
3 commits merged on Sep 22, 2023
Changes from 2 commits
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -364,6 +364,8 @@ class MachineIRBuilder {
State.Observer = &Observer;
}

GISelChangeObserver *getObserver() { return State.Observer; }

void stopObservingChanges() { State.Observer = nullptr; }

bool isObservingChanges() const { return State.Observer != nullptr; }
202 changes: 201 additions & 1 deletion llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -20,7 +20,9 @@
//===----------------------------------------------------------------------===//

#include "AArch64TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -439,6 +441,19 @@ class AArch64PostLegalizerCombiner : public MachineFunctionPass {
private:
bool IsOptNone;
AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;


struct StoreInfo {
GStore *St;
GPtrAdd *Ptr;
int64_t Offset;
LLT StoredType;
};
bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
CSEMIRBuilder &MIB);

bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
CSEMIRBuilder &MIB);
};
} // end anonymous namespace

@@ -492,7 +507,192 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
F.hasMinSize());
AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
RuleConfig, ST, MDT, LI);
- return Impl.combineMachineInstrs();
+ bool Changed = Impl.combineMachineInstrs();

auto MIB = CSEMIRBuilder(MF);
MIB.setCSEInfo(CSEInfo);
Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
Contributor (inline review comment):
Just wondering, do you need to do this because the combiner isn't powerful enough/is missing some feature to make this a normal rule (what feature?), or just because this is a separate transform that could technically be in another pass (but it's not worth creating a pass just for that)?

Contributor Author (reply):
It's the latter; we're just piggy-backing off the existing combiner pass. There are a few reasons for doing that:

  1. We need this to run after reassociations/ptradd_immed_chain since this is undoing the effect of those.
  2. Since this optimization wants to look at other, non-use/def instructions in the function, doing so with manual C++ lets us make sure we have predictable linear complexity.
  3. We also want a predictable top-down inst visitation order, instead of running to a fixed point.
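
For reference, the ptradd_immed_chain combine mentioned above folds two chained G_PTR_ADDs with constant offsets into a single G_PTR_ADD with the combined offset, roughly like this (register names are illustrative, not taken from the patch):

  %p1:_(p0) = G_PTR_ADD %base:_, %c4096:_(s64)
  %p2:_(p0) = G_PTR_ADD %p1:_, %c32:_(s64)
  -->
  %p2:_(p0) = G_PTR_ADD %base:_, %c4128:_(s64)

The code below selectively undoes that folding for runs of consecutive stores whose combined offsets no longer fit an STP immediate.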

return Changed;
}

bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
if (Stores.size() <= 2)
return false;

// Profitability checks:
int64_t BaseOffset = Stores[0].Offset;
unsigned NumPairsExpected = Stores.size() / 2;
unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
// Size savings will depend on whether we can fold the offset as an
// immediate of an ADD.
auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
if (!TLI.isLegalAddImmediate(BaseOffset))
TotalInstsExpected++;
int SavingsExpected = Stores.size() - TotalInstsExpected;
if (SavingsExpected <= 0)
return false;
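// Illustrative arithmetic: with 6 stores of <2 x s64>, NumPairsExpected is 3,
// TotalInstsExpected is 3 (or 4 if BaseOffset is not a legal ADD immediate),
// and SavingsExpected is 3 (or 2), so we continue past the check above.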

auto &MRI = MIB.getMF().getRegInfo();

// We have a series of consecutive stores. Factor out the common base
// pointer and rewrite the offsets.
Register NewBase = Stores[0].Ptr->getReg(0);
for (auto &SInfo : Stores) {
// Compute a new pointer with the new base ptr and adjusted offset.
MIB.setInstrAndDebugLoc(*SInfo.St);
auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
NewBase, NewOff);
if (MIB.getObserver())
MIB.getObserver()->changingInstr(*SInfo.St);
SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
if (MIB.getObserver())
MIB.getObserver()->changedInstr(*SInfo.St);
}
LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
<< " stores into a base pointer and offsets.\n");
return true;
}

static cl::opt<bool>
EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
cl::init(true), cl::Hidden,
cl::desc("Enable consecutive memop optimization "
"in AArch64PostLegalizerCombiner"));

bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
MachineFunction &MF, CSEMIRBuilder &MIB) {
// This combine needs to run after all reassociations/folds on pointer
// addressing have been done, specifically those that combine two G_PTR_ADDs
// with constant offsets into a single G_PTR_ADD with a combined offset.
// The goal of this optimization is to undo that combine in the case where
// doing so has prevented the formation of pair stores due to illegal
// addressing modes of STP. The reason that we do it here is because
// it's much easier to undo the transformation of a series of consecutive
// mem ops, than it is to detect when doing it would be a bad idea looking
// at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
//
// An example:
// G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
// %off1:_(s64) = G_CONSTANT i64 4128
// %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
// G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
// %off2:_(s64) = G_CONSTANT i64 4144
// %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
// G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
// %off3:_(s64) = G_CONSTANT i64 4160
// %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
// G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
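//
// After the split (a sketch; register names and the folding of the first
// store's zero offset are illustrative, not verbatim pass output), the later
// stores address off %p1 with small offsets that an STP immediate can encode:
//   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
//   %off1:_(s64) = G_CONSTANT i64 4128
//   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
//   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
//   %n16:_(s64) = G_CONSTANT i64 16
//   %q2:_(p0) = G_PTR_ADD %p1:_, %n16:_(s64)
//   G_STORE %11:_(<2 x s64>), %q2:_(p0) :: (store (<2 x s64>), align 1)
//   %n32:_(s64) = G_CONSTANT i64 32
//   %q3:_(p0) = G_PTR_ADD %p1:_, %n32:_(s64)
//   G_STORE %11:_(<2 x s64>), %q3:_(p0) :: (store (<2 x s64>), align 1)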
bool Changed = false;
auto &MRI = MF.getRegInfo();

if (!EnableConsecutiveMemOpOpt)
return Changed;

SmallVector<StoreInfo, 8> Stores;
// If we see a load, then we keep track of any values defined by it.
// In the following example, STP formation will fail anyway because
// the latter store is using a load result that appears after the
// prior store. In this situation if we factor out the offset then
// we increase code size for no benefit.
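// (An illustrative instance of that pattern; names are invented:)
//   %v1:_(s64) = G_LOAD %lp1:_(p0) :: (load (s64))
//   G_STORE %v1:_(s64), %p1:_(p0) :: (store (s64))
//   %v2:_(s64) = G_LOAD %lp2:_(p0) :: (load (s64))
//   G_STORE %v2:_(s64), %p2:_(p0) :: (store (s64))  ; %p2 == %p1 + 8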
SmallVector<Register> LoadValsSinceLastStore;

auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
// Check if this store is consecutive to the last one.
if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
(Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
New.Offset) ||
Last.StoredType != New.StoredType)
return false;

// Check if this store is using a load result that appears after the
// last store. If so, bail out.
if (llvm::any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
return New.St->getValueReg() == LoadVal;
}))
return false;

// Check if the current offset would be too large for STP.
// If not, then STP formation should be able to handle it, so we don't
// need to do anything.
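// STP encodes a 7-bit signed immediate scaled by the access size, so the
// largest positive offset it can address is 63 * the per-register store size
// in bytes, giving the limits below.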
int64_t MaxLegalOffset;
switch (New.StoredType.getSizeInBits()) {
case 32:
MaxLegalOffset = 252;
break;
case 64:
MaxLegalOffset = 504;
break;
case 128:
MaxLegalOffset = 1008;
break;
default:
llvm_unreachable("Unexpected stored type size");
}
if (New.Offset < MaxLegalOffset)
return false;

// If factoring it out still wouldn't help then don't bother.
return New.Offset - Stores[0].Offset <= MaxLegalOffset;
};

for (auto &MBB : MF) {
// We're looking inside a single BB at a time since the memset pattern
// should only be in a single block.

Stores.clear();
LoadValsSinceLastStore.clear();

auto resetState = [&]() {
Stores.clear();
LoadValsSinceLastStore.clear();
};

for (auto &MI : MBB) {
if (auto *St = dyn_cast<GStore>(&MI)) {
Register PtrBaseReg;
APInt Offset;
LLT StoredValTy = MRI.getType(St->getValueReg());
unsigned ValSize = StoredValTy.getSizeInBits();
if (ValSize != St->getMMO().getSizeInBits())
continue; // Don't handle truncating stores.
if (ValSize < 32)
continue; // Can only STP 32b or larger.

if (mi_match(
St->getPointerReg(), MRI,
m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(St->getPointerReg()));
StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};

if (Stores.empty()) {
Stores.push_back(New);
continue;
}

// Check if this store is consecutive to the last one.
auto &Last = Stores.back();
if (storeIsValid(Last, New)) {
Stores.push_back(New);
LoadValsSinceLastStore.clear(); // Reset the load value tracking.
} else {
// The store isn't valid to consider for the prior sequence,
// so try to optimize what we have so far and start a new sequence.
Changed |= tryOptimizeConsecStores(Stores, MIB);
resetState();
Stores.push_back(New);
}
}
} else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
LoadValsSinceLastStore.push_back(Ld->getDstReg());
}
}
Changed |= tryOptimizeConsecStores(Stores, MIB);
resetState();
}

return Changed;
}

char AArch64PostLegalizerCombiner::ID = 0;