Skip to content

Commit

Permalink
[LAA] Support backward dependences with non-constant distance. (#91525)
Browse files Browse the repository at this point in the history
Following up to 933f492, also update the code reasoning about
backwards dependences to support non-constant distances.

Update the code to use the signed minimum distance instead of a constant
distance.

This means we checked the lower bound of the dependence distance and the
distance may be larger at runtime (and safe for vectorization). Whether
to classify it as Unknown or Backwards depends on the vector width and
LAA was updated to take TTI to get the maximum vector register width.

If the minimum dependence distance is larger than the max vector width,
we consider it as backwards-vectorizable. Otherwise we classify them as
Unknown, so we re-try with runtime checks.

PR: #91525
  • Loading branch information
fhahn authored May 10, 2024
1 parent 2e8d815 commit 28767af
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 169 deletions.
23 changes: 17 additions & 6 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,10 @@ class MemoryDepChecker {
const SmallVectorImpl<Instruction *> &Instrs) const;
};

MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L)
: PSE(PSE), InnermostLoop(L) {}
/// Create a dependence checker that uses \p PSE and treats \p L as the
/// innermost loop. \p MaxTargetVectorWidthInBits is stored unchanged in the
/// member of the same name.
MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
unsigned MaxTargetVectorWidthInBits)
: PSE(PSE), InnermostLoop(L),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}

/// Register the location (instructions are given increasing numbers)
/// of a write access.
Expand Down Expand Up @@ -314,6 +316,12 @@ class MemoryDepChecker {
/// RecordDependences is true.
SmallVector<Dependence, 8> Dependences;

/// The maximum width of a target's vector registers multiplied by 2 to also
/// roughly account for additional interleaving. Is used to decide if a
/// backwards dependence with non-constant distance should be classified as
/// backwards-vectorizable or unknown (triggering a runtime check).
unsigned MaxTargetVectorWidthInBits = 0;

/// Check whether there is a plausible dependence between the two
/// accesses.
///
Expand Down Expand Up @@ -575,8 +583,9 @@ class RuntimePointerChecking {
/// PSE must be emitted in order for the results of this analysis to be valid.
class LoopAccessInfo {
public:
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
AAResults *AA, DominatorTree *DT, LoopInfo *LI);
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT,
LoopInfo *LI);

/// Return true if we can analyze the memory accesses in the loop and there are
/// no memory dependence cycles. Note that for dependences between loads &
Expand Down Expand Up @@ -799,12 +808,14 @@ class LoopAccessInfoManager {
AAResults &AA;
DominatorTree &DT;
LoopInfo &LI;
TargetTransformInfo *TTI;
const TargetLibraryInfo *TLI = nullptr;

public:
LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
LoopInfo &LI, const TargetLibraryInfo *TLI)
: SE(SE), AA(AA), DT(DT), LI(LI), TLI(TLI) {}
LoopInfo &LI, TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI)
: SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {}

const LoopAccessInfo &getInfo(Loop &L);

Expand Down
93 changes: 67 additions & 26 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
Expand Down Expand Up @@ -2122,32 +2123,34 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
return Dependence::Forward;
}

if (!C) {
// TODO: FoundNonConstantDistanceDependence is used as a necessary condition
// to consider retrying with runtime checks. Historically, we did not set it
// when strides were different but there is no inherent reason to.
int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
// Below we only handle strictly positive distances.
if (MinDistance <= 0) {
FoundNonConstantDistanceDependence |= CommonStride.has_value();
LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
return Dependence::Unknown;
}

if (!SE.isKnownPositive(Dist))
return Dependence::Unknown;
if (!isa<SCEVConstant>(Dist)) {
// Previously this case would be treated as Unknown, possibly setting
// FoundNonConstantDistanceDependence to force re-trying with runtime
// checks. Until the TODO below is addressed, set it here to preserve
// original behavior w.r.t. re-trying with runtime checks.
// TODO: FoundNonConstantDistanceDependence is used as a necessary
// condition to consider retrying with runtime checks. Historically, we
// did not set it when strides were different but there is no inherent
// reason to.
FoundNonConstantDistanceDependence |= CommonStride.has_value();
}

if (!HasSameSize) {
LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
"different type sizes\n");
return Dependence::Unknown;
}

// The logic below currently only supports StrideA == StrideB, i.e. there's a
// common stride.
if (!CommonStride)
return Dependence::Unknown;

const APInt &Val = C->getAPInt();
int64_t Distance = Val.getSExtValue();

// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
Expand All @@ -2172,8 +2175,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// | A[0] | | A[2] | | A[4] | | A[6] | |
// | B[0] | | B[2] | | B[4] |
//
// Distance needs for vectorizing iterations except the last iteration:
// 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
// MinDistance needs for vectorizing iterations except the last iteration:
// 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
// So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
//
// If MinNumIter is 2, it is vectorizable as the minimum distance needed is
Expand All @@ -2182,11 +2185,22 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.

// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
uint64_t MinDistanceNeeded =
TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
<< Distance << '\n');
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!isa<SCEVConstant>(Dist)) {
// For non-constant distances, we checked the lower bound of the
// dependence distance and the distance may be larger at runtime (and safe
// for vectorization). Classify it as Unknown, so we re-try with runtime
// checks.
return Dependence::Unknown;
}
LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
<< MinDistance << '\n');
return Dependence::Backward;
}

Expand Down Expand Up @@ -2215,12 +2229,13 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// is 8, which is less than 2 and forbids vectorization. But actually
// both A and B could be vectorized by 2 iterations.
MinDepDistBytes =
std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
std::min(static_cast<uint64_t>(MinDistance), MinDepDistBytes);

bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
uint64_t MinDepDistBytesOld = MinDepDistBytes;
if (IsTrueDataDependence && EnableForwardingConflictDetection &&
couldPreventStoreLoadForward(Distance, TypeByteSize)) {
isa<SCEVConstant>(Dist) &&
couldPreventStoreLoadForward(MinDistance, TypeByteSize)) {
// Sanity check that we didn't update MinDepDistBytes when calling
// couldPreventStoreLoadForward
assert(MinDepDistBytes == MinDepDistBytesOld &&
Expand All @@ -2232,10 +2247,18 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(

// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
// since there is a backwards dependency.
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');

uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
// For non-constant distances, we checked the lower bound of the dependence
// distance and the distance may be larger at runtime (and safe for
// vectorization). Classify it as Unknown, so we re-try with runtime checks.
return Dependence::Unknown;
}

MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
return Dependence::BackwardVectorizable;
}
Expand Down Expand Up @@ -3018,11 +3041,28 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
}

LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(nullptr),
DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
PtrRtChecking(nullptr), TheLoop(L) {
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
if (TTI) {
TypeSize FixedWidth =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
if (FixedWidth.isNonZero()) {
// Scale the vector width by 2 as rough estimate to also consider
// interleaving.
MaxTargetVectorWidthInBits = FixedWidth.getFixedValue() * 2;
}

TypeSize ScalableWidth =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector);
if (ScalableWidth.isNonZero())
MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
}
DepChecker =
std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
if (canAnalyzeLoop()) {
analyzeLoop(AA, LI, TLI, DT);
Expand Down Expand Up @@ -3082,7 +3122,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {

if (I.second)
I.first->second =
std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, &LI);

return *I.first->second;
}
Expand Down Expand Up @@ -3111,8 +3151,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
auto &AA = FAM.getResult<AAManager>(F);
auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
auto &LI = FAM.getResult<LoopAnalysis>(F);
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
}

AnalysisKey LoopAccessAnalysis::Key;
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Scalar/LoopFlatten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, nullptr);
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
const Function *F = L.getHeader()->getParent();
OptimizationRemarkEmitter ORE(F);

LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr);
LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

; CHECK: function 'Test':
; CHECK: .inner:
; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK: Check 0:
Expand Down
Loading

0 comments on commit 28767af

Please sign in to comment.