Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EarlyCSE] Compare GEP instructions based on offset #65875

Merged
merged 9 commits into from
Sep 19, 2023
153 changes: 133 additions & 20 deletions llvm/lib/Transforms/Scalar/EarlyCSE.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ struct SimpleValue {
!CI->getFunction()->isPresplitCoroutine();
}
return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
isa<FreezeInst>(Inst);
}
};

Expand Down Expand Up @@ -307,10 +307,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
IVI->getOperand(1),
hash_combine_range(IVI->idx_begin(), IVI->idx_end()));

assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
isa<FreezeInst>(Inst)) &&
assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
"Invalid/unknown instruction");

// Handle intrinsics with commutative operands.
Expand Down Expand Up @@ -553,6 +552,77 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
return LHSI->isIdenticalTo(RHSI);
}

//===----------------------------------------------------------------------===//
// GEPValue
//===----------------------------------------------------------------------===//

namespace {

/// Hash-table key wrapping a getelementptr instruction.
///
/// Besides the instruction itself, the key can carry the GEP's constant
/// offset. DenseMapInfo<GEPValue> uses that offset (together with the base
/// pointer operand) for hashing and equality when it is available, and falls
/// back to the GEP's operands otherwise.
struct GEPValue {
  /// The wrapped instruction, or one of the DenseMapInfo<Instruction *>
  /// empty/tombstone sentinel pointers.
  Instruction *Inst;
  /// Constant offset associated with Inst. Only meaningful when
  /// HasConstantOffset is true; default-constructed otherwise.
  APInt ConstantOffset;
  /// True when ConstantOffset holds a valid offset for Inst.
  bool HasConstantOffset;

  /// Wraps \p I without any offset information.
  ///
  /// Deliberately not `explicit`: DenseMapInfo<GEPValue> builds its sentinel
  /// keys by implicitly converting an Instruction pointer to a GEPValue.
  GEPValue(Instruction *I) : Inst(I), HasConstantOffset(false) {
    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
  }

  /// Wraps \p I together with its (possibly absent) constant offset.
  ///
  /// \param I                 the instruction to wrap; must be a sentinel or
  ///                          satisfy canHandle().
  /// \param ConstantOffset    the offset value; ignored by hashing/equality
  ///                          unless \p HasConstantOffset is true.
  /// \param HasConstantOffset whether \p ConstantOffset is valid.
  GEPValue(Instruction *I, APInt ConstantOffset, bool HasConstantOffset)
      : Inst(I), ConstantOffset(ConstantOffset),
        HasConstantOffset(HasConstantOffset) {
    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
  }

  /// Returns true if Inst is the empty key or the tombstone key rather than a
  /// real instruction.
  bool isSentinel() const {
    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  /// Returns true if \p Inst is a getelementptr instruction, the only kind of
  /// instruction this key type accepts.
  static bool canHandle(Instruction *Inst) {
    return isa<GetElementPtrInst>(Inst);
  }
};

} // namespace

namespace llvm {

/// DenseMapInfo specialization that allows GEPValue to be used as a hash-table
/// key. Sentinel keys are built from the corresponding Instruction pointer
/// sentinels; hashing and equality are defined out of line.
template <> struct DenseMapInfo<GEPValue> {
  /// Sentinel key wrapping DenseMapInfo<Instruction *>::getEmptyKey().
  static inline GEPValue getEmptyKey() {
    Instruction *EmptyInst = DenseMapInfo<Instruction *>::getEmptyKey();
    return GEPValue(EmptyInst);
  }

  /// Sentinel key wrapping DenseMapInfo<Instruction *>::getTombstoneKey().
  static inline GEPValue getTombstoneKey() {
    Instruction *TombstoneInst = DenseMapInfo<Instruction *>::getTombstoneKey();
    return GEPValue(TombstoneInst);
  }

  /// Equality predicate for two keys (defined out of line).
  static bool isEqual(GEPValue LHS, GEPValue RHS);

  /// Hash function for a key (defined out of line).
  static unsigned getHashValue(GEPValue Val);
};

} // end namespace llvm

/// Computes the hash of a GEPValue key.
///
/// A key with a known constant offset is hashed from the opcode, the base
/// pointer operand and that offset, so GEPs that spell the same offset with
/// different index lists land in the same bucket. Any other key is hashed
/// from the opcode and all of its value operands.
///
/// NOTE(review): this casts Val.Inst unconditionally, so it presumably must
/// not be called with a sentinel key -- confirm against the hash table's
/// usage.
unsigned DenseMapInfo<GEPValue>::getHashValue(GEPValue Val) {
  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val.Inst);
  if (Val.HasConstantOffset)
    return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(),
                        Val.ConstantOffset);
  return hash_combine(
      GEP->getOpcode(),
      hash_combine_range(GEP->value_op_begin(), GEP->value_op_end()));
}

/// Decides whether two GEPValue keys denote the same value.
///
/// Sentinel keys only compare equal to the very same sentinel. Real keys must
/// share the base pointer operand; beyond that they match when both carry a
/// constant offset and the offsets are equal, and otherwise the decision is
/// left to Instruction::isIdenticalToWhenDefined.
bool DenseMapInfo<GEPValue>::isEqual(GEPValue LHS, GEPValue RHS) {
  // Sentinels wrap no real instruction, so only the raw pointers are compared.
  const bool AnySentinel = LHS.isSentinel() || RHS.isSentinel();
  if (AnySentinel)
    return LHS.Inst == RHS.Inst;

  auto *LeftGEP = cast<GetElementPtrInst>(LHS.Inst);
  auto *RightGEP = cast<GetElementPtrInst>(RHS.Inst);

  // Different base pointers can never be treated as the same address here.
  if (LeftGEP->getPointerOperand() != RightGEP->getPointerOperand())
    return false;

  const bool BothConstant = LHS.HasConstantOffset && RHS.HasConstantOffset;
  if (!BothConstant)
    return LeftGEP->isIdenticalToWhenDefined(RightGEP);
  return LHS.ConstantOffset == RHS.ConstantOffset;
}

//===----------------------------------------------------------------------===//
// EarlyCSE implementation
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -647,6 +717,13 @@ class EarlyCSE {
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;

using GEPMapAllocatorTy =
RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<GEPValue, Value *>>;
using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
GEPMapAllocatorTy>;
GEPHTType AvailableGEPs;

/// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;

Expand All @@ -667,9 +744,11 @@ class EarlyCSE {
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
: Scope(AvailableValues), LoadScope(AvailableLoads),
InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
GEPHTType &AvailableGEPs)
: Scope(AvailableValues), LoadScope(AvailableLoads),
InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
GEPScope(AvailableGEPs) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;

Expand All @@ -678,6 +757,7 @@ class EarlyCSE {
LoadHTType::ScopeTy LoadScope;
InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
GEPHTType::ScopeTy GEPScope;
};

// Contains all the needed information to create a stack for doing a depth
Expand All @@ -688,13 +768,13 @@ class EarlyCSE {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
DomTreeNode::const_iterator child,
DomTreeNode::const_iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
EndIter(end),
Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
AvailableCalls)
{}
AvailableCalls, AvailableGEPs) {}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;

Expand Down Expand Up @@ -1561,6 +1641,39 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}

if (GEPValue::canHandle(&Inst)) {
GetElementPtrInst *GEP = cast<GetElementPtrInst>(&Inst);
APInt Offset(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
bool HasConstantOffset = GEP->accumulateConstantOffset(SQ.DL, Offset);
GEPValue GEPVal(GEP, Offset, HasConstantOffset);
if (Value *V = AvailableGEPs.lookup(GEPVal)) {
LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
<< '\n');
if (auto *I = dyn_cast<Instruction>(V)) {
// If I being poison triggers UB, there is no need to drop those
// flags. Otherwise, only retain flags present on both I and Inst.
// TODO: Currently some fast-math flags are not treated as
// poison-generating even though they should. Until this is fixed,
// always retain flags present on both I and Inst for floating point
// instructions.
if (isa<FPMathOperator>(I) ||
(I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
I->andIRFlags(&Inst);
}
Inst.replaceAllUsesWith(V);
salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
Inst.eraseFromParent();
Changed = true;
++NumCSE;
continue;
}

// Otherwise, just remember that this value is available.
AvailableGEPs.insert(GEPVal, &Inst);
continue;
}

// A release fence requires that all stores complete before it, but does
// not prevent the reordering of following loads 'before' the fence. As a
// result, we don't need to consider it as writing to memory and don't need
Expand Down Expand Up @@ -1675,7 +1788,7 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
CurrentGeneration, DT.getRootNode(),
AvailableGEPs, CurrentGeneration, DT.getRootNode(),
DT.getRootNode()->begin(), DT.getRootNode()->end()));

assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
Expand All @@ -1698,10 +1811,10 @@ bool EarlyCSE::run() {
} else if (NodeToProcess->childIter() != NodeToProcess->end()) {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
AvailableCalls, NodeToProcess->childGeneration(),
child, child->begin(), child->end()));
nodesToProcess.push_back(new StackNode(
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
AvailableGEPs, NodeToProcess->childGeneration(), child,
child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
Expand Down
44 changes: 44 additions & 0 deletions llvm/test/Transforms/EarlyCSE/gep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s

%T1 = type { i64, i64, i64 }

declare void @use_vec(<4 x ptr>);

; Exercises offset-based CSE of getelementptr instructions that all use the
; base pointer %a. Going by the autogenerated assertions below, %s1a, %n1d,
; %n1g, %n1h, %v and %v2 are expected to remain, while %s1b, %s1c, %s1e, %s1f
; and every load are expected to be removed.
define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
; CHECK-NEXT: [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
; CHECK-NEXT: [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
; CHECK-NEXT: [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
; CHECK-NEXT: [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
; CHECK-NEXT: [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V]])
; CHECK-NEXT: [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 0, i64 2, i64 1, i64 1>
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V2]])
; CHECK-NEXT: ret void
;
; Reference GEP: byte offset 8 from %a.
%s1a = getelementptr i8, ptr %a, i64 8
%s1av = load i64, ptr %s1a
; Same base and byte offset as %s1a; differs only by the inbounds flag.
%s1b = getelementptr inbounds i8, ptr %a, i64 8
%s1bv = load i64, ptr %s1b
; Field 1 of %T1 ({ i64, i64, i64 }), i.e. the second i64 of the struct.
%s1c = getelementptr %T1, ptr %a, i64 0, i32 1
%s1cv = load i64, ptr %s1c
; Byte offset 7: a different offset from %s1a, so this GEP is kept.
%n1d = getelementptr i8, ptr %a, i64 7
%n1dv = load i64, ptr %n1d
; One i64 element past %a.
%s1e = getelementptr i64, ptr %a, i64 1
%s1ev = load i64, ptr %s1e
; Two i32 elements past %a.
%s1f = getelementptr i32, ptr %a, i64 2
%s1fv = load i64, ptr %s1f
; One i32 element past %a: a different offset from %s1a, so this GEP is kept.
%n1g = getelementptr i32, ptr %a, i64 1
%n1gv = load i64, ptr %n1g
; Variable index %i, so there is no constant offset to compare; kept.
%n1h = getelementptr i8, ptr %a, i64 %i
%n1hv = load i64, ptr %n1h

; GEPs with vector indices; both are kept according to the assertions above.
%v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
call void @use_vec(<4 x ptr> %v)
%v2 = getelementptr i64, ptr %a, <4 x i64> <i64 0, i64 2, i64 1, i64 1>
call void @use_vec(<4 x ptr> %v2)
ret void
}
44 changes: 44 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -O3 -S | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }

; Stores a <32 x i8> value into a stack-allocated %Zip and then copies it byte
; by byte to %a in a loop. Per the autogenerated assertions below, the -O3
; pipeline is expected to reduce the whole function to a single <32 x i8>
; store of %_0 to %a.
define void @foo(ptr %a, <32 x i8> %_0) #0 {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: start:
; CHECK-NEXT: store <32 x i8> [[_0]], ptr [[A]], align 1
; CHECK-NEXT: ret void
;
start:
%z = alloca %Zip, align 8
; The same stack slots are addressed below both through raw i8 offsets and
; through %Zip field indices.
%sroa_1 = getelementptr i8, ptr %z, i64 16
store <32 x i8> %_0, ptr %sroa_1, align 8
%len_ = getelementptr i8, ptr %z, i64 56
store i64 32, ptr %len_, align 8
%_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
; NOTE(review): with the datalayout above, %_2 appears to address the same
; slot as %len_ (byte offset 56), which would make %len equal to 32 -- confirm.
%_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
%len = load i64, ptr %_2, align 8
; NOTE(review): %_10 appears to address the same bytes as %sroa_1 (byte
; offset 16) -- confirm.
%_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
br label %body

; Copies one byte per iteration from %_10[%idx] to the advancing pointer %_34
; until %idx_ reaches %len.
body: ; preds = %body, %start
%_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
%idx = phi i64 [ %idx_, %body ], [ 0, %start ]
%_34i = getelementptr i8, ptr %_34, i64 1
%idx_ = add i64 %idx, 1
store i64 0, ptr %_1, align 8
%_24 = getelementptr i8, ptr %_10, i64 %idx
%_18 = load i8, ptr %_24, align 1
store i8 %_18, ptr %_34, align 1
%_6 = icmp eq i64 %len, %idx_
br i1 %_6, label %exit, label %body

exit: ; preds = %body
ret void
}

attributes #0 = { "target-cpu"="znver3" }