diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index dcf9aae95722e..f101a94cf2ca3 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -192,6 +192,11 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump
     PM->add(createEarlyCSEPass()); //// ****

+#if JL_LLVM_VERSION >= 50000
+    // Load forwarding above can expose allocations that aren't actually used;
+    // remove those before optimizing loops.
+    PM->add(createAllocOptPass());
+#endif
     PM->add(createLoopIdiomPass()); //// ****
     PM->add(createLoopRotatePass()); // Rotate loops.
 #ifdef USE_POLLY
@@ -214,6 +219,10 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump
     PM->add(createSimpleLoopUnrollPass()); // Unroll small loops
     //PM->add(createLoopStrengthReducePass()); // (jwb added)

+#if JL_LLVM_VERSION >= 50000
+    // Run our own SROA on heap objects before LLVM's
+    PM->add(createAllocOptPass());
+#endif
     // Re-run SROA after loop-unrolling (useful for small loops that operate,
     // over the structure of an aggregate)
     PM->add(createSROAPass()); // Break up aggregate allocas
@@ -230,6 +239,10 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump
     PM->add(createJumpThreadingPass()); // Thread jumps
     PM->add(createDeadStoreEliminationPass()); // Delete dead stores

+#if JL_LLVM_VERSION >= 50000
+    // More dead allocation (store) deletion before loop optimization
+    PM->add(createAllocOptPass());
+#endif
     // see if all of the constant folding has exposed more loops
     // to simplification and deletion
     // this helps significantly with cleaning up iteration
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index cb26f53451a2b..9d118680b2396 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -6,6 +6,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -17,6 +18,7 @@
 #include
 #include
 #include
+#include

 #include "codegen_shared.h"
 #include "julia.h"
@@ -58,6 +60,40 @@ static bool isBundleOperand(CallInst *call, unsigned idx)
 #endif
 }

+static void removeGCPreserve(CallInst *call, Instruction *val)
+{
+    auto replace = Constant::getNullValue(val->getType());
+    call->replaceUsesOfWith(val, replace);
+    for (auto &arg: call->arg_operands()) {
+        if (!isa<Constant>(arg.get())) {
+            return;
+        }
+    }
+    while (!call->use_empty()) {
+        auto end = cast<CallInst>(*call->user_begin());
+        // gc_preserve_end returns void.
+        assert(end->use_empty());
+        end->eraseFromParent();
+    }
+    call->eraseFromParent();
+}
+
+static bool hasObjref(Type *ty)
+{
+    if (auto ptrty = dyn_cast<PointerType>(ty))
+        return ptrty->getAddressSpace() == AddressSpace::Tracked;
+    if (auto seqty = dyn_cast<SequentialType>(ty))
+        return hasObjref(seqty->getElementType());
+    if (auto structty = dyn_cast<StructType>(ty)) {
+        for (auto elty: structty->elements()) {
+            if (hasObjref(elty)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 /**
  * Promote `julia.gc_alloc_obj` which do not have escaping root to a alloca.
  * Uses that are not considered to escape the object (i.e. heap address) includes,
@@ -75,6 +111,14 @@ static bool isBundleOperand(CallInst *call, unsigned idx)
  * All other uses are considered to escape conservatively.
  */

+/**
+ * TODO:
+ * * Return twice
+ * * Handle phi nodes.
+ * * Look through `pointer_from_objref`.
+ * * Handle jl_box*
+ */
+
 struct AllocOpt : public FunctionPass {
     static char ID;
     AllocOpt()
@@ -83,7 +127,6 @@ struct AllocOpt : public FunctionPass {
         llvm::initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
     }

-private:
     LLVMContext *ctx;

     const DataLayout *DL;
@@ -94,6 +137,7 @@ struct AllocOpt : public FunctionPass {
     Function *lifetime_end;
     Function *gc_preserve_begin;
     Function *typeof_func;
+    Function *write_barrier_func;

     Type *T_int8;
     Type *T_int32;
@@ -106,37 +150,66 @@ struct AllocOpt : public FunctionPass {

     MDNode *tbaa_tag;

-    struct CheckInstFrame {
-        Instruction *parent;
-        uint64_t offset;
-        Instruction::use_iterator use_it;
-        Instruction::use_iterator use_end;
-    };
-    typedef SmallVector CheckInstStack;
-    struct ReplaceUsesFrame {
-        Instruction *orig_i;
-        Instruction *new_i;
-        ReplaceUsesFrame(Instruction *orig_i, Instruction *new_i)
-            : orig_i(orig_i),
-              new_i(new_i)
-        {}
-    };
-    typedef SmallVector ReplaceUsesStack;
+private:
+    bool doInitialization(Module &m) override;
+    bool runOnFunction(Function &F) override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override
+    {
+        FunctionPass::getAnalysisUsage(AU);
+        AU.addRequired<DominatorTreeWrapperPass>();
+        AU.addPreserved<DominatorTreeWrapperPass>();
+        AU.setPreservesCFG();
+    }
+};

-    struct LifetimeMarker {
-        LifetimeMarker(AllocOpt &pass)
-            : pass(pass),
-              first_safepoint{},
-              stack{}
-        {}
-        // insert llvm.lifetime.* calls for `ptr` with size `sz`
-        // based on the use of `orig` given in `alloc_uses`.
-        void insert(Function &F, Instruction *ptr, Constant *sz, Instruction *orig,
-                    const std::set<Instruction*> &alloc_uses,
-                    const std::set<CallInst*> &preserves);
-    private:
-        Instruction *getFirstSafepoint(BasicBlock *bb);
-        void insertEnd(Instruction *ptr, Constant *sz, Instruction *insert);
+struct Optimizer {
+    Optimizer(Function &F, AllocOpt &pass)
+        : F(F),
+          pass(pass)
+    {}
+
+    void initialize();
+    void optimizeAll();
+    bool finalize();
+private:
+    bool isSafepoint(Instruction *inst);
+    Instruction *getFirstSafepoint(BasicBlock *bb);
+    ssize_t getGCAllocSize(Instruction *I);
+    void pushInstruction(Instruction *I);
+
+    void insertLifetimeEnd(Value *ptr, Constant *sz, Instruction *insert);
+    // insert llvm.lifetime.* calls for `ptr` with size `sz` based on the use of `orig`.
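+    // A `llvm.lifetime.start` is emitted at the allocation site; `llvm.lifetime.end`
+    // is inserted after the last use in each basic block where the object dies,
+    // at the first safepoint that follows when there is one, so that the stack
+    // slot can be reused.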
+    void insertLifetime(Value *ptr, Constant *sz, Instruction *orig);
+
+    void checkInst(Instruction *I);
+
+    void replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID,
+                                 Instruction *orig_i, Instruction *new_i);
+    void removeAlloc(CallInst *orig_inst);
+    void moveToStack(CallInst *orig_inst, size_t sz, bool has_ref);
+    void splitOnStack(CallInst *orig_inst);
+
+    Function &F;
+    AllocOpt &pass;
+    DominatorTree *_DT = nullptr;
+
+    DominatorTree &getDomTree()
+    {
+        if (!_DT)
+            _DT = &pass.getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+        return *_DT;
+    }
+
+    struct CheckInst {
+        struct Frame {
+            Instruction *parent;
+            uint32_t offset;
+            Instruction::use_iterator use_it;
+            Instruction::use_iterator use_end;
+        };
+        typedef SmallVector Stack;
+    };
+    struct Lifetime {
         struct Frame {
             BasicBlock *bb;
             pred_iterator p_cur;
@@ -147,37 +220,214 @@ struct AllocOpt : public FunctionPass {
                   p_end(pred_end(bb))
             {}
         };
-        AllocOpt &pass;
-        std::map<BasicBlock*, Instruction*> first_safepoint;
-        SmallVector stack;
+        typedef SmallVector Stack;
+    };
+    struct ReplaceUses {
+        struct Frame {
+            Instruction *orig_i;
+            union {
+                Instruction *new_i;
+                uint32_t offset;
+            };
+            Frame(Instruction *orig_i, Instruction *new_i)
+                : orig_i(orig_i),
+                  new_i(new_i)
+            {}
+            Frame(Instruction *orig_i, uint32_t offset)
+                : orig_i(orig_i),
+                  offset(offset)
+            {}
+        };
+        typedef SmallVector Stack;
     };

-    bool doInitialization(Module &m) override;
-    bool runOnFunction(Function &F) override;
-    bool checkInst(Instruction *I, CheckInstStack &stack, std::set<Instruction*> &uses,
-                   std::set<CallInst*> &preserves, bool &ignore_tag);
-    void replaceUsesWith(Instruction *orig_i, Instruction *new_i, ReplaceUsesStack &stack,
-                         Value *tag);
-    void replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, Instruction *orig_i,
-                                 Instruction *new_i);
-    bool isSafepoint(Instruction *inst);
-    void getAnalysisUsage(AnalysisUsage &AU) const override
-    {
-        FunctionPass::getAnalysisUsage(AU);
-        AU.addRequired<DominatorTreeWrapperPass>();
-        AU.addPreserved<DominatorTreeWrapperPass>();
-        AU.setPreservesCFG();
-    }
+    struct MemOp {
+        Instruction *inst;
+        unsigned opno;
+        uint32_t offset = 0;
+        uint32_t size = 0;
+        bool isobjref:1;
+        bool isaggr:1;
+        MemOp(Instruction *inst, unsigned opno)
+            : inst(inst),
+              opno(opno),
+              isobjref(false),
+              isaggr(false)
+        {}
+    };
+    struct Field {
+        uint32_t size;
+        bool hasobjref:1;
+        bool hasaggr:1;
+        bool multiloc:1;
+        bool hasload:1;
+        Type *elty;
+        SmallVector accesses;
+        Field(uint32_t size, Type *elty)
+            : size(size),
+              hasobjref(false),
+              hasaggr(false),
+              multiloc(false),
+              hasload(false),
+              elty(elty)
+        {
+        }
+    };
+    struct AllocUseInfo {
+        SmallSet uses;
+        SmallSet preserves;
+        std::map<uint32_t, Field> memops;
+        // Completely unknown use
+        bool escaped:1;
+        // Address is leaked to functions that don't care where the object is allocated.
+        bool addrescaped:1;
+        // There are readers of the memory
+        bool hasload:1;
+        // There are uses in gc_preserve intrinsics or ccall roots
+        bool haspreserve:1;
+        // There are object reference fields being loaded
+        bool refload:1;
+        // There are object reference fields being stored
+        bool refstore:1;
+        // There are memset calls
+        bool hasmemset:1;
+        // There are stores/loads/memsets on this object with an offset or size
+        // (or value, for memset) that cannot be statically computed.
+        // This is a weaker form of `addrescaped` since `hasload` can still be used
+        // to see if the memory is actually being used.
+        bool hasunknownmem:1;
+        void reset()
+        {
+            escaped = false;
+            addrescaped = false;
+            hasload = false;
+            haspreserve = false;
+            refload = false;
+            refstore = false;
+            hasunknownmem = false;
+            uses.clear();
+            preserves.clear();
+            memops.clear();
+        }
+        void dump();
+        bool addMemOp(Instruction *inst, unsigned opno, uint32_t offset, Type *elty,
+                      bool isstore, const DataLayout &DL);
+        std::pair<const uint32_t, Field> &getField(uint32_t offset, uint32_t size, Type *elty);
+        std::map<uint32_t, Field>::iterator findLowerField(uint32_t offset)
+        {
+            // Find the last field that starts no higher than `offset`.
+            auto it = memops.upper_bound(offset);
+            if (it != memops.begin())
+                return --it;
+            return memops.end();
+        }
+    };
+
+    SetVector<std::pair<CallInst*, size_t>> worklist;
+    SmallVector removed;
+    AllocUseInfo use_info;
+    CheckInst::Stack check_stack;
+    Lifetime::Stack lifetime_stack;
+    ReplaceUses::Stack replace_stack;
+    std::map<BasicBlock*, Instruction*> first_safepoint;
 };

-Instruction *AllocOpt::LifetimeMarker::getFirstSafepoint(BasicBlock *bb)
+void Optimizer::pushInstruction(Instruction *I)
+{
+    ssize_t sz = getGCAllocSize(I);
+    if (sz != -1) {
+        worklist.insert(std::make_pair(cast<CallInst>(I), sz));
+    }
+}
+
+void Optimizer::initialize()
+{
+    for (auto &bb: F) {
+        for (auto &I: bb) {
+            pushInstruction(&I);
+        }
+    }
+}
+
+void Optimizer::optimizeAll()
+{
+    while (!worklist.empty()) {
+        auto item = worklist.pop_back_val();
+        auto orig = item.first;
+        size_t sz = item.second;
+        checkInst(orig);
+        if (use_info.escaped)
+            continue;
+        if (!use_info.addrescaped && !use_info.hasload && (!use_info.haspreserve ||
+                                                           !use_info.refstore)) {
+            // No one took the address, no one reads anything, and there's no meaningful
+            // preserve of fields (either no preserve/ccall or no object reference fields),
+            // so we can just delete all the uses.
+            removeAlloc(orig);
+            continue;
+        }
+        bool has_ref = false;
+        bool has_refaggr = false;
+        for (auto memop: use_info.memops) {
+            auto &field = memop.second;
+            if (field.hasobjref) {
+                has_ref = true;
+                // This can be relaxed a little based on hasload
+                if (field.hasaggr || field.multiloc) {
+                    has_refaggr = true;
+                    break;
+                }
+            }
+        }
+        if (!use_info.hasunknownmem && !use_info.addrescaped && !has_refaggr) {
+            // No one actually cares about the memory layout of this object; split it.
+            splitOnStack(orig);
+            continue;
+        }
+        if (has_ref) {
+            if (use_info.memops.size() != 1 || has_refaggr ||
+                use_info.memops.begin()->second.size != sz) {
+                continue;
+            }
+            // The object only has a single field that's a reference, with only one
+            // kind of access.
+        }
+        moveToStack(orig, sz, has_ref);
+    }
+}
+
+bool Optimizer::finalize()
+{
+    if (removed.empty())
+        return false;
+    for (auto inst: removed)
+        inst->eraseFromParent();
+    return true;
+}
+
+bool Optimizer::isSafepoint(Instruction *inst)
+{
+    auto call = dyn_cast<CallInst>(inst);
+    if (!call)
+        return false;
+    if (isa<IntrinsicInst>(call))
+        return false;
+    if (auto callee = call->getCalledFunction()) {
+        // Known functions emitted in codegen that are not safepoints
+        if (callee == pass.ptr_from_objref || callee->getName() == "memcmp") {
+            return false;
+        }
+    }
+    return true;
+}
+
+Instruction *Optimizer::getFirstSafepoint(BasicBlock *bb)
 {
     auto it = first_safepoint.find(bb);
     if (it != first_safepoint.end())
         return it->second;
     Instruction *first = nullptr;
     for (auto &I: *bb) {
-        if (pass.isSafepoint(&I)) {
+        if (isSafepoint(&I)) {
             first = &I;
             break;
         }
@@ -186,7 +436,268 @@ Instruction *AllocOpt::LifetimeMarker::getFirstSafepoint(BasicBlock *bb)
     return first;
 }

-void AllocOpt::LifetimeMarker::insertEnd(Instruction *ptr, Constant *sz, Instruction *insert)
+ssize_t Optimizer::getGCAllocSize(Instruction *I)
+{
+    auto call = dyn_cast<CallInst>(I);
+    if (!call)
+        return -1;
+    if (call->getCalledValue() != pass.alloc_obj)
+        return -1;
+    assert(call->getNumArgOperands() == 3);
+    size_t sz = (size_t)cast<ConstantInt>(call->getArgOperand(1))->getZExtValue();
+    if (sz < IntegerType::MAX_INT_BITS / 8 && sz < INT32_MAX)
+        return sz;
+    return -1;
+}
+
+std::pair<const uint32_t, Optimizer::Field>&
+Optimizer::AllocUseInfo::getField(uint32_t offset, uint32_t size, Type *elty)
+{
+    auto it = findLowerField(offset);
+    auto end = memops.end();
+    auto lb = end; // first overlap
+    auto ub = end; // last overlap
+    if (it != end) {
+        // The slot found contains the current location
+        if (it->first + it->second.size >= offset + size) {
+            if (it->second.elty != elty)
+                it->second.elty = nullptr;
+            return *it;
+        }
+        if (it->first + it->second.size > offset) {
+            lb = it;
+            ub = it;
+        }
+    }
+    else {
+        it = memops.begin();
+    }
+    // Now find the last slot that overlaps with the current memory location.
+    // Also set `lb` if we didn't find any above.
+    for (; it != end && it->first < offset + size; ++it) {
+        if (lb == end)
+            lb = it;
+        ub = it;
+    }
+    // No overlap found; just create a new one.
+    if (lb == end)
+        return *memops.emplace(offset, Field(size, elty)).first;
+    // We found an overlapping slot that doesn't fully contain this access;
+    // merge the overlapping slots into a new one.
+    uint32_t new_offset = std::min(offset, lb->first);
+    uint32_t new_addrub = std::max(offset + uint32_t(size), ub->first + ub->second.size);
+    uint32_t new_size = new_addrub - new_offset;
+    Field field(new_size, nullptr);
+    field.multiloc = true;
+    ++ub;
+    for (it = lb; it != ub; ++it) {
+        field.hasobjref |= it->second.hasobjref;
+        field.hasload |= it->second.hasload;
+        field.hasaggr |= it->second.hasaggr;
+        field.accesses.append(it->second.accesses.begin(), it->second.accesses.end());
+    }
+    memops.erase(lb, ub);
+    return *memops.emplace(new_offset, std::move(field)).first;
+}
+
+bool Optimizer::AllocUseInfo::addMemOp(Instruction *inst, unsigned opno, uint32_t offset,
+                                       Type *elty, bool isstore, const DataLayout &DL)
+{
+    MemOp memop(inst, opno);
+    memop.offset = offset;
+    uint64_t size = DL.getTypeStoreSize(elty);
+    if (size >= UINT32_MAX - offset)
+        return false;
+    memop.size = size;
+    memop.isaggr = isa(elty);
+    memop.isobjref = hasObjref(elty);
+    auto &field = getField(offset, size, elty);
+    if (field.first != offset || field.second.size != size)
+        field.second.multiloc = true;
+    if (!isstore)
+        field.second.hasload = true;
+    if (memop.isobjref) {
+        if (isstore) {
+            refstore = true;
+        }
+        else {
+            refload = true;
+        }
+        if (memop.isaggr)
+            field.second.hasaggr = true;
+        field.second.hasobjref = true;
+    }
+    else if (memop.isaggr) {
+        field.second.hasaggr = true;
+    }
+    field.second.accesses.push_back(memop);
+    return true;
+}
+
+JL_USED_FUNC void Optimizer::AllocUseInfo::dump()
+{
+    jl_safe_printf("escaped: %d\n", escaped);
+    jl_safe_printf("addrescaped: %d\n", addrescaped);
+    jl_safe_printf("hasload: %d\n", hasload);
+    jl_safe_printf("haspreserve: %d\n", haspreserve);
+    jl_safe_printf("refload: %d\n", refload);
+    jl_safe_printf("refstore: %d\n", refstore);
+    jl_safe_printf("hasunknownmem: %d\n", hasunknownmem);
+    jl_safe_printf("Uses: %d\n", (unsigned)uses.size());
+    for (auto inst: uses)
+        llvm_dump(inst);
+    if (!preserves.empty()) {
+        jl_safe_printf("Preserves: %d\n", (unsigned)preserves.size());
+        for (auto inst: preserves) {
+            llvm_dump(inst);
+        }
+    }
+    if (!memops.empty()) {
+        jl_safe_printf("Memops: %d\n", (unsigned)memops.size());
+        for (auto &field: memops) {
+            jl_safe_printf("  Field %d @ %d\n", field.second.size, field.first);
+            jl_safe_printf("    Accesses:\n");
+            for (auto memop: field.second.accesses) {
+                jl_safe_printf("      ");
+                llvm_dump(memop.inst);
+            }
+        }
+    }
+}
+
+void Optimizer::checkInst(Instruction *I)
+{
+    use_info.reset();
+    if (I->use_empty())
+        return;
+    CheckInst::Frame cur{I, 0, I->use_begin(), I->use_end()};
+    check_stack.clear();
+
+    // Recursion
+    auto push_inst = [&] (Instruction *inst) {
+        if (cur.use_it != cur.use_end)
+            check_stack.push_back(cur);
+        cur.parent = inst;
+        cur.use_it = inst->use_begin();
+        cur.use_end = inst->use_end();
+    };
+
+    auto check_inst = [&] (Instruction *inst, Use *use) {
+        if (isa<LoadInst>(inst)) {
+            use_info.hasload = true;
+            if (cur.offset == UINT32_MAX || !use_info.addMemOp(inst, 0, cur.offset,
+                                                               inst->getType(),
+                                                               false, *pass.DL))
+                use_info.hasunknownmem = true;
+            return true;
+        }
+        if (auto call = dyn_cast<CallInst>(inst)) {
+            // TODO handle `memcmp`
+            // None of the intrinsics should care if the memory is stack or heap allocated.
+            auto callee = call->getCalledValue();
+            if (auto II = dyn_cast<IntrinsicInst>(call)) {
+                if (auto id = II->getIntrinsicID()) {
+                    if (id == Intrinsic::memset) {
+                        assert(call->getNumArgOperands() == 5);
+                        use_info.hasmemset = true;
+                        if (cur.offset == UINT32_MAX ||
+                            !isa<ConstantInt>(call->getArgOperand(2)) ||
+                            !isa<ConstantInt>(call->getArgOperand(1)) ||
+                            (cast<ConstantInt>(call->getArgOperand(2))->getLimitedValue() >=
+                             UINT32_MAX - cur.offset))
+                            use_info.hasunknownmem = true;
+                        return true;
+                    }
+                    if (id == Intrinsic::lifetime_start || id == Intrinsic::lifetime_end ||
+                        isa<DbgInfoIntrinsic>(II))
+                        return true;
+                    use_info.addrescaped = true;
+                    return true;
+                }
+                if (pass.gc_preserve_begin == callee) {
+                    for (auto user: call->users())
+                        use_info.uses.insert(cast<Instruction>(user));
+                    use_info.preserves.insert(call);
+                    use_info.haspreserve = true;
+                    return true;
+                }
+            }
+            if (pass.ptr_from_objref == callee) {
+                use_info.addrescaped = true;
+                return true;
+            }
+            if (pass.typeof_func == callee || pass.write_barrier_func == callee)
+                return true;
+            auto opno = use->getOperandNo();
+            // Uses in `jl_roots` operand bundle are not counted as escaping, everything else is.
+            if (!isBundleOperand(call, opno) ||
+                call->getOperandBundleForOperand(opno).getTagName() != "jl_roots") {
+                use_info.escaped = true;
+                return false;
+            }
+            use_info.haspreserve = true;
+            return true;
+        }
+        if (auto store = dyn_cast<StoreInst>(inst)) {
+            // Only the pointer operand counts as a non-escaping use;
+            // storing the object itself (as the value) escapes.
+            if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) {
+                use_info.escaped = true;
+                return false;
+            }
+            auto storev = store->getValueOperand();
+            if (cur.offset == UINT32_MAX || !use_info.addMemOp(inst, use->getOperandNo(),
+                                                               cur.offset, storev->getType(),
+                                                               true, *pass.DL))
+                use_info.hasunknownmem = true;
+            return true;
+        }
+        if (isa<AddrSpaceCastInst>(inst) || isa<BitCastInst>(inst)) {
+            push_inst(inst);
+            return true;
+        }
+        if (auto gep = dyn_cast<GetElementPtrInst>(inst)) {
+            uint64_t next_offset = cur.offset;
+            if (cur.offset != UINT32_MAX) {
+                APInt apoffset(sizeof(void*) * 8, cur.offset, true);
+                if (!gep->accumulateConstantOffset(*pass.DL, apoffset) || apoffset.isNegative()) {
+                    next_offset = UINT32_MAX;
+                }
+                else {
+                    next_offset = apoffset.getLimitedValue();
+                    if (next_offset > UINT32_MAX) {
+                        next_offset = UINT32_MAX;
+                    }
+                }
+            }
+            push_inst(inst);
+            cur.offset = (uint32_t)next_offset;
+            return true;
+        }
+        use_info.escaped = true;
+        return false;
+    };
+
+    while (true) {
+        assert(cur.use_it != cur.use_end);
+        auto use = &*cur.use_it;
+        auto inst = dyn_cast<Instruction>(use->getUser());
+        ++cur.use_it;
+        if (!inst) {
+            use_info.escaped = true;
+            return;
+        }
+        if (!check_inst(inst, use))
+            return;
+        use_info.uses.insert(inst);
+        if (cur.use_it == cur.use_end) {
+            if (check_stack.empty())
+                return;
+            cur = check_stack.back();
+            check_stack.pop_back();
+        }
+    }
+}
+
+void Optimizer::insertLifetimeEnd(Value *ptr, Constant *sz, Instruction *insert)
 {
     BasicBlock::iterator it(insert);
     BasicBlock::iterator begin(insert->getParent()->begin());
@@ -208,35 +719,33 @@ void AllocOpt::LifetimeMarker::insertEnd(Instruction *ptr, Constant *sz, Instruc
     CallInst::Create(pass.lifetime_end, {sz, ptr}, "", insert);
 }

-void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *sz,
-                                      Instruction *orig, const std::set<Instruction*> &alloc_uses,
-                                      const std::set<CallInst*> &preserves)
+void Optimizer::insertLifetime(Value *ptr, Constant *sz, Instruction *orig)
 {
     CallInst::Create(pass.lifetime_start, {sz, ptr}, "", orig);
     BasicBlock *def_bb = orig->getParent();
     std::set<BasicBlock*> bbs{def_bb};
-    auto &DT = pass.getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto &DT = getDomTree();
     // Collect all BB where the allocation is live
-    for (auto 
use: alloc_uses) { + for (auto use: use_info.uses) { auto bb = use->getParent(); if (!bbs.insert(bb).second) continue; - assert(stack.empty()); - Frame cur{bb}; + assert(lifetime_stack.empty()); + Lifetime::Frame cur{bb}; while (true) { assert(cur.p_cur != cur.p_end); auto pred = *cur.p_cur; ++cur.p_cur; if (bbs.insert(pred).second) { if (cur.p_cur != cur.p_end) - stack.push_back(cur); - cur = Frame(pred); + lifetime_stack.push_back(cur); + cur = Lifetime::Frame(pred); } if (cur.p_cur == cur.p_end) { - if (stack.empty()) + if (lifetime_stack.empty()) break; - cur = stack.back(); - stack.pop_back(); + cur = lifetime_stack.back(); + lifetime_stack.pop_back(); } } } @@ -257,7 +766,7 @@ void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *s // Record extra BBs that contain invisible uses. SmallSet extra_use; SmallVector*, 8> dominated; - for (auto preserve: preserves) { + for (auto preserve: use_info.preserves) { for (auto RN = DT.getNode(preserve->getParent()); RN; RN = dominated.empty() ? nullptr : dominated.pop_back_val()) { for (auto N: *RN) { @@ -310,7 +819,7 @@ void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *s } else { for (auto it = bb->rbegin(), end = bb->rend(); it != end; ++it) { - if (alloc_uses.count(&*it)) { + if (use_info.uses.count(&*it)) { --it; first_dead.push_back(&*it); break; @@ -334,11 +843,11 @@ void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *s // Otherwise, there could be use that we don't track // before hitting the next safepoint. if (!DT.dominates(orig, bb)) { - insertEnd(ptr, sz, &*bb->getFirstInsertionPt()); + insertLifetimeEnd(ptr, sz, &*bb->getFirstInsertionPt()); continue; } else if (auto insert = getFirstSafepoint(bb)) { - insertEnd(ptr, sz, insert); + insertLifetimeEnd(ptr, sz, insert); continue; } } @@ -349,8 +858,8 @@ void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *s bool safepoint_found = false; for (; it != end; ++it) { auto insert = &*it; - if (pass.isSafepoint(insert)) { - insertEnd(ptr, sz, insert); + if (isSafepoint(insert)) { + insertLifetimeEnd(ptr, sz, insert); safepoint_found = true; break; } @@ -365,153 +874,17 @@ void AllocOpt::LifetimeMarker::insert(Function &F, Instruction *ptr, Constant *s } } -bool AllocOpt::doInitialization(Module &M) +void Optimizer::replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, + Instruction *orig_i, Instruction *new_i) { - ctx = &M.getContext(); - DL = &M.getDataLayout(); - - alloc_obj = M.getFunction("julia.gc_alloc_obj"); - if (!alloc_obj) - return false; - - ptr_from_objref = M.getFunction("julia.pointer_from_objref"); - gc_preserve_begin = M.getFunction("llvm.julia.gc_preserve_begin"); - typeof_func = M.getFunction("julia.typeof"); - - T_prjlvalue = alloc_obj->getReturnType(); - T_pjlvalue = PointerType::get(cast(T_prjlvalue)->getElementType(), 0); - T_pprjlvalue = PointerType::get(T_prjlvalue, 0); - - T_int8 = Type::getInt8Ty(*ctx); - T_int32 = Type::getInt32Ty(*ctx); - T_int64 = Type::getInt64Ty(*ctx); - T_size = sizeof(void*) == 8 ? 
T_int64 : T_int32; - T_pint8 = PointerType::get(T_int8, 0); - -#if JL_LLVM_VERSION >= 50000 - lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start, { T_pint8 }); - lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end, { T_pint8 }); -#else - lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start); - lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end); -#endif - - MDNode *tbaa_data; - MDNode *tbaa_data_scalar; - std::tie(tbaa_data, tbaa_data_scalar) = tbaa_make_child("jtbaa_data"); - tbaa_tag = tbaa_make_child("jtbaa_tag", tbaa_data_scalar).first; - - return true; -} - -bool AllocOpt::checkInst(Instruction *I, CheckInstStack &stack, std::set &uses, - std::set &preserves, bool &ignore_tag) -{ - uses.clear(); - if (I->use_empty()) - return true; - CheckInstFrame cur{I, 0, I->use_begin(), I->use_end()}; - stack.clear(); - - // Recursion - auto push_inst = [&] (Instruction *inst) { - if (cur.use_it != cur.use_end) - stack.push_back(cur); - cur.parent = inst; - cur.use_it = inst->use_begin(); - cur.use_end = inst->use_end(); - }; - - auto check_inst = [&] (Instruction *inst, Use *use) { - if (isa(inst)) - return true; - if (auto call = dyn_cast(inst)) { - // TODO handle `memcmp` - // None of the intrinsics should care if the memory is stack or heap allocated. - auto callee = call->getCalledFunction(); - if (auto II = dyn_cast(call)) { - if (II->getIntrinsicID()) { - return true; - } - if (gc_preserve_begin && gc_preserve_begin == callee) { - for (auto user: call->users()) - uses.insert(cast(user)); - preserves.insert(call); - return true; - } - } - if (ptr_from_objref && ptr_from_objref == callee) - return true; - if (typeof_func && typeof_func == callee) - return true; - auto opno = use->getOperandNo(); - // Uses in `jl_roots` operand bundle are not counted as escaping, everything else is. - if (!isBundleOperand(call, opno)) - return false; - return call->getOperandBundleForOperand(opno).getTagName() == "jl_roots"; - } - if (auto store = dyn_cast(inst)) { - // Only store value count - if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) - return false; - auto storev = store->getValueOperand(); - // There's GC root in this object. - if (auto ptrtype = dyn_cast(storev->getType())) { - if (ptrtype->getAddressSpace() == AddressSpace::Tracked) { - return false; - } - } - return true; - } - if (isa(inst) || isa(inst)) { - push_inst(inst); - return true; - } - if (auto gep = dyn_cast(inst)) { - APInt apoffset(sizeof(void*) * 8, cur.offset, true); - if (ignore_tag && (!gep->accumulateConstantOffset(*DL, apoffset) || - apoffset.isNegative())) - ignore_tag = false; - push_inst(inst); - cur.offset = apoffset.getLimitedValue(); - // Check overflow - if (cur.offset == UINT64_MAX) - ignore_tag = false; - return true; - } - return false; - }; - - while (true) { - assert(cur.use_it != cur.use_end); - auto use = &*cur.use_it; - auto inst = dyn_cast(use->getUser()); - ++cur.use_it; - if (!inst) - return false; - if (!check_inst(inst, use)) - return false; - uses.insert(inst); - if (cur.use_it == cur.use_end) { - if (stack.empty()) - return true; - cur = stack.back(); - stack.pop_back(); - } - } -} - -void AllocOpt::replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, - Instruction *orig_i, Instruction *new_i) -{ - auto nargs = call->getNumArgOperands(); - SmallVector args(nargs); - SmallVector argTys(nargs); - for (unsigned i = 0; i < nargs; i++) { - auto arg = call->getArgOperand(i); - args[i] = arg == orig_i ? 
new_i : arg; - argTys[i] = args[i]->getType(); - } + auto nargs = call->getNumArgOperands(); + SmallVector args(nargs); + SmallVector argTys(nargs); + for (unsigned i = 0; i < nargs; i++) { + auto arg = call->getArgOperand(i); + args[i] = arg == orig_i ? new_i : arg; + argTys[i] = args[i]->getType(); + } // Accumulate an array of overloaded types for the given intrinsic SmallVector overloadTys; @@ -536,12 +909,12 @@ void AllocOpt::replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, newCall->setTailCallKind(call->getTailCallKind()); auto old_attrs = call->getAttributes(); #if JL_LLVM_VERSION >= 50000 - newCall->setAttributes(AttributeList::get(*ctx, old_attrs.getFnAttributes(), + newCall->setAttributes(AttributeList::get(*pass.ctx, old_attrs.getFnAttributes(), old_attrs.getRetAttributes(), {})); #else AttributeSet attr; - attr = attr.addAttributes(*ctx, AttributeSet::ReturnIndex, old_attrs.getRetAttributes()) - .addAttributes(*ctx, AttributeSet::FunctionIndex, old_attrs.getFnAttributes()); + attr = attr.addAttributes(*pass.ctx, AttributeSet::ReturnIndex, old_attrs.getRetAttributes()) + .addAttributes(*pass.ctx, AttributeSet::FunctionIndex, old_attrs.getFnAttributes()); newCall->setAttributes(attr); #endif newCall->setDebugLoc(call->getDebugLoc()); @@ -549,12 +922,46 @@ void AllocOpt::replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, call->eraseFromParent(); } -// This function needs to handle all cases `AllocOpt::checkInst` can handle. // This function should not erase any safepoint so that the lifetime marker can find and cache // all the original safepoints. -void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, - ReplaceUsesStack &stack, Value *tag) +void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref) { + auto tag = orig_inst->getArgOperand(2); + removed.push_back(orig_inst); + // The allocation does not escape or get used in a phi node so none of the derived + // SSA from it are live when we run the allocation again. + // It is now safe to promote the allocation to an entry block alloca. + size_t align = 1; + // TODO make codegen handling of alignment consistent and pass that as a parameter + // to the allocation function directly. + if (sz > 1) + align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz)); + // No debug info for prolog instructions + IRBuilder<> prolog_builder(&F.getEntryBlock().front()); + AllocaInst *buff; + Instruction *ptr; + if (sz == 0) { + buff = prolog_builder.CreateAlloca(pass.T_int8, ConstantInt::get(pass.T_int64, 0)); + ptr = buff; + } + else if (has_ref) { + // Allocate with the correct type so that the GC frame lowering pass will + // treat this as a non-mem2reg'd alloca + // The ccall root and GC preserve handling below makes sure that + // the alloca isn't optimized out. 
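+        // (This branch is only reached when `optimizeAll` has proven that the
+        // object consists of exactly one reference field covering the whole
+        // allocation, so a single `T_prjlvalue` slot is sufficient.)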
+ buff = prolog_builder.CreateAlloca(pass.T_prjlvalue); + buff->setAlignment(align); + ptr = cast(prolog_builder.CreateBitCast(buff, pass.T_pint8)); + } + else { + buff = prolog_builder.CreateAlloca(Type::getIntNTy(*pass.ctx, sz * 8)); + buff->setAlignment(align); + ptr = cast(prolog_builder.CreateBitCast(buff, pass.T_pint8)); + } + insertLifetime(ptr, ConstantInt::get(pass.T_int64, sz), orig_inst); + auto new_inst = cast(prolog_builder.CreateBitCast(ptr, pass.T_pjlvalue)); + new_inst->takeName(orig_inst); + auto simple_replace = [&] (Instruction *orig_i, Instruction *new_i) { if (orig_i->user_empty()) { if (orig_i != orig_inst) @@ -573,8 +980,8 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, }; if (simple_replace(orig_inst, new_inst)) return; - assert(stack.empty()); - ReplaceUsesFrame cur{orig_inst, new_inst}; + assert(replace_stack.empty()); + ReplaceUses::Frame cur{orig_inst, new_inst}; auto finish_cur = [&] () { assert(cur.orig_i->user_empty()); if (cur.orig_i != orig_inst) { @@ -584,7 +991,7 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, auto push_frame = [&] (Instruction *orig_i, Instruction *new_i) { if (simple_replace(orig_i, new_i)) return; - stack.push_back(cur); + replace_stack.push_back(cur); cur = {orig_i, new_i}; }; // Both `orig_i` and `new_i` should be pointer of the same type @@ -596,24 +1003,28 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, user->replaceUsesOfWith(orig_i, new_i); } else if (auto call = dyn_cast(user)) { - if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) { + auto callee = call->getCalledValue(); + if (pass.ptr_from_objref == callee) { call->replaceAllUsesWith(new_i); call->eraseFromParent(); return; } - if (typeof_func && typeof_func == call->getCalledFunction()) { + if (pass.typeof_func == callee) { call->replaceAllUsesWith(tag); call->eraseFromParent(); return; } // Also remove the preserve intrinsics so that it can be better optimized. - if (gc_preserve_begin && gc_preserve_begin == call->getCalledFunction()) { - while (!call->use_empty()) { - auto end = cast(*call->user_begin()); - // gc_preserve_end returns void. - assert(end->use_empty()); - end->eraseFromParent(); + if (pass.gc_preserve_begin == callee) { + if (has_ref) { + call->replaceUsesOfWith(orig_i, buff); + } + else { + removeGCPreserve(call, orig_i); } + return; + } + if (pass.write_barrier_func == callee) { call->eraseFromParent(); return; } @@ -624,8 +1035,8 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, } } // remove from operand bundle - Type *orig_t = orig_i->getType(); - user->replaceUsesOfWith(orig_i, ConstantPointerNull::get(cast(orig_t))); + Value *replace = has_ref ? (Value*)buff : Constant::getNullValue(orig_i->getType()); + user->replaceUsesOfWith(orig_i, replace); } else if (isa(user) || isa(user)) { auto cast_t = PointerType::get(cast(user->getType())->getElementType(), @@ -658,124 +1069,439 @@ void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, replace_inst(cast(*cur.orig_i->user_begin())); while (cur.orig_i->use_empty()) { finish_cur(); - if (stack.empty()) + if (replace_stack.empty()) return; - cur = stack.back(); - stack.pop_back(); + cur = replace_stack.back(); + replace_stack.pop_back(); } } } -bool AllocOpt::isSafepoint(Instruction *inst) +// This function should not erase any safepoint so that the lifetime marker can find and cache +// all the original safepoints. 
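+// All remaining uses are known removable at this point: stores into the object
+// are dead, there are no loads, `julia.typeof` calls fold to the type tag, and
+// GC preserves / `jl_roots` operand bundles are rewritten to drop the object.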
+void Optimizer::removeAlloc(CallInst *orig_inst)
 {
-    auto call = dyn_cast<CallInst>(inst);
-    if (!call)
-        return false;
-    if (isa<IntrinsicInst>(call))
+    auto tag = orig_inst->getArgOperand(2);
+    removed.push_back(orig_inst);
+    auto simple_remove = [&] (Instruction *orig_i) {
+        if (orig_i->user_empty()) {
+            if (orig_i != orig_inst)
+                orig_i->eraseFromParent();
+            return true;
+        }
         return false;
-    if (auto callee = call->getCalledFunction()) {
-        // Known functions emitted in codegen that are not safepoints
-        if (callee == ptr_from_objref || callee->getName() == "memcmp") {
-            return false;
+    };
+    if (simple_remove(orig_inst))
+        return;
+    assert(replace_stack.empty());
+    ReplaceUses::Frame cur{orig_inst, nullptr};
+    auto finish_cur = [&] () {
+        assert(cur.orig_i->user_empty());
+        if (cur.orig_i != orig_inst) {
+            cur.orig_i->eraseFromParent();
+        }
+    };
+    auto push_frame = [&] (Instruction *orig_i) {
+        if (simple_remove(orig_i))
+            return;
+        replace_stack.push_back(cur);
+        cur = {orig_i, nullptr};
+    };
+    auto remove_inst = [&] (Instruction *user) {
+        Instruction *orig_i = cur.orig_i;
+        if (auto store = dyn_cast<StoreInst>(user)) {
+            // All stores are known to be dead.
+            // The stored value might be a GC pointer, in which case deleting the object
+            // might open more optimization opportunities.
+            if (auto stored_inst = dyn_cast<Instruction>(store->getValueOperand()))
+                pushInstruction(stored_inst);
+            user->eraseFromParent();
+            return;
+        }
+        else if (auto call = dyn_cast<CallInst>(user)) {
+            auto callee = call->getCalledValue();
+            if (pass.gc_preserve_begin == callee) {
+                removeGCPreserve(call, orig_i);
+                return;
+            }
+            if (pass.typeof_func == callee) {
+                call->replaceAllUsesWith(tag);
+                call->eraseFromParent();
+                return;
+            }
+            if (pass.write_barrier_func == callee) {
+                call->eraseFromParent();
+                return;
+            }
+            if (auto II = dyn_cast<IntrinsicInst>(call)) {
+                auto id = II->getIntrinsicID();
+                if (id == Intrinsic::memset || id == Intrinsic::lifetime_start ||
+                    id == Intrinsic::lifetime_end || isa<DbgInfoIntrinsic>(II)) {
+                    call->eraseFromParent();
+                }
+            }
+            // remove from operand bundle
+            user->replaceUsesOfWith(orig_i, Constant::getNullValue(orig_i->getType()));
+        }
+        else if (isa<AddrSpaceCastInst>(user) || isa<BitCastInst>(user) ||
+                 isa<GetElementPtrInst>(user)) {
+            push_frame(user);
+        }
+        else {
+            abort();
+        }
+    };
+
+    while (true) {
+        remove_inst(cast<Instruction>(*cur.orig_i->user_begin()));
+        while (cur.orig_i->use_empty()) {
+            finish_cur();
+            if (replace_stack.empty())
+                return;
+            cur = replace_stack.back();
+            replace_stack.pop_back();
         }
     }
-    return true;
 }

-bool AllocOpt::runOnFunction(Function &F)
+void Optimizer::splitOnStack(CallInst *orig_inst)
 {
-    if (!alloc_obj)
+    auto tag = orig_inst->getArgOperand(2);
+    removed.push_back(orig_inst);
+    IRBuilder<> prolog_builder(&F.getEntryBlock().front());
+    struct SplitSlot {
+        AllocaInst *slot;
+        bool isref;
+        uint32_t offset;
+        uint32_t size;
+    };
+    SmallVector slots;
+    for (auto memop: use_info.memops) {
+        auto offset = memop.first;
+        auto &field = memop.second;
+        // If the field has no reader and is not an object reference field that we
+        // need to preserve at some point, there's no need to allocate the field.
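+        // (Such write-only fields get no stack slot at all; their stores are
+        // simply deleted when the uses are rewritten below.)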
+ if (!field.hasload && (!field.hasobjref || !use_info.haspreserve)) + continue; + SplitSlot slot{nullptr, field.hasobjref, offset, field.size}; + Type *allocty; + if (field.hasobjref) { + allocty = pass.T_prjlvalue; + } + else if (field.elty && !field.multiloc) { + allocty = field.elty; + } + else { + allocty = Type::getIntNTy(*pass.ctx, field.size * 8); + } + slot.slot = prolog_builder.CreateAlloca(allocty); + insertLifetime(prolog_builder.CreateBitCast(slot.slot, pass.T_pint8), + ConstantInt::get(pass.T_int64, field.size), orig_inst); + slots.push_back(std::move(slot)); + } + const auto nslots = slots.size(); + auto find_slot = [&] (uint32_t offset) { + if (offset == 0) + return 0u; + unsigned lb = 0; + unsigned ub = slots.size(); + while (lb + 1 < ub) { + unsigned mid = (lb + ub) / 2; + if (slots[mid].offset <= offset) { + lb = mid; + } + else { + ub = mid; + } + } + return lb; + }; + auto simple_replace = [&] (Instruction *orig_i) { + if (orig_i->user_empty()) { + if (orig_i != orig_inst) + orig_i->eraseFromParent(); + return true; + } return false; - SmallVector,6> allocs; - for (auto &bb: F) { - for (auto &I: bb) { - auto call = dyn_cast(&I); - if (!call) - continue; - auto callee = call->getCalledFunction(); - if (!callee) - continue; - size_t sz; - if (callee == alloc_obj) { - assert(call->getNumArgOperands() == 3); - sz = (size_t)cast(call->getArgOperand(1))->getZExtValue(); + }; + if (simple_replace(orig_inst)) + return; + assert(replace_stack.empty()); + ReplaceUses::Frame cur{orig_inst, uint32_t(0)}; + auto finish_cur = [&] () { + assert(cur.orig_i->user_empty()); + if (cur.orig_i != orig_inst) { + cur.orig_i->eraseFromParent(); + } + }; + auto push_frame = [&] (Instruction *orig_i, uint32_t offset) { + if (simple_replace(orig_i)) + return; + replace_stack.push_back(cur); + cur = {orig_i, offset}; + }; + auto slot_gep = [&] (SplitSlot &slot, uint32_t offset, Type *elty, IRBuilder<> &builder) { + assert(slot.offset <= offset); + offset -= slot.offset; + auto size = pass.DL->getTypeAllocSize(elty); + Value *addr; + if (offset % size == 0) { + addr = builder.CreateBitCast(slot.slot, elty->getPointerTo()); + if (offset != 0) { + addr = builder.CreateConstInBoundsGEP1_32(elty, addr, offset / size); + } + } + else { + addr = builder.CreateBitCast(slot.slot, pass.T_pint8); + addr = builder.CreateConstInBoundsGEP1_32(pass.T_int8, addr, offset); + addr = builder.CreateBitCast(addr, elty->getPointerTo()); + } + return addr; + }; + auto replace_inst = [&] (Use *use) { + Instruction *user = cast(use->getUser()); + Instruction *orig_i = cur.orig_i; + uint32_t offset = cur.offset; + if (auto load = dyn_cast(user)) { + auto slot_idx = find_slot(offset); + auto &slot = slots[slot_idx]; + assert(slot.offset <= offset && slot.offset + slot.size >= offset); + IRBuilder<> builder(load); + Value *val; + auto load_ty = load->getType(); + if (slot.isref) { + assert(slot.offset == offset); + val = builder.CreateLoad(pass.T_prjlvalue, slot.slot); + // Assume the addrspace is correct. 
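+                // (Reference slots are always allocated as `T_prjlvalue`; a load
+                // expecting a different pointer type is simply bitcast back.)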
+ val = builder.CreateBitCast(val, load_ty); } else { - continue; + val = builder.CreateLoad(load_ty, slot_gep(slot, offset, load_ty, builder)); } - if (sz < IntegerType::MAX_INT_BITS / 8 && sz < INT32_MAX) { - allocs.push_back(std::make_pair(call, sz)); + load->replaceAllUsesWith(val); + load->eraseFromParent(); + return; + } + else if (auto store = dyn_cast(user)) { + if (auto stored_inst = dyn_cast(store->getValueOperand())) + pushInstruction(stored_inst); + auto slot_idx = find_slot(offset); + auto &slot = slots[slot_idx]; + if (slot.offset > offset || slot.offset + slot.size <= offset) { + store->eraseFromParent(); + return; + } + IRBuilder<> builder(store); + auto store_val = store->getValueOperand(); + auto store_ty = store_val->getType(); + if (slot.isref) { + assert(slot.offset == offset); + if (!isa(store_ty)) { + store_val = builder.CreateBitCast(store_val, pass.T_size); + store_val = builder.CreateIntToPtr(store_val, pass.T_pjlvalue); + store_ty = pass.T_pjlvalue; + } + else { + store_ty = cast(pass.T_pjlvalue)->getElementType() + ->getPointerTo(cast(store_ty)->getAddressSpace()); + store_val = builder.CreateBitCast(store_val, store_ty); + } + if (cast(store_ty)->getAddressSpace() != AddressSpace::Tracked) + store_val = builder.CreateAddrSpaceCast(store_val, pass.T_prjlvalue); + builder.CreateStore(store_val, slot.slot); + } + else { + builder.CreateStore(store_val, slot_gep(slot, offset, store_ty, builder)); } + store->eraseFromParent(); + return; } - } - - auto &entry = F.getEntryBlock(); - CheckInstStack check_stack; - ReplaceUsesStack replace_stack; - std::set alloc_uses; - std::set preserves; - LifetimeMarker lifetime(*this); - for (auto &it: allocs) { - // TODO, this should not be needed anymore now that we've hide the tag access completely. 
- bool ignore_tag = true; - auto orig = it.first; - size_t &sz = it.second; - preserves.clear(); - if (!checkInst(orig, check_stack, alloc_uses, preserves, ignore_tag)) { - sz = UINT32_MAX; - continue; + else if (auto call = dyn_cast(user)) { + auto callee = call->getCalledValue(); + if (auto intrinsic = dyn_cast(call)) { + if (Intrinsic::ID id = intrinsic->getIntrinsicID()) { + if (id == Intrinsic::memset) { + IRBuilder<> builder(call); + auto val_arg = cast(call->getArgOperand(1)); + auto size_arg = cast(call->getArgOperand(2)); + uint8_t val = val_arg->getLimitedValue(); + uint32_t size = size_arg->getLimitedValue(); + for (unsigned idx = find_slot(offset); idx < nslots; idx++) { + auto &slot = slots[idx]; + if (slot.offset + slot.size <= offset || slot.offset >= offset + size) + break; + if (slot.isref) { + assert(slot.offset >= offset && + slot.offset + slot.size <= offset + size); + Constant *ptr; + if (val == 0) { + ptr = Constant::getNullValue(pass.T_prjlvalue); + } + else { + uint64_t intval; + memset(&intval, val, 8); + Constant *val = ConstantInt::get(pass.T_size, intval); + val = ConstantExpr::getIntToPtr(val, pass.T_pjlvalue); + ptr = ConstantExpr::getAddrSpaceCast(val, pass.T_prjlvalue); + } + builder.CreateStore(ptr, slot.slot); + continue; + } + auto ptr8 = builder.CreateBitCast(slot.slot, pass.T_pint8); + if (offset > slot.offset) + ptr8 = builder.CreateConstInBoundsGEP1_32(pass.T_int8, ptr8, + offset - slot.offset); + auto sub_size = std::min(slot.offset + slot.size, offset + size) - + std::max(offset, slot.offset); + builder.CreateMemSet(ptr8, val_arg, sub_size, 0); + } + call->eraseFromParent(); + return; + } + call->eraseFromParent(); + return; + } + } + if (pass.typeof_func == callee) { + call->replaceAllUsesWith(tag); + call->eraseFromParent(); + return; + } + if (pass.write_barrier_func == callee) { + call->eraseFromParent(); + return; + } + if (pass.gc_preserve_begin == callee) { + SmallVector operands; + for (auto &arg: call->arg_operands()) { + if (arg.get() == orig_i || isa(arg.get())) + continue; + operands.push_back(arg.get()); + } + IRBuilder<> builder(call); + for (auto &slot: slots) { + if (!slot.isref) + continue; + operands.push_back(builder.CreateLoad(pass.T_prjlvalue, slot.slot)); + } + auto new_call = builder.CreateCall(pass.gc_preserve_begin, operands); + new_call->takeName(call); + new_call->setAttributes(call->getAttributes()); + call->replaceAllUsesWith(new_call); + call->eraseFromParent(); + return; + } + // remove from operand bundle + assert(isBundleOperand(call, use->getOperandNo())); + assert(call->getOperandBundleForOperand(use->getOperandNo()).getTagName() == + "jl_roots"); + SmallVector bundles; + call->getOperandBundlesAsDefs(bundles); + for (auto &bundle: bundles) { + if (bundle.getTag() != "jl_roots") + continue; + std::vector operands; + for (auto op: bundle.inputs()) { + if (op == orig_i || isa(op)) + continue; + operands.push_back(op); + } + IRBuilder<> builder(call); + for (auto &slot: slots) { + if (!slot.isref) + continue; + operands.push_back(builder.CreateLoad(pass.T_prjlvalue, slot.slot)); + } + bundle = OperandBundleDef("jl_roots", std::move(operands)); + break; + } + auto new_call = CallInst::Create(call, bundles, call); + new_call->takeName(call); + call->replaceAllUsesWith(new_call); + call->eraseFromParent(); + return; + } + else if (isa(user) || isa(user)) { + push_frame(user, offset); } - // The allocation does not escape or get used in a phi node so none of the derived - // SSA from it are live when we run the allocation 
again. - // It is now safe to promote the allocation to an entry block alloca. - size_t align = 1; - // TODO make codegen handling of alignment consistent and pass that as a parameter - // to the allocation function directly. - if (!ignore_tag) { - align = sz <= 8 ? 8 : JL_SMALL_BYTE_ALIGNMENT; - sz += align; - } - else if (sz > 1) { - align = llvm::MinAlign(JL_SMALL_BYTE_ALIGNMENT, llvm::NextPowerOf2(sz)); - } - // No debug info for prolog instructions - IRBuilder<> prolog_builder(&entry.front()); - AllocaInst *buff; - Instruction *ptr; - if (sz == 0) { - buff = prolog_builder.CreateAlloca(T_int8, ConstantInt::get(T_int64, 0)); - ptr = buff; + else if (auto gep = dyn_cast(user)) { + APInt apoffset(sizeof(void*) * 8, offset, true); + gep->accumulateConstantOffset(*pass.DL, apoffset); + push_frame(gep, apoffset.getLimitedValue()); } else { - buff = prolog_builder.CreateAlloca(Type::getIntNTy(*ctx, sz * 8)); - buff->setAlignment(align); - ptr = cast(prolog_builder.CreateBitCast(buff, T_pint8)); - } - lifetime.insert(F, ptr, ConstantInt::get(T_int64, sz), orig, alloc_uses, preserves); - auto tag = orig->getArgOperand(2); - // Someone might be reading the tag, initialize it. - if (!ignore_tag) { - ptr = cast(prolog_builder.CreateConstGEP1_32(T_int8, ptr, align)); - auto casti = prolog_builder.CreateBitCast(ptr, T_pprjlvalue); - auto tagaddr = prolog_builder.CreateGEP(T_prjlvalue, casti, - ConstantInt::get(T_size, -1)); - // Store should be created at the callsite and not in the prolog - auto store = new StoreInst(tag, tagaddr, orig); - store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); - store->setDebugLoc(orig->getDebugLoc()); - } - auto casti = cast(prolog_builder.CreateBitCast(ptr, T_pjlvalue)); - casti->takeName(orig); - replaceUsesWith(orig, cast(casti), replace_stack, tag); - } - for (auto it: allocs) { - if (it.second == UINT32_MAX) + abort(); + } + }; + + while (true) { + replace_inst(&*cur.orig_i->use_begin()); + while (cur.orig_i->use_empty()) { + finish_cur(); + if (replace_stack.empty()) + goto cleanup; + cur = replace_stack.back(); + replace_stack.pop_back(); + } + } +cleanup: + for (auto &slot: slots) { + if (!slot.isref) continue; - it.first->eraseFromParent(); + PromoteMemToReg({slot.slot}, getDomTree()); } +} + +bool AllocOpt::doInitialization(Module &M) +{ + ctx = &M.getContext(); + DL = &M.getDataLayout(); + + alloc_obj = M.getFunction("julia.gc_alloc_obj"); + if (!alloc_obj) + return false; + + ptr_from_objref = M.getFunction("julia.pointer_from_objref"); + gc_preserve_begin = M.getFunction("llvm.julia.gc_preserve_begin"); + typeof_func = M.getFunction("julia.typeof"); + write_barrier_func = M.getFunction("julia.write_barrier"); + + T_prjlvalue = alloc_obj->getReturnType(); + T_pjlvalue = PointerType::get(cast(T_prjlvalue)->getElementType(), 0); + T_pprjlvalue = PointerType::get(T_prjlvalue, 0); + + T_int8 = Type::getInt8Ty(*ctx); + T_int32 = Type::getInt32Ty(*ctx); + T_int64 = Type::getInt64Ty(*ctx); + T_size = sizeof(void*) == 8 ? 
T_int64 : T_int32; + T_pint8 = PointerType::get(T_int8, 0); + +#if JL_LLVM_VERSION >= 50000 + lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start, { T_pint8 }); + lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end, { T_pint8 }); +#else + lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start); + lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end); +#endif + + MDNode *tbaa_data; + MDNode *tbaa_data_scalar; + std::tie(tbaa_data, tbaa_data_scalar) = tbaa_make_child("jtbaa_data"); + tbaa_tag = tbaa_make_child("jtbaa_tag", tbaa_data_scalar).first; + return true; } +bool AllocOpt::runOnFunction(Function &F) +{ + if (!alloc_obj) + return false; + Optimizer optimizer(F, *this); + optimizer.initialize(); + optimizer.optimizeAll(); + return optimizer.finalize(); +} + char AllocOpt::ID = 0; static RegisterPass X("AllocOpt", "Promote heap allocation to stack", false /* Only looks at CFG */, diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index df95eb8521136..016f4d8b90f8e 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -769,7 +769,7 @@ void RecursivelyVisit(callback f, Value *V) { f(VU); if (isa(TheUser) || isa(TheUser) || isa(TheUser) || isa(TheUser) || - isa(TheUser)) + isa(TheUser) || isa(TheUser)) continue; if (isa(TheUser) || isa(TheUser) || isa(TheUser)) { RecursivelyVisit(f, TheUser); diff --git a/test/llvmpasses/alloc-opt2.jl b/test/llvmpasses/alloc-opt2.jl index 18f9e780f4cf9..c23444fb0abb7 100644 --- a/test/llvmpasses/alloc-opt2.jl +++ b/test/llvmpasses/alloc-opt2.jl @@ -46,8 +46,48 @@ L3: """) # CHECK-LABEL: } +# CHECK-LABEL: @preserve_branches2 +# CHECK: alloca i64 +# CHECK: call %jl_value_t*** @julia.ptls_states() +# CHECK: L1: +# CHECK-NEXT: call void @llvm.lifetime.start{{.*}}(i64 8, +# CHECK-NEXT: @llvm.julia.gc_preserve_begin{{.*}}%jl_value_t addrspace(10)* %v2 +# CHECK-NEXT: @external_function() +# CHECK-NEXT: br i1 %b2, label %L2, label %L3 + +# CHECK: L2: +# CHECK-NOT: call void @llvm.lifetime.end{{.*}}(i64 8, +# CHECK: @external_function() +# CHECK-NEXT: br label %L3 + +# CHECK: L3: +# CHECK-NEXT: call void @llvm.lifetime.end{{.*}}(i64 8, +println(""" +define void @preserve_branches2(i8* %fptr, i1 %b, i1 %b2) { + %ptls = call %jl_value_t*** @julia.ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v2 = call %jl_value_t addrspace(10)* @external_function2() + br i1 %b, label %L1, label %L3 + +L1: + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %tok = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* %v, %jl_value_t addrspace(10)* %v2) + call void @external_function() + br i1 %b2, label %L2, label %L3 + +L2: + call void @external_function() + br label %L3 + +L3: + ret void +} +""") +# CHECK-LABEL: } + println(""" declare void @external_function() +declare %jl_value_t addrspace(10)* @external_function2() declare %jl_value_t*** @julia.ptls_states() declare noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8*, $isz, %jl_value_t addrspace(10)*) declare i64 @julia.pointer_from_objref(%jl_value_t addrspace(11)*)