diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 53ffaaaa8dab3..91d994071eabb 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -193,6 +193,11 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump PM->add(createEarlyCSEPass()); //// **** +#if JL_LLVM_VERSION >= 50000 + // Load forwarding above can expose allocations that aren't actually used + // remove those before optimizing loops. + PM->add(createAllocOptPass()); +#endif PM->add(createLoopIdiomPass()); //// **** PM->add(createLoopRotatePass()); // Rotate loops. #ifdef USE_POLLY @@ -215,6 +220,10 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump PM->add(createSimpleLoopUnrollPass()); // Unroll small loops //PM->add(createLoopStrengthReducePass()); // (jwb added) +#if JL_LLVM_VERSION >= 50000 + // Run our own SROA on heap objects before LLVM's + PM->add(createAllocOptPass()); +#endif // Re-run SROA after loop-unrolling (useful for small loops that operate, // over the structure of an aggregate) PM->add(createSROAPass()); // Break up aggregate allocas @@ -231,6 +240,10 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, bool dump PM->add(createJumpThreadingPass()); // Thread jumps PM->add(createDeadStoreEliminationPass()); // Delete dead stores +#if JL_LLVM_VERSION >= 50000 + // More dead allocation (store) deletion before loop optimization + PM->add(createAllocOptPass()); +#endif // see if all of the constant folding has exposed more loops // to simplification and deletion // this helps significantly with cleaning up iteration diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 5ca10d5439c10..9d118680b2396 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "codegen_shared.h" #include "julia.h" @@ -61,7 +62,7 @@ static bool isBundleOperand(CallInst *call, unsigned idx) static void removeGCPreserve(CallInst *call, Instruction *val) { - auto replace = ConstantPointerNull::get(cast(val->getType())); + auto replace = Constant::getNullValue(val->getType()); call->replaceUsesOfWith(val, replace); for (auto &arg: call->arg_operands()) { if (!isa(arg.get())) { @@ -77,6 +78,22 @@ static void removeGCPreserve(CallInst *call, Instruction *val) call->eraseFromParent(); } +static bool hasObjref(Type *ty) +{ + if (auto ptrty = dyn_cast(ty)) + return ptrty->getAddressSpace() == AddressSpace::Tracked; + if (auto seqty = dyn_cast(ty)) + return hasObjref(seqty->getElementType()); + if (auto structty = dyn_cast(ty)) { + for (auto elty: structty->elements()) { + if (hasObjref(elty)) { + return true; + } + } + } + return false; +} + /** * Promote `julia.gc_alloc_obj` which do not have escaping root to a alloca. * Uses that are not considered to escape the object (i.e. heap address) includes, @@ -94,6 +111,14 @@ static void removeGCPreserve(CallInst *call, Instruction *val) * All other uses are considered to escape conservatively. */ +/** + * TODO: + * * Return twice + * * Handle phi node. + * * Look through `pointer_from_objref`. + * * Handle jl_box* + */ + struct AllocOpt : public FunctionPass { static char ID; AllocOpt() @@ -112,6 +137,7 @@ struct AllocOpt : public FunctionPass { Function *lifetime_end; Function *gc_preserve_begin; Function *typeof_func; + Function *write_barrier_func; Type *T_int8; Type *T_int32; @@ -149,17 +175,19 @@ struct Optimizer { bool isSafepoint(Instruction *inst); Instruction *getFirstSafepoint(BasicBlock *bb); ssize_t getGCAllocSize(Instruction *I); + void pushInstruction(Instruction *I); - void insertLifetimeEnd(Instruction *ptr, Constant *sz, Instruction *insert); - // insert llvm.lifetime.* calls for `ptr` with size `sz` - // based on the use of `orig` given in `alloc_uses`. - void insertLifetime(Instruction *ptr, Constant *sz, Instruction *orig); + void insertLifetimeEnd(Value *ptr, Constant *sz, Instruction *insert); + // insert llvm.lifetime.* calls for `ptr` with size `sz` based on the use of `orig`. + void insertLifetime(Value *ptr, Constant *sz, Instruction *orig); - bool checkInst(Instruction *I); + void checkInst(Instruction *I); void replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, Instruction *orig_i, Instruction *new_i); - void replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, Value *tag); + void removeAlloc(CallInst *orig_inst); + void moveToStack(CallInst *orig_inst, size_t sz, bool has_ref); + void splitOnStack(CallInst *orig_inst); Function &F; AllocOpt &pass; @@ -175,7 +203,7 @@ struct Optimizer { struct CheckInst { struct Frame { Instruction *parent; - uint64_t offset; + uint32_t offset; Instruction::use_iterator use_it; Instruction::use_iterator use_end; }; @@ -197,33 +225,126 @@ struct Optimizer { struct ReplaceUses { struct Frame { Instruction *orig_i; - Instruction *new_i; + union { + Instruction *new_i; + uint32_t offset; + }; Frame(Instruction *orig_i, Instruction *new_i) : orig_i(orig_i), new_i(new_i) {} + Frame(Instruction *orig_i, uint32_t offset) + : orig_i(orig_i), + offset(offset) + {} }; typedef SmallVector Stack; }; + struct MemOp { + Instruction *inst; + unsigned opno; + uint32_t offset = 0; + uint32_t size = 0; + bool isobjref:1; + bool isaggr:1; + MemOp(Instruction *inst, unsigned opno) + : inst(inst), + opno(opno), + isobjref(false), + isaggr(false) + {} + }; + struct Field { + uint32_t size; + bool hasobjref:1; + bool hasaggr:1; + bool multiloc:1; + bool hasload:1; + Type *elty; + SmallVector accesses; + Field(uint32_t size, Type *elty) + : size(size), + hasobjref(false), + hasaggr(false), + multiloc(false), + hasload(false), + elty(elty) + { + } + }; + struct AllocUseInfo { + SmallSet uses; + SmallSet preserves; + std::map memops; + // Completely unknown use + bool escaped:1; + // Address is leaked to functions that doesn't care where the object is allocated. + bool addrescaped:1; + // There are reader of the memory + bool hasload:1; + // There are uses in gc_preserve intrinsics or ccall roots + bool haspreserve:1; + // There are objects fields being loaded + bool refload:1; + // There are objects fields being stored + bool refstore:1; + // There are memset call + bool hasmemset:1; + // There are store/load/memset on this object with offset or size (or value for memset) + // that cannot be statically computed. + // This is a weaker form of `addrescaped` since `hasload` can still be used + // to see if the memory is actually being used + bool hasunknownmem:1; + void reset() + { + escaped = false; + addrescaped = false; + hasload = false; + haspreserve = false; + refload = false; + refstore = false; + hasunknownmem = false; + uses.clear(); + preserves.clear(); + memops.clear(); + } + void dump(); + bool addMemOp(Instruction *inst, unsigned opno, uint32_t offset, Type *elty, + bool isstore, const DataLayout &DL); + std::pair &getField(uint32_t offset, uint32_t size, Type *elty); + std::map::iterator findLowerField(uint32_t offset) + { + // Find the last field that starts no higher than `offset`. + auto it = memops.upper_bound(offset); + if (it != memops.begin()) + return --it; + return memops.end(); + } + }; + SetVector> worklist; SmallVector removed; - SmallSet alloc_uses; - SmallSet preserves; + AllocUseInfo use_info; CheckInst::Stack check_stack; Lifetime::Stack lifetime_stack; ReplaceUses::Stack replace_stack; std::map first_safepoint; }; +void Optimizer::pushInstruction(Instruction *I) +{ + ssize_t sz = getGCAllocSize(I); + if (sz != -1) { + worklist.insert(std::make_pair(cast(I), sz)); + } +} + void Optimizer::initialize() { for (auto &bb: F) { for (auto &I: bb) { - ssize_t sz = getGCAllocSize(&I); - if (sz != -1) { - worklist.insert(std::make_pair(cast(&I), sz)); - } + pushInstruction(&I); } } } @@ -234,35 +355,43 @@ void Optimizer::optimizeAll() auto item = worklist.pop_back_val(); auto orig = item.first; size_t sz = item.second; - if (!checkInst(orig)) + checkInst(orig); + if (use_info.escaped) + continue; + if (!use_info.addrescaped && !use_info.hasload && (!use_info.haspreserve || + !use_info.refstore)) { + // No one took the address, no one reads anything and there's no meaningful + // preserve of fields (either no preserve/ccall or no object reference fields) + // We can just delete all the uses. + removeAlloc(orig); continue; - removed.push_back(orig); - // The allocation does not escape or get used in a phi node so none of the derived - // SSA from it are live when we run the allocation again. - // It is now safe to promote the allocation to an entry block alloca. - size_t align = 1; - // TODO make codegen handling of alignment consistent and pass that as a parameter - // to the allocation function directly. - if (sz > 1) - align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz)); - // No debug info for prolog instructions - IRBuilder<> prolog_builder(&F.getEntryBlock().front()); - AllocaInst *buff; - Instruction *ptr; - if (sz == 0) { - buff = prolog_builder.CreateAlloca(pass.T_int8, ConstantInt::get(pass.T_int64, 0)); - ptr = buff; } - else { - buff = prolog_builder.CreateAlloca(Type::getIntNTy(*pass.ctx, sz * 8)); - buff->setAlignment(align); - ptr = cast(prolog_builder.CreateBitCast(buff, pass.T_pint8)); + bool has_ref = false; + bool has_refaggr = false; + for (auto memop: use_info.memops) { + auto &field = memop.second; + if (field.hasobjref) { + has_ref = true; + // This can be relaxed a little based on hasload + if (field.hasaggr || field.multiloc) { + has_refaggr = true; + break; + } + } + } + if (!use_info.hasunknownmem && !use_info.addrescaped && !has_refaggr) { + // No one actually care about the memory layout of this object, split it. + splitOnStack(orig); + continue; + } + if (has_ref) { + if (use_info.memops.size() != 1 || has_refaggr || + use_info.memops.begin()->second.size != sz) { + continue; + } + // The object only has a single field that's a reference with only one kind of access. } - insertLifetime(ptr, ConstantInt::get(pass.T_int64, sz), orig); - auto tag = orig->getArgOperand(2); - auto casti = cast(prolog_builder.CreateBitCast(ptr, pass.T_pjlvalue)); - casti->takeName(orig); - replaceUsesWith(orig, casti, tag); + moveToStack(orig, sz, has_ref); } } @@ -321,12 +450,125 @@ ssize_t Optimizer::getGCAllocSize(Instruction *I) return -1; } -bool Optimizer::checkInst(Instruction *I) +std::pair& +Optimizer::AllocUseInfo::getField(uint32_t offset, uint32_t size, Type *elty) +{ + auto it = findLowerField(offset); + auto end = memops.end(); + auto lb = end; // first overlap + auto ub = end; // last overlap + if (it != end) { + // The slot found contains the current location + if (it->first + it->second.size >= offset + size) { + if (it->second.elty != elty) + it->second.elty = nullptr; + return *it; + } + if (it->first + it->second.size > offset) { + lb = it; + ub = it; + } + } + else { + it = memops.begin(); + } + // Now fine the last slot that overlaps with the current memory location. + // Also set `lb` if we didn't find any above. + for (; it != end && it->first < offset + size; ++it) { + if (lb == end) + lb = it; + ub = it; + } + // no overlap found just create a new one. + if (lb == end) + return *memops.emplace(offset, Field(size, elty)).first; + // We find overlapping but not containing slot we need to merge slot/create new one + uint32_t new_offset = std::min(offset, lb->first); + uint32_t new_addrub = std::max(offset + uint32_t(size), ub->first + ub->second.size); + uint32_t new_size = new_addrub - new_offset; + Field field(new_size, nullptr); + field.multiloc = true; + ++ub; + for (it = lb; it != ub; ++it) { + field.hasobjref |= it->second.hasobjref; + field.hasload |= it->second.hasload; + field.hasaggr |= it->second.hasaggr; + field.accesses.append(it->second.accesses.begin(), it->second.accesses.end()); + } + memops.erase(lb, ub); + return *memops.emplace(new_offset, std::move(field)).first; +} + +bool Optimizer::AllocUseInfo::addMemOp(Instruction *inst, unsigned opno, uint32_t offset, + Type *elty, bool isstore, const DataLayout &DL) +{ + MemOp memop(inst, opno); + memop.offset = offset; + uint64_t size = DL.getTypeStoreSize(elty); + if (size >= UINT32_MAX - offset) + return false; + memop.size = size; + memop.isaggr = isa(elty); + memop.isobjref = hasObjref(elty); + auto &field = getField(offset, size, elty); + if (field.first != offset || field.second.size != size) + field.second.multiloc = true; + if (!isstore) + field.second.hasload = true; + if (memop.isobjref) { + if (isstore) { + refstore = true; + } + else { + refload = true; + } + if (memop.isaggr) + field.second.hasaggr = true; + field.second.hasobjref = true; + } + else if (memop.isaggr) { + field.second.hasaggr = true; + } + field.second.accesses.push_back(memop); + return true; +} + +JL_USED_FUNC void Optimizer::AllocUseInfo::dump() +{ + jl_safe_printf("escaped: %d\n", escaped); + jl_safe_printf("addrescaped: %d\n", addrescaped); + jl_safe_printf("hasload: %d\n", hasload); + jl_safe_printf("haspreserve: %d\n", haspreserve); + jl_safe_printf("refload: %d\n", refload); + jl_safe_printf("refstore: %d\n", refstore); + jl_safe_printf("hasunknownmem: %d\n", hasunknownmem); + jl_safe_printf("Uses: %d\n", (unsigned)uses.size()); + for (auto inst: uses) + llvm_dump(inst); + if (!preserves.empty()) { + jl_safe_printf("Preserves: %d\n", (unsigned)preserves.size()); + for (auto inst: preserves) { + llvm_dump(inst); + } + } + if (!memops.empty()) { + jl_safe_printf("Memops: %d\n", (unsigned)memops.size()); + for (auto &field: memops) { + jl_safe_printf(" Field %d @ %d\n", field.second.size, field.first); + jl_safe_printf(" Accesses:\n"); + for (auto memop: field.second.accesses) { + jl_safe_printf(" "); + llvm_dump(memop.inst); + } + } + } +} + +void Optimizer::checkInst(Instruction *I) { - preserves.clear(); - alloc_uses.clear(); + use_info.reset(); if (I->use_empty()) - return true; + return; CheckInst::Frame cur{I, 0, I->use_begin(), I->use_end()}; check_stack.clear(); @@ -340,42 +582,72 @@ bool Optimizer::checkInst(Instruction *I) }; auto check_inst = [&] (Instruction *inst, Use *use) { - if (isa(inst)) + if (isa(inst)) { + use_info.hasload = true; + if (cur.offset == UINT32_MAX || !use_info.addMemOp(inst, 0, cur.offset, + inst->getType(), + false, *pass.DL)) + use_info.hasunknownmem = true; return true; + } if (auto call = dyn_cast(inst)) { // TODO handle `memcmp` // None of the intrinsics should care if the memory is stack or heap allocated. auto callee = call->getCalledValue(); if (auto II = dyn_cast(call)) { - if (II->getIntrinsicID()) { + if (auto id = II->getIntrinsicID()) { + if (id == Intrinsic::memset) { + assert(call->getNumArgOperands() == 5); + use_info.hasmemset = true; + if (cur.offset == UINT32_MAX || + !isa(call->getArgOperand(2)) || + !isa(call->getArgOperand(1)) || + (cast(call->getArgOperand(2))->getLimitedValue() >= + UINT32_MAX - cur.offset)) + use_info.hasunknownmem = true; + return true; + } + if (id == Intrinsic::lifetime_start || id == Intrinsic::lifetime_end || + isa(II)) + return true; + use_info.addrescaped = true; return true; } if (pass.gc_preserve_begin == callee) { for (auto user: call->users()) - alloc_uses.insert(cast(user)); - preserves.insert(call); + use_info.uses.insert(cast(user)); + use_info.preserves.insert(call); + use_info.haspreserve = true; return true; } } - if (pass.ptr_from_objref == callee || pass.typeof_func == callee) + if (pass.ptr_from_objref == callee) { + use_info.addrescaped = true; + return true; + } + if (pass.typeof_func == callee || pass.write_barrier_func == callee) return true; auto opno = use->getOperandNo(); // Uses in `jl_roots` operand bundle are not counted as escaping, everything else is. - if (!isBundleOperand(call, opno)) + if (!isBundleOperand(call, opno) || + call->getOperandBundleForOperand(opno).getTagName() != "jl_roots") { + use_info.escaped = true; return false; - return call->getOperandBundleForOperand(opno).getTagName() == "jl_roots"; + } + use_info.haspreserve = true; + return true; } if (auto store = dyn_cast(inst)) { // Only store value count - if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) + if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) { + use_info.escaped = true; return false; - auto storev = store->getValueOperand(); - // There's GC root in this object. - if (auto ptrtype = dyn_cast(storev->getType())) { - if (ptrtype->getAddressSpace() == AddressSpace::Tracked) { - return false; - } } + auto storev = store->getValueOperand(); + if (cur.offset == UINT32_MAX || !use_info.addMemOp(inst, use->getOperandNo(), + cur.offset, storev->getType(), + true, *pass.DL)) + use_info.hasunknownmem = true; return true; } if (isa(inst) || isa(inst)) { @@ -383,18 +655,24 @@ bool Optimizer::checkInst(Instruction *I) return true; } if (auto gep = dyn_cast(inst)) { - APInt apoffset(sizeof(void*) * 8, cur.offset, true); - uint64_t next_offset; - if (!gep->accumulateConstantOffset(*pass.DL, apoffset) || apoffset.isNegative()) { - next_offset = UINT64_MAX; - } - else { - next_offset = apoffset.getLimitedValue(); + uint64_t next_offset = cur.offset; + if (cur.offset != UINT32_MAX) { + APInt apoffset(sizeof(void*) * 8, cur.offset, true); + if (!gep->accumulateConstantOffset(*pass.DL, apoffset) || apoffset.isNegative()) { + next_offset = UINT32_MAX; + } + else { + next_offset = apoffset.getLimitedValue(); + if (next_offset > UINT32_MAX) { + next_offset = UINT32_MAX; + } + } } push_inst(inst); - cur.offset = next_offset; + cur.offset = (uint32_t)next_offset; return true; } + use_info.escaped = true; return false; }; @@ -403,21 +681,23 @@ bool Optimizer::checkInst(Instruction *I) auto use = &*cur.use_it; auto inst = dyn_cast(use->getUser()); ++cur.use_it; - if (!inst) - return false; + if (!inst) { + use_info.escaped = true; + return; + } if (!check_inst(inst, use)) - return false; - alloc_uses.insert(inst); + return; + use_info.uses.insert(inst); if (cur.use_it == cur.use_end) { if (check_stack.empty()) - return true; + return; cur = check_stack.back(); check_stack.pop_back(); } } } -void Optimizer::insertLifetimeEnd(Instruction *ptr, Constant *sz, Instruction *insert) +void Optimizer::insertLifetimeEnd(Value *ptr, Constant *sz, Instruction *insert) { BasicBlock::iterator it(insert); BasicBlock::iterator begin(insert->getParent()->begin()); @@ -439,14 +719,14 @@ void Optimizer::insertLifetimeEnd(Instruction *ptr, Constant *sz, Instruction *i CallInst::Create(pass.lifetime_end, {sz, ptr}, "", insert); } -void Optimizer::insertLifetime(Instruction *ptr, Constant *sz, Instruction *orig) +void Optimizer::insertLifetime(Value *ptr, Constant *sz, Instruction *orig) { CallInst::Create(pass.lifetime_start, {sz, ptr}, "", orig); BasicBlock *def_bb = orig->getParent(); std::set bbs{def_bb}; auto &DT = getDomTree(); // Collect all BB where the allocation is live - for (auto use: alloc_uses) { + for (auto use: use_info.uses) { auto bb = use->getParent(); if (!bbs.insert(bb).second) continue; @@ -486,7 +766,7 @@ void Optimizer::insertLifetime(Instruction *ptr, Constant *sz, Instruction *orig // Record extra BBs that contain invisible uses. SmallSet extra_use; SmallVector*, 8> dominated; - for (auto preserve: preserves) { + for (auto preserve: use_info.preserves) { for (auto RN = DT.getNode(preserve->getParent()); RN; RN = dominated.empty() ? nullptr : dominated.pop_back_val()) { for (auto N: *RN) { @@ -539,7 +819,7 @@ void Optimizer::insertLifetime(Instruction *ptr, Constant *sz, Instruction *orig } else { for (auto it = bb->rbegin(), end = bb->rend(); it != end; ++it) { - if (alloc_uses.count(&*it)) { + if (use_info.uses.count(&*it)) { --it; first_dead.push_back(&*it); break; @@ -642,11 +922,46 @@ void Optimizer::replaceIntrinsicUseWith(IntrinsicInst *call, Intrinsic::ID ID, call->eraseFromParent(); } -// This function needs to handle all cases `checkInst` can handle. // This function should not erase any safepoint so that the lifetime marker can find and cache // all the original safepoints. -void Optimizer::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, Value *tag) +void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref) { + auto tag = orig_inst->getArgOperand(2); + removed.push_back(orig_inst); + // The allocation does not escape or get used in a phi node so none of the derived + // SSA from it are live when we run the allocation again. + // It is now safe to promote the allocation to an entry block alloca. + size_t align = 1; + // TODO make codegen handling of alignment consistent and pass that as a parameter + // to the allocation function directly. + if (sz > 1) + align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz)); + // No debug info for prolog instructions + IRBuilder<> prolog_builder(&F.getEntryBlock().front()); + AllocaInst *buff; + Instruction *ptr; + if (sz == 0) { + buff = prolog_builder.CreateAlloca(pass.T_int8, ConstantInt::get(pass.T_int64, 0)); + ptr = buff; + } + else if (has_ref) { + // Allocate with the correct type so that the GC frame lowering pass will + // treat this as a non-mem2reg'd alloca + // The ccall root and GC preserve handling below makes sure that + // the alloca isn't optimized out. + buff = prolog_builder.CreateAlloca(pass.T_prjlvalue); + buff->setAlignment(align); + ptr = cast(prolog_builder.CreateBitCast(buff, pass.T_pint8)); + } + else { + buff = prolog_builder.CreateAlloca(Type::getIntNTy(*pass.ctx, sz * 8)); + buff->setAlignment(align); + ptr = cast(prolog_builder.CreateBitCast(buff, pass.T_pint8)); + } + insertLifetime(ptr, ConstantInt::get(pass.T_int64, sz), orig_inst); + auto new_inst = cast(prolog_builder.CreateBitCast(ptr, pass.T_pjlvalue)); + new_inst->takeName(orig_inst); + auto simple_replace = [&] (Instruction *orig_i, Instruction *new_i) { if (orig_i->user_empty()) { if (orig_i != orig_inst) @@ -701,7 +1016,16 @@ void Optimizer::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, V } // Also remove the preserve intrinsics so that it can be better optimized. if (pass.gc_preserve_begin == callee) { - removeGCPreserve(call, orig_i); + if (has_ref) { + call->replaceUsesOfWith(orig_i, buff); + } + else { + removeGCPreserve(call, orig_i); + } + return; + } + if (pass.write_barrier_func == callee) { + call->eraseFromParent(); return; } if (auto intrinsic = dyn_cast(call)) { @@ -711,8 +1035,8 @@ void Optimizer::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, V } } // remove from operand bundle - Type *orig_t = orig_i->getType(); - user->replaceUsesOfWith(orig_i, ConstantPointerNull::get(cast(orig_t))); + Value *replace = has_ref ? (Value*)buff : Constant::getNullValue(orig_i->getType()); + user->replaceUsesOfWith(orig_i, replace); } else if (isa(user) || isa(user)) { auto cast_t = PointerType::get(cast(user->getType())->getElementType(), @@ -753,6 +1077,381 @@ void Optimizer::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, V } } +// This function should not erase any safepoint so that the lifetime marker can find and cache +// all the original safepoints. +void Optimizer::removeAlloc(CallInst *orig_inst) +{ + auto tag = orig_inst->getArgOperand(2); + removed.push_back(orig_inst); + auto simple_remove = [&] (Instruction *orig_i) { + if (orig_i->user_empty()) { + if (orig_i != orig_inst) + orig_i->eraseFromParent(); + return true; + } + return false; + }; + if (simple_remove(orig_inst)) + return; + assert(replace_stack.empty()); + ReplaceUses::Frame cur{orig_inst, nullptr}; + auto finish_cur = [&] () { + assert(cur.orig_i->user_empty()); + if (cur.orig_i != orig_inst) { + cur.orig_i->eraseFromParent(); + } + }; + auto push_frame = [&] (Instruction *orig_i) { + if (simple_remove(orig_i)) + return; + replace_stack.push_back(cur); + cur = {orig_i, nullptr}; + }; + auto remove_inst = [&] (Instruction *user) { + Instruction *orig_i = cur.orig_i; + if (auto store = dyn_cast(user)) { + // All stores are known to be dead. + // The stored value might be an gc pointer in which case deleting the object + // might open more optimization opportunities. + if (auto stored_inst = dyn_cast(store->getValueOperand())) + pushInstruction(stored_inst); + user->eraseFromParent(); + return; + } + else if (auto call = dyn_cast(user)) { + auto callee = call->getCalledValue(); + if (pass.gc_preserve_begin == callee) { + removeGCPreserve(call, orig_i); + return; + } + if (pass.typeof_func == callee) { + call->replaceAllUsesWith(tag); + call->eraseFromParent(); + return; + } + if (pass.write_barrier_func == callee) { + call->eraseFromParent(); + return; + } + if (auto II = dyn_cast(call)) { + auto id = II->getIntrinsicID(); + if (id == Intrinsic::memset || id == Intrinsic::lifetime_start || + id == Intrinsic::lifetime_end || isa(II)) { + call->eraseFromParent(); + } + } + // remove from operand bundle + user->replaceUsesOfWith(orig_i, Constant::getNullValue(orig_i->getType())); + } + else if (isa(user) || isa(user) || + isa(user)) { + push_frame(user); + } + else { + abort(); + } + }; + + while (true) { + remove_inst(cast(*cur.orig_i->user_begin())); + while (cur.orig_i->use_empty()) { + finish_cur(); + if (replace_stack.empty()) + return; + cur = replace_stack.back(); + replace_stack.pop_back(); + } + } +} + +void Optimizer::splitOnStack(CallInst *orig_inst) +{ + auto tag = orig_inst->getArgOperand(2); + removed.push_back(orig_inst); + IRBuilder<> prolog_builder(&F.getEntryBlock().front()); + struct SplitSlot { + AllocaInst *slot; + bool isref; + uint32_t offset; + uint32_t size; + }; + SmallVector slots; + for (auto memop: use_info.memops) { + auto offset = memop.first; + auto &field = memop.second; + // If the field has no reader and is not a object reference field that we + // need to preserve at some point, there's no need to allocate the field. + if (!field.hasload && (!field.hasobjref || !use_info.haspreserve)) + continue; + SplitSlot slot{nullptr, field.hasobjref, offset, field.size}; + Type *allocty; + if (field.hasobjref) { + allocty = pass.T_prjlvalue; + } + else if (field.elty && !field.multiloc) { + allocty = field.elty; + } + else { + allocty = Type::getIntNTy(*pass.ctx, field.size * 8); + } + slot.slot = prolog_builder.CreateAlloca(allocty); + insertLifetime(prolog_builder.CreateBitCast(slot.slot, pass.T_pint8), + ConstantInt::get(pass.T_int64, field.size), orig_inst); + slots.push_back(std::move(slot)); + } + const auto nslots = slots.size(); + auto find_slot = [&] (uint32_t offset) { + if (offset == 0) + return 0u; + unsigned lb = 0; + unsigned ub = slots.size(); + while (lb + 1 < ub) { + unsigned mid = (lb + ub) / 2; + if (slots[mid].offset <= offset) { + lb = mid; + } + else { + ub = mid; + } + } + return lb; + }; + auto simple_replace = [&] (Instruction *orig_i) { + if (orig_i->user_empty()) { + if (orig_i != orig_inst) + orig_i->eraseFromParent(); + return true; + } + return false; + }; + if (simple_replace(orig_inst)) + return; + assert(replace_stack.empty()); + ReplaceUses::Frame cur{orig_inst, uint32_t(0)}; + auto finish_cur = [&] () { + assert(cur.orig_i->user_empty()); + if (cur.orig_i != orig_inst) { + cur.orig_i->eraseFromParent(); + } + }; + auto push_frame = [&] (Instruction *orig_i, uint32_t offset) { + if (simple_replace(orig_i)) + return; + replace_stack.push_back(cur); + cur = {orig_i, offset}; + }; + auto slot_gep = [&] (SplitSlot &slot, uint32_t offset, Type *elty, IRBuilder<> &builder) { + assert(slot.offset <= offset); + offset -= slot.offset; + auto size = pass.DL->getTypeAllocSize(elty); + Value *addr; + if (offset % size == 0) { + addr = builder.CreateBitCast(slot.slot, elty->getPointerTo()); + if (offset != 0) { + addr = builder.CreateConstInBoundsGEP1_32(elty, addr, offset / size); + } + } + else { + addr = builder.CreateBitCast(slot.slot, pass.T_pint8); + addr = builder.CreateConstInBoundsGEP1_32(pass.T_int8, addr, offset); + addr = builder.CreateBitCast(addr, elty->getPointerTo()); + } + return addr; + }; + auto replace_inst = [&] (Use *use) { + Instruction *user = cast(use->getUser()); + Instruction *orig_i = cur.orig_i; + uint32_t offset = cur.offset; + if (auto load = dyn_cast(user)) { + auto slot_idx = find_slot(offset); + auto &slot = slots[slot_idx]; + assert(slot.offset <= offset && slot.offset + slot.size >= offset); + IRBuilder<> builder(load); + Value *val; + auto load_ty = load->getType(); + if (slot.isref) { + assert(slot.offset == offset); + val = builder.CreateLoad(pass.T_prjlvalue, slot.slot); + // Assume the addrspace is correct. + val = builder.CreateBitCast(val, load_ty); + } + else { + val = builder.CreateLoad(load_ty, slot_gep(slot, offset, load_ty, builder)); + } + load->replaceAllUsesWith(val); + load->eraseFromParent(); + return; + } + else if (auto store = dyn_cast(user)) { + if (auto stored_inst = dyn_cast(store->getValueOperand())) + pushInstruction(stored_inst); + auto slot_idx = find_slot(offset); + auto &slot = slots[slot_idx]; + if (slot.offset > offset || slot.offset + slot.size <= offset) { + store->eraseFromParent(); + return; + } + IRBuilder<> builder(store); + auto store_val = store->getValueOperand(); + auto store_ty = store_val->getType(); + if (slot.isref) { + assert(slot.offset == offset); + if (!isa(store_ty)) { + store_val = builder.CreateBitCast(store_val, pass.T_size); + store_val = builder.CreateIntToPtr(store_val, pass.T_pjlvalue); + store_ty = pass.T_pjlvalue; + } + else { + store_ty = cast(pass.T_pjlvalue)->getElementType() + ->getPointerTo(cast(store_ty)->getAddressSpace()); + store_val = builder.CreateBitCast(store_val, store_ty); + } + if (cast(store_ty)->getAddressSpace() != AddressSpace::Tracked) + store_val = builder.CreateAddrSpaceCast(store_val, pass.T_prjlvalue); + builder.CreateStore(store_val, slot.slot); + } + else { + builder.CreateStore(store_val, slot_gep(slot, offset, store_ty, builder)); + } + store->eraseFromParent(); + return; + } + else if (auto call = dyn_cast(user)) { + auto callee = call->getCalledValue(); + if (auto intrinsic = dyn_cast(call)) { + if (Intrinsic::ID id = intrinsic->getIntrinsicID()) { + if (id == Intrinsic::memset) { + IRBuilder<> builder(call); + auto val_arg = cast(call->getArgOperand(1)); + auto size_arg = cast(call->getArgOperand(2)); + uint8_t val = val_arg->getLimitedValue(); + uint32_t size = size_arg->getLimitedValue(); + for (unsigned idx = find_slot(offset); idx < nslots; idx++) { + auto &slot = slots[idx]; + if (slot.offset + slot.size <= offset || slot.offset >= offset + size) + break; + if (slot.isref) { + assert(slot.offset >= offset && + slot.offset + slot.size <= offset + size); + Constant *ptr; + if (val == 0) { + ptr = Constant::getNullValue(pass.T_prjlvalue); + } + else { + uint64_t intval; + memset(&intval, val, 8); + Constant *val = ConstantInt::get(pass.T_size, intval); + val = ConstantExpr::getIntToPtr(val, pass.T_pjlvalue); + ptr = ConstantExpr::getAddrSpaceCast(val, pass.T_prjlvalue); + } + builder.CreateStore(ptr, slot.slot); + continue; + } + auto ptr8 = builder.CreateBitCast(slot.slot, pass.T_pint8); + if (offset > slot.offset) + ptr8 = builder.CreateConstInBoundsGEP1_32(pass.T_int8, ptr8, + offset - slot.offset); + auto sub_size = std::min(slot.offset + slot.size, offset + size) - + std::max(offset, slot.offset); + builder.CreateMemSet(ptr8, val_arg, sub_size, 0); + } + call->eraseFromParent(); + return; + } + call->eraseFromParent(); + return; + } + } + if (pass.typeof_func == callee) { + call->replaceAllUsesWith(tag); + call->eraseFromParent(); + return; + } + if (pass.write_barrier_func == callee) { + call->eraseFromParent(); + return; + } + if (pass.gc_preserve_begin == callee) { + SmallVector operands; + for (auto &arg: call->arg_operands()) { + if (arg.get() == orig_i || isa(arg.get())) + continue; + operands.push_back(arg.get()); + } + IRBuilder<> builder(call); + for (auto &slot: slots) { + if (!slot.isref) + continue; + operands.push_back(builder.CreateLoad(pass.T_prjlvalue, slot.slot)); + } + auto new_call = builder.CreateCall(pass.gc_preserve_begin, operands); + new_call->takeName(call); + new_call->setAttributes(call->getAttributes()); + call->replaceAllUsesWith(new_call); + call->eraseFromParent(); + return; + } + // remove from operand bundle + assert(isBundleOperand(call, use->getOperandNo())); + assert(call->getOperandBundleForOperand(use->getOperandNo()).getTagName() == + "jl_roots"); + SmallVector bundles; + call->getOperandBundlesAsDefs(bundles); + for (auto &bundle: bundles) { + if (bundle.getTag() != "jl_roots") + continue; + std::vector operands; + for (auto op: bundle.inputs()) { + if (op == orig_i || isa(op)) + continue; + operands.push_back(op); + } + IRBuilder<> builder(call); + for (auto &slot: slots) { + if (!slot.isref) + continue; + operands.push_back(builder.CreateLoad(pass.T_prjlvalue, slot.slot)); + } + bundle = OperandBundleDef("jl_roots", std::move(operands)); + break; + } + auto new_call = CallInst::Create(call, bundles, call); + new_call->takeName(call); + call->replaceAllUsesWith(new_call); + call->eraseFromParent(); + return; + } + else if (isa(user) || isa(user)) { + push_frame(user, offset); + } + else if (auto gep = dyn_cast(user)) { + APInt apoffset(sizeof(void*) * 8, offset, true); + gep->accumulateConstantOffset(*pass.DL, apoffset); + push_frame(gep, apoffset.getLimitedValue()); + } + else { + abort(); + } + }; + + while (true) { + replace_inst(&*cur.orig_i->use_begin()); + while (cur.orig_i->use_empty()) { + finish_cur(); + if (replace_stack.empty()) + goto cleanup; + cur = replace_stack.back(); + replace_stack.pop_back(); + } + } +cleanup: + for (auto &slot: slots) { + if (!slot.isref) + continue; + PromoteMemToReg({slot.slot}, getDomTree()); + } +} + bool AllocOpt::doInitialization(Module &M) { ctx = &M.getContext(); @@ -765,6 +1464,7 @@ bool AllocOpt::doInitialization(Module &M) ptr_from_objref = M.getFunction("julia.pointer_from_objref"); gc_preserve_begin = M.getFunction("llvm.julia.gc_preserve_begin"); typeof_func = M.getFunction("julia.typeof"); + write_barrier_func = M.getFunction("julia.write_barrier"); T_prjlvalue = alloc_obj->getReturnType(); T_pjlvalue = PointerType::get(cast(T_prjlvalue)->getElementType(), 0); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index bbf8426c96dc7..d7b0152ae0c24 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -769,7 +769,7 @@ void RecursivelyVisit(callback f, Value *V) { f(VU); if (isa(TheUser) || isa(TheUser) || isa(TheUser) || isa(TheUser) || - isa(TheUser)) + isa(TheUser) || isa(TheUser)) continue; if (isa(TheUser) || isa(TheUser) || isa(TheUser)) { RecursivelyVisit(f, TheUser);