diff --git a/base/compiler/bootstrap.jl b/base/compiler/bootstrap.jl index 1989d8aa573931..487ddf2ccdd1ba 100644 --- a/base/compiler/bootstrap.jl +++ b/base/compiler/bootstrap.jl @@ -11,7 +11,11 @@ let world = get_world_counter() interp = NativeInterpreter(world) - analyze_escapes_tt = Tuple{typeof(analyze_escapes), IRCode, Int, Bool, typeof(get_escape_cache(code_cache(interp)))} + analyze_escapes_tt = Any[typeof(analyze_escapes), IRCode, Int, Bool, + # typeof(get_escape_cache(code_cache(interp))) # once we enable IPO EA + typeof(null_escape_cache) + ] + analyze_escapes_tt = Tuple{analyze_escapes_tt...} fs = Any[ # we first create caches for the optimizer, because they contain many loop constructions # and they're better to not run in interpreter even during bootstrapping diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl index 635e53a9e1f1da..e84f77ae1ea48d 100644 --- a/base/compiler/optimize.jl +++ b/base/compiler/optimize.jl @@ -98,7 +98,7 @@ and then caches it into a global cache for later interprocedural propagation. cache_escapes!(caller::InferenceResult, estate::EscapeState) = caller.argescapes = ArgEscapeCache(estate) -function get_escape_cache(mi_cache::MICache) where MICache +function ipo_escape_cache(mi_cache::MICache) where MICache return function (linfo::Union{InferenceResult,MethodInstance}) if isa(linfo, InferenceResult) argescapes = linfo.argescapes @@ -110,6 +110,7 @@ function get_escape_cache(mi_cache::MICache) where MICache return argescapes !== nothing ? argescapes::ArgEscapeCache : nothing end end +null_escape_cache(linfo::Union{InferenceResult,MethodInstance}) = nothing mutable struct OptimizationState linfo::MethodInstance @@ -540,17 +541,24 @@ function run_passes(ci::CodeInfo, sv::OptimizationState, caller::InferenceResult # TODO: Domsorting can produce an updated domtree - no need to recompute here @timeit "compact 1" ir = compact!(ir) nargs = let def = sv.linfo.def; isa(def, Method) ? Int(def.nargs) : 0; end - get_escape_cache = (@__MODULE__).get_escape_cache(sv.inlining.mi_cache) - if is_ipo_profitable(ir, nargs) - @timeit "IPO EA" begin - state = analyze_escapes(ir, nargs, false, get_escape_cache) - cache_escapes!(caller, state) - end - end + # if is_ipo_profitable(ir, nargs) + # @timeit "IPO EA" begin + # state = analyze_escapes(ir, + # nargs, #=call_resolved=#false, ipo_escape_cache(sv.inlining.mi_cache)) + # cache_escapes!(caller, state) + # end + # end @timeit "Inlining" ir = ssa_inlining_pass!(ir, ir.linetable, sv.inlining, ci.propagate_inbounds) # @timeit "verify 2" verify_ir(ir) @timeit "compact 2" ir = compact!(ir) - @timeit "SROA" ir = sroa_pass!(ir) + @timeit "SROA" ir, memory_opt = linear_pass!(ir) + if memory_opt + @timeit "memory_opt_pass!" begin + @timeit "Local EA" estate = analyze_escapes(ir, + nargs, #=call_resolved=#true, null_escape_cache) + @timeit "memory_opt_pass!" 
ir = memory_opt_pass!(ir, estate) + end + end @timeit "ADCE" ir = adce_pass!(ir) @timeit "type lift" ir = type_lift_pass!(ir) @timeit "compact 3" ir = compact!(ir) diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl index 67610f0c1df60e..e44e6d3a00f77d 100644 --- a/base/compiler/ssair/passes.jl +++ b/base/compiler/ssair/passes.jl @@ -6,29 +6,6 @@ function is_known_call(@nospecialize(x), @nospecialize(func), ir::Union{IRCode,I return singleton_type(ft) === func end -""" - du::SSADefUse - -This struct keeps track of all uses of some mutable struct allocated in the current function: -- `du.uses::Vector{Int}` are all instances of `getfield` on the struct -- `du.defs::Vector{Int}` are all instances of `setfield!` on the struct -The terminology refers to the uses/defs of the "slot bundle" that the mutable struct represents. - -In addition we keep track of all instances of a `:foreigncall` that preserves of this mutable -struct in `du.ccall_preserve_uses`. Somewhat counterintuitively, we don't actually need to -make sure that the struct itself is live (or even allocated) at a `ccall` site. -If there are no other places where the struct escapes (and thus e.g. where its address is taken), -it need not be allocated. We do however, need to make sure to preserve any elements of this struct. -""" -struct SSADefUse - uses::Vector{Int} - defs::Vector{Int} - ccall_preserve_uses::Vector{Int} -end -SSADefUse() = SSADefUse(Int[], Int[], Int[]) - -compute_live_ins(cfg::CFG, du::SSADefUse) = compute_live_ins(cfg, du.defs, du.uses) - # assume `stmt == getfield(obj, field, ...)` or `stmt == setfield!(obj, field, val, ...)` try_compute_field_stmt(ir::Union{IncrementalCompact,IRCode}, stmt::Expr) = try_compute_field(ir, stmt.args[3]) @@ -55,112 +32,6 @@ function try_compute_fieldidx_stmt(ir::Union{IncrementalCompact,IRCode}, stmt::E return try_compute_fieldidx(typ, field) end -function find_curblock(domtree::DomTree, allblocks::Vector{Int}, curblock::Int) - # TODO: This can be much faster by looking at current level and only - # searching for those blocks in a sorted order - while !(curblock in allblocks) - curblock = domtree.idoms_bb[curblock] - end - return curblock -end - -function val_for_def_expr(ir::IRCode, def::Int, fidx::Int) - ex = ir[SSAValue(def)][:inst] - if isexpr(ex, :new) - return ex.args[1+fidx] - else - @assert isa(ex, Expr) - # The use is whatever the setfield was - return ex.args[4] - end -end - -function compute_value_for_block(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, curblock::Int) - curblock = find_curblock(domtree, allblocks, curblock) - def = 0 - for stmt in du.defs - if block_for_inst(ir.cfg, stmt) == curblock - def = max(def, stmt) - end - end - def == 0 ? phinodes[curblock] : val_for_def_expr(ir, def, fidx) -end - -function compute_value_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, use::Int) - def, useblock, curblock = find_def_for_use(ir, domtree, allblocks, du, use) - if def == 0 - if !haskey(phinodes, curblock) - # If this happens, we need to search the predecessors for defs. 
Which - # one doesn't matter - if it did, we'd have had a phinode - return compute_value_for_block(ir, domtree, allblocks, du, phinodes, fidx, first(ir.cfg.blocks[useblock].preds)) - end - # The use is the phinode - return phinodes[curblock] - else - return val_for_def_expr(ir, def, fidx) - end -end - -# even when the allocation contains an uninitialized field, we try an extra effort to check -# if this load at `idx` have any "safe" `setfield!` calls that define the field -function has_safe_def( - ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, - newidx::Int, idx::Int) - def, _, _ = find_def_for_use(ir, domtree, allblocks, du, idx) - # will throw since we already checked this `:new` site doesn't define this field - def == newidx && return false - # found a "safe" definition - def ≠ 0 && return true - # we may still be able to replace this load with `PhiNode` - # examine if all predecessors of `block` have any "safe" definition - block = block_for_inst(ir, idx) - seen = BitSet(block) - worklist = BitSet(ir.cfg.blocks[block].preds) - isempty(worklist) && return false - while !isempty(worklist) - pred = pop!(worklist) - # if this block has already been examined, bail out to avoid infinite cycles - pred in seen && return false - idx = last(ir.cfg.blocks[pred].stmts) - # NOTE `idx` isn't a load, thus we can use inclusive coondition within the `find_def_for_use` - def, _, _ = find_def_for_use(ir, domtree, allblocks, du, idx, true) - # will throw since we already checked this `:new` site doesn't define this field - def == newidx && return false - push!(seen, pred) - # found a "safe" definition for this predecessor - def ≠ 0 && continue - # check for the predecessors of this predecessor - for newpred in ir.cfg.blocks[pred].preds - push!(worklist, newpred) - end - end - return true -end - -# find the first dominating def for the given use -function find_def_for_use( - ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, use::Int, inclusive::Bool=false) - useblock = block_for_inst(ir.cfg, use) - curblock = find_curblock(domtree, allblocks, useblock) - local def = 0 - for idx in du.defs - if block_for_inst(ir.cfg, idx) == curblock - if curblock != useblock - # Find the last def in this block - def = max(def, idx) - else - # Find the last def before our use - if inclusive - def = max(def, idx ≤ use ? idx : 0) - else - def = max(def, idx < use ? idx : 0) - end - end - end - end - return def, useblock, curblock -end - function collect_leaves(compact::IncrementalCompact, @nospecialize(val), @nospecialize(typeconstraint)) if isa(val, Union{OldSSAValue, SSAValue}) val, typeconstraint = simple_walk_constraint(compact, val, typeconstraint) @@ -657,38 +528,35 @@ end const SPCSet = IdSet{Int} """ - sroa_pass!(ir::IRCode) -> newir::IRCode - -`getfield` elimination pass, a.k.a. Scalar Replacements of Aggregates optimization. - -This pass is based on a local field analysis by def-use chain walking. -It looks for struct allocation sites ("definitions"), and `getfield` calls as well as -`:foreigncall`s that preserve the structs ("usages"). If "definitions" have enough information, -then this pass will replace corresponding usages with forwarded values. -`mutable struct`s require additional cares and need to be handled separately from immutables. -For `mutable struct`s, `setfield!` calls account for "definitions" also, and the pass should -give up the lifting conservatively when there are any "intermediate usages" that may escape -the mutable struct (e.g. 
non-inlined generic function call that takes the mutable struct as -its argument). - -In a case when all usages are fully eliminated, `struct` allocation may also be erased as -a result of succeeding dead code elimination. + linear_pass!(ir::IRCode) -> (newir::IRCode, memory_opt::Bool) + +This pass consists of the following optimizations that can be performed by +a single linear traversal over IR statements: +- load forwarding of immutables (`getfield` elimination): immutable allocations whose + loads are all eliminated by this pass may be erased entirely as a result of succeeding + dead code elimination (this allocation elimination is called "SROA", Scalar Replacements of Aggregates) +- lifting of builtin comparisons: see [`lift_comparison!`](@ref) +- canonicalization of `typeassert` calls: see [`canonicalize_typeassert!`](@ref) + +In addition to performing the optimizations above, the linear traversal also examines each +statement and checks whether it is profitable to run the [`memory_opt_pass!`](@ref) pass. +In such cases `memory_opt` is set to `true`, indicating that `ir` may be further optimized +by running `memory_opt_pass!(ir, estate::EscapeState)`. """ -function sroa_pass!(ir::IRCode) +function linear_pass!(ir::IRCode) compact = IncrementalCompact(ir) - defuses = nothing # will be initialized once we encounter mutability in order to reduce dynamic allocations lifting_cache = IdDict{Pair{AnySSAValue, Any}, AnySSAValue}() + local memory_opt = false # whether or not to run the memory_opt_pass! pass later for ((_, idx), stmt) in compact - # check whether this statement is `getfield` / `setfield!` (or other "interesting" statement) isa(stmt, Expr) || continue - is_setfield = false field_ordering = :unspecified - if is_known_call(stmt, setfield!, compact) - 4 <= length(stmt.args) <= 5 || continue - is_setfield = true - if length(stmt.args) == 5 - field_ordering = argextype(stmt.args[5], compact) + if isexpr(stmt, :new) + typ = unwrap_unionall(widenconst(argextype(SSAValue(idx), compact))) + if ismutabletype(typ) + # mutable SROA may eliminate this allocation, so mark it now + memory_opt = true end + continue elseif is_known_call(stmt, getfield, compact) 3 <= length(stmt.args) <= 5 || continue if length(stmt.args) == 5 @@ -704,40 +572,21 @@ function sroa_pass!(ir::IRCode) for pidx in (6+nccallargs):length(stmt.args) preserved_arg = stmt.args[pidx] isa(preserved_arg, SSAValue) || continue - let intermediaries = SPCSet() - callback = function (@nospecialize(pi), @nospecialize(ssa)) - push!(intermediaries, ssa.id) - return false - end - def = simple_walk(compact, preserved_arg, callback) - isa(def, SSAValue) || continue - defidx = def.id - def = compact[defidx] - if is_known_call(def, tuple, compact) + def = simple_walk(compact, preserved_arg) + isa(def, SSAValue) || continue + defidx = def.id + def = compact[defidx] + if is_known_call(def, tuple, compact) + record_immutable_preserve!(new_preserves, def, compact) + push!(preserved, preserved_arg.id) + elseif isexpr(def, :new) + typ = unwrap_unionall(widenconst(argextype(SSAValue(defidx), compact))) + if typ isa DataType + ismutabletype(typ) && continue # mutable SROA is performed later record_immutable_preserve!(new_preserves, def, compact) push!(preserved, preserved_arg.id) - continue - elseif isexpr(def, :new) - typ = widenconst(argextype(SSAValue(defidx), compact)) - if isa(typ, UnionAll) - typ = unwrap_unionall(typ) - end - if typ isa DataType && !ismutabletype(typ) - record_immutable_preserve!(new_preserves, def, compact)
- push!(preserved, preserved_arg.id) - continue - end - else - continue - end - if defuses === nothing - defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() end - mid, defuse = get!(defuses, defidx, (SPCSet(), SSADefUse())) - push!(defuse.ccall_preserve_uses, idx) - union!(mid, intermediaries) end - continue end if !isempty(new_preserves) compact[idx] = form_new_preserves(stmt, preserved, new_preserves) @@ -756,7 +605,7 @@ function sroa_pass!(ir::IRCode) continue end - # analyze this `getfield` / `setfield!` call + # analyze this `getfield` call field = try_compute_field_stmt(compact, stmt) field === nothing && continue @@ -774,32 +623,7 @@ function sroa_pass!(ir::IRCode) continue end - # analyze this mutable struct here for the later pass - if ismutabletype(struct_typ) - isa(val, SSAValue) || continue - let intermediaries = SPCSet() - callback = function (@nospecialize(pi), @nospecialize(ssa)) - push!(intermediaries, ssa.id) - return false - end - def = simple_walk(compact, val, callback) - # Mutable stuff here - isa(def, SSAValue) || continue - if defuses === nothing - defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() - end - mid, defuse = get!(defuses, def.id, (SPCSet(), SSADefUse())) - if is_setfield - push!(defuse.defs, idx) - else - push!(defuse.uses, idx) - end - union!(mid, intermediaries) - end - continue - elseif is_setfield - continue # invalid `setfield!` call, but just ignore here - end + ismutabletype(struct_typ) && continue # mutable SROA is performed later # perform SROA on immutable structs here on @@ -837,177 +661,462 @@ function sroa_pass!(ir::IRCode) end non_dce_finish!(compact) - if defuses !== nothing - # now go through analyzed mutable structs and see which ones we can eliminate - # NOTE copy the use count here, because `simple_dce!` may modify it and we need it - # consistent with the state of the IR here (after tracking `PhiNode` arguments, - # but before the DCE) for our predicate within `sroa_mutables!`, but we also - # try an extra effort using a callback so that reference counts are updated - used_ssas = copy(compact.used_ssas) - simple_dce!(compact, (x::SSAValue) -> used_ssas[x.id] -= 1) - ir = complete(compact) - sroa_mutables!(ir, defuses, used_ssas) - return ir - else - simple_dce!(compact) - return complete(compact) - end + simple_dce!(compact) + ir = complete(compact) + return ir, memory_opt end -function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}) - # initialization of domtree is delayed to avoid the expensive computation in many cases - local domtree = nothing - for (idx, (intermediaries, defuse)) in defuses - intermediaries = collect(intermediaries) - # Check if there are any uses we did not account for. If so, the variable - # escapes and we cannot eliminate the allocation. This works, because we're guaranteed - # not to include any intermediaries that have dead uses. As a result, missing uses will only ever - # show up in the nuses_total count. 
- nleaves = length(defuse.uses) + length(defuse.defs) + length(defuse.ccall_preserve_uses) - nuses = 0 - for idx in intermediaries - nuses += used_ssas[idx] +function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any}) + newex = Expr(:foreigncall) + nccallargs = length(origex.args[3]::SimpleVector) + for i in 1:(6+nccallargs-1) + push!(newex.args, origex.args[i]) + end + for i in (6+nccallargs):length(origex.args) + x = origex.args[i] + # don't need to preserve intermediaries + if isa(x, SSAValue) && x.id in intermediates + continue end - nuses_total = used_ssas[idx] + nuses - length(intermediaries) - nleaves == nuses_total || continue - # Find the type for this allocation - defexpr = ir[SSAValue(idx)][:inst] - isexpr(defexpr, :new) || continue - newidx = idx - typ = ir.stmts[newidx][:type] - if isa(typ, UnionAll) - typ = unwrap_unionall(typ) + push!(newex.args, x) + end + for i in 1:length(new_preserves) + push!(newex.args, new_preserves[i]) + end + return newex +end + +import .EscapeAnalysis: + EscapeState, EscapeInfo, IndexableFields, LivenessSet, getaliases, LocalUse, LocalDef + +""" + memory_opt_pass!(ir::IRCode, estate::EscapeState) -> newir::IRCode + +Performs memory optimizations using escape information analyzed by `EscapeAnalysis`. +Specifically, this optimization pass does SROA of mutable allocations. + +`estate::EscapeState` is expected to be a result of `analyze_escapes(ir, ...)`. +Since the computational cost of running `analyze_escapes` can be relatively expensive, +it is recommended to run this pass "selectively" i.e. only when there seems to be +a profitability for the memory optimizations. +""" +function memory_opt_pass!(ir::IRCode, estate::EscapeState) + # Compute domtree now, needed below, now that we have finished compacting the IR. + # This needs to be after we iterate through the IR with `IncrementalCompact` + # because removing dead blocks can invalidate the domtree. + # TODO initialization of the domtree can be delayed to avoid the expensive computation + # in cases when there are no loads to be forwarded + @timeit "domtree 2" domtree = construct_domtree(ir.cfg.blocks) + wset = BitSet(1:length(ir.stmts)+length(ir.new_nodes.stmts)) + eliminated = BitSet() + revisit = Tuple{#=related=#Vector{SSAValue}, #=Liveness=#LivenessSet}[] + all_preserved = true + newpreserves = nothing + while !isempty(wset) + idx = pop!(wset) + ssa = SSAValue(idx) + stmt = ir[ssa][:inst] + # NOTE `linear_pass!` can't eliminate immutables wrapped by mutables, + # but the EA-based alias analysis may be able to eliminate them also + isexpr(stmt, :new) || is_known_call(stmt, tuple, ir) || continue + einfo = estate[ssa] + is_load_forwardable(einfo) || continue + aliases = getaliases(ssa, estate) + if aliases === nothing + related = SSAValue[ssa] + else + related = SSAValue[] + for alias in aliases + @assert isa(alias, SSAValue) "invalid escape analysis" + push!(related, alias) + delete!(wset, alias.id) + end end - # Could still end up here if we tried to setfield! on an immutable, which would - # error at runtime, but is not illegal to have in the IR. 
- ismutabletype(typ) || continue - typ = typ::DataType + finfos = (einfo.AliasInfo::IndexableFields).infos + nflds = length(finfos) + # Partition defuses by field - fielddefuse = SSADefUse[SSADefUse() for _ = 1:fieldcount(typ)] - all_forwarded = true - for use in defuse.uses - stmt = ir[SSAValue(use)][:inst] # == `getfield` call - # We may have discovered above that this use is dead - # after the getfield elim of immutables. In that case, - # it would have been deleted. That's fine, just ignore - # the use in that case. - if stmt === nothing - all_forwarded = false - continue + fdefuses = Vector{FieldDefUse}(undef, nflds) + for i = 1:nflds + finfo = finfos[i] + fdu = FieldDefUse() + for fx in finfo + if isa(fx, LocalUse) + push!(fdu.uses, GetfieldLoad(fx.idx)) # use (getfield call) + else + @assert isa(fx, LocalDef) + push!(fdu.defs, fx.idx) # def (setfield! call or :new expression) + end end - field = try_compute_fieldidx_stmt(ir, stmt::Expr, typ) - field === nothing && @goto skip - push!(fielddefuse[field].uses, use) + fdefuses[i] = fdu end - for def in defuse.defs - stmt = ir[SSAValue(def)][:inst]::Expr # == `setfield!` call - field = try_compute_fieldidx_stmt(ir, stmt, typ) - field === nothing && @goto skip - isconst(typ, field) && @goto skip # we discovered an attempt to mutate a const field, which must error - push!(fielddefuse[field].defs, def) + + Liveness = einfo.Liveness + for livepc in Liveness + livestmt = ir[SSAValue(livepc)][:inst] + if is_known_call(livestmt, Core.ifelse, ir) + # the succeeding domination analysis doesn't account for conditional branching + # by ifelse branching at this moment + @goto next_itr + elseif is_known_call(livestmt, isdefined, ir) + args = livestmt.args + length(args) ≥ 3 || continue + obj = args[2] + isa(obj, SSAValue) || continue + obj in related || continue + fld = args[3] + fldval = try_compute_field(ir, fld) + fldval === nothing && continue + typ = unwrap_unionall(widenconst(argextype(obj, ir))) + isa(typ, DataType) || continue + fldidx = try_compute_fieldidx(typ, fldval) + fldidx === nothing && continue + push!(fdefuses[fldidx].uses, IsdefinedUse(livepc)) + elseif isexpr(livestmt, :foreigncall) + # we shouldn't eliminate this use if it's used as a direct argument + args = livestmt.args + nccallargs = length(args[3]::SimpleVector) + for i = 6:(5+nccallargs) + arg = args[i] + isa(arg, SSAValue) && arg in related && @goto next_liveness + end + # this use is preserve, and may be eliminable + for fidx in 1:nflds + push!(fdefuses[fidx].uses, PreserveUse(livepc)) + end + end + @label next_liveness end - # Check that the defexpr has defined values for all the fields - # we're accessing. In the future, we may want to relax this, - # but we should come up with semantics for well defined semantics - # for uninitialized fields first. 
- ndefuse = length(fielddefuse) - blocks = Vector{Tuple{#=phiblocks=# Vector{Int}, #=allblocks=# Vector{Int}}}(undef, ndefuse) - for fidx in 1:ndefuse - du = fielddefuse[fidx] - isempty(du.uses) && continue - push!(du.defs, newidx) - ldu = compute_live_ins(ir.cfg, du) + + for fidx in 1:nflds + fdu = fdefuses[fidx] + isempty(fdu.uses) && @goto next_use + # check if all uses have safe definitions first, otherwise we should bail out + # since then we may fail to form new ϕ-nodes + ldu = compute_live_ins(ir.cfg, fdu) if isempty(ldu.live_in_bbs) phiblocks = Int[] else - domtree === nothing && (@timeit "domtree 2" domtree = construct_domtree(ir.cfg.blocks)) phiblocks = iterated_dominance_frontier(ir.cfg, ldu, domtree) end - allblocks = sort(vcat(phiblocks, ldu.def_bbs)) - blocks[fidx] = phiblocks, allblocks - if fidx + 1 > length(defexpr.args) - for use in du.uses - domtree === nothing && (@timeit "domtree 2" domtree = construct_domtree(ir.cfg.blocks)) - has_safe_def(ir, domtree, allblocks, du, newidx, use) || @goto skip - end - end - end - # Everything accounted for. Go field by field and perform idf: - # Compute domtree now, needed below, now that we have finished compacting the IR. - # This needs to be after we iterate through the IR with `IncrementalCompact` - # because removing dead blocks can invalidate the domtree. - domtree === nothing && (@timeit "domtree 2" domtree = construct_domtree(ir.cfg.blocks)) - preserve_uses = isempty(defuse.ccall_preserve_uses) ? nothing : - IdDict{Int, Vector{Any}}((idx=>Any[] for idx in SPCSet(defuse.ccall_preserve_uses))) - for fidx in 1:ndefuse - du = fielddefuse[fidx] - ftyp = fieldtype(typ, fidx) - if !isempty(du.uses) - phiblocks, allblocks = blocks[fidx] - phinodes = IdDict{Int, SSAValue}() - for b in phiblocks - phinodes[b] = insert_node!(ir, first(ir.cfg.blocks[b].stmts), - NewInstruction(PhiNode(), ftyp)) + allblocks = sort!(vcat(phiblocks, ldu.def_bbs)) + for use in fdu.uses + isa(use, IsdefinedUse) && continue + if isa(use, PreserveUse) && isempty(fdu.defs) + # nothing to preserve, just ignore this use (may happen when there are unintialized fields) + continue end - # Now go through all uses and rewrite them - for stmt in du.uses - ir[SSAValue(stmt)][:inst] = compute_value_for_use(ir, domtree, allblocks, du, phinodes, fidx, stmt) + if !has_safe_def(ir, domtree, allblocks, fdu, getuseidx(use)) + all_preserved = false + @goto next_use end - if !isbitstype(ftyp) - if preserve_uses !== nothing - for (use, list) in preserve_uses - push!(list, compute_value_for_use(ir, domtree, allblocks, du, phinodes, fidx, use)) - end + end + phinodes = IdDict{Int, SSAValue}() + for b in phiblocks + phinodes[b] = insert_node!(ir, first(ir.cfg.blocks[b].stmts), + NewInstruction(PhiNode(), Any)) + end + # Now go through all uses and rewrite them + for use in fdu.uses + if isa(use, GetfieldLoad) + use = getuseidx(use) + ir[SSAValue(use)][:inst] = compute_value_for_use( + ir, domtree, allblocks, fdu, phinodes, fidx, use) + push!(eliminated, use) + elseif isa(use, PreserveUse) + all_preserved || continue + if newpreserves === nothing + newpreserves = IdDict{Int,Vector{Any}}() end - end - for b in phiblocks - n = ir[phinodes[b]][:inst]::PhiNode - for p in ir.cfg.blocks[b].preds - push!(n.edges, p) - push!(n.values, compute_value_for_block(ir, domtree, - allblocks, du, phinodes, fidx, p)) + # record this `use` as replaceable no matter if we preserve new value or not + use = getuseidx(use) + newvalues = get!(()->Any[], newpreserves, use) + isempty(fdu.defs) && continue # 
nothing to preserve (may happen when there are unintialized fields) + newval = compute_value_for_use( + ir, domtree, allblocks, fdu, phinodes, fidx, use) + if !isbitstype(widenconst(argextype(newval, ir))) + push!(newvalues, newval) + end + elseif isa(use, IsdefinedUse) + use = getuseidx(use) + if has_safe_def(ir, domtree, allblocks, fdu, use) + ir[SSAValue(use)][:inst] = true + push!(eliminated, use) end + else + throw("unexpected use") end end - for stmt in du.defs - stmt == newidx && continue - ir[SSAValue(stmt)][:inst] = nothing + for b in phiblocks + ϕssa = phinodes[b] + n = ir[ϕssa][:inst]::PhiNode + t = Bottom + for p in ir.cfg.blocks[b].preds + push!(n.edges, p) + v = compute_value_for_block(ir, domtree, allblocks, fdu, phinodes, fidx, p) + push!(n.values, v) + if t !== Any + t = tmerge(t, argextype(v, ir)) + end + end + ir[ϕssa][:type] = t end + @label next_use end - preserve_uses === nothing && continue - if all_forwarded - # this means all ccall preserves have been replaced with forwarded loads - # so we can potentially eliminate the allocation, otherwise we must preserve - # the whole allocation. - push!(intermediaries, newidx) + push!(revisit, (related, Liveness)) + @label next_itr + end + + # remove dead setfield! and :new allocs + deadssas = IdSet{SSAValue}() + if all_preserved && newpreserves !== nothing + preserved = keys(newpreserves) + else + preserved = EMPTY_PRESERVED_SSAS + end + mark_dead_ssas!(ir, deadssas, revisit, eliminated, preserved) + for ssa in deadssas + ir[ssa][:inst] = nothing + end + if all_preserved && newpreserves !== nothing + deadssas = Int[ssa.id for ssa in deadssas] + for (idx, newuses) in newpreserves + ir[SSAValue(idx)][:inst] = form_new_preserves( + ir[SSAValue(idx)][:inst]::Expr, deadssas, newuses) end - # Insert the new preserves - for (use, new_preserves) in preserve_uses - ir[SSAValue(use)][:inst] = form_new_preserves(ir[SSAValue(use)][:inst]::Expr, intermediaries, new_preserves) + end + + return ir +end + +const EMPTY_PRESERVED_SSAS = keys(IdDict{Int,Vector{Any}}()) +const PreservedSets = typeof(EMPTY_PRESERVED_SSAS) + +function is_load_forwardable(x::EscapeInfo) + AliasInfo = x.AliasInfo + return isa(AliasInfo, IndexableFields) +end + +struct FieldDefUse + uses::Vector{Any} + defs::Vector{Int} +end +FieldDefUse() = FieldDefUse(Any[], Int[]) +struct GetfieldLoad + idx::Int +end +struct PreserveUse + idx::Int +end +struct IsdefinedUse + idx::Int +end +function getuseidx(@nospecialize use) + if isa(use, GetfieldLoad) + return use.idx + elseif isa(use, PreserveUse) + return use.idx + elseif isa(use, IsdefinedUse) + return use.idx + end + throw("getuseidx: unexpected use") +end + +function compute_live_ins(cfg::CFG, fdu::FieldDefUse) + uses = Int[] + for use in fdu.uses + isa(use, IsdefinedUse) && continue + push!(uses, getuseidx(use)) + end + return compute_live_ins(cfg, fdu.defs, uses) +end + +# even when the allocation contains an uninitialized field, we try an extra effort to check +# if this load at `idx` have any "safe" `setfield!` calls that define the field +# try to find +function has_safe_def(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, use::Int) + dfu = find_def_for_use(ir, domtree, allblocks, fdu, use) + dfu === nothing && return false + def = dfu[1] + def ≠ 0 && return true # found a "safe" definition + # we may still be able to replace this load with `PhiNode` -- examine if all predecessors of + # this `block` have any "safe" definition + block = block_for_inst(ir, use) + seen = BitSet(block) + worklist 
= BitSet(ir.cfg.blocks[block].preds) + isempty(worklist) && return false + while !isempty(worklist) + pred = pop!(worklist) + # if this block has already been examined, bail out to avoid infinite cycles + pred in seen && return false + use = last(ir.cfg.blocks[pred].stmts) + # NOTE this `use` isn't a load, and so the inclusive condition can be used + dfu = find_def_for_use(ir, domtree, allblocks, fdu, use, true) + dfu === nothing && return false + def = dfu[1] + push!(seen, pred) + def ≠ 0 && continue # found a "safe" definition for this predecessor + # if not, check for the predecessors of this predecessor + for newpred in ir.cfg.blocks[pred].preds + push!(worklist, newpred) + end + end + return true +end + +# find the first dominating def for the given use +function find_def_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, use::Int, inclusive::Bool=false) + useblock = block_for_inst(ir.cfg, use) + curblock = find_curblock(domtree, allblocks, useblock) + curblock === nothing && return nothing + local def = 0 + for idx in fdu.defs + if block_for_inst(ir.cfg, idx) == curblock + if curblock != useblock + # Find the last def in this block + def = max(def, idx) + else + # Find the last def before our use + if inclusive + def = max(def, idx ≤ use ? idx : 0) + else + def = max(def, idx < use ? idx : 0) + end + end end + end + return def, useblock, curblock +end - @label skip +function find_curblock(domtree::DomTree, allblocks::Vector{Int}, curblock::Int) + # TODO: This can be much faster by looking at current level and only + # searching for those blocks in a sorted order + while !(curblock in allblocks) + curblock = domtree.idoms_bb[curblock] + curblock == 0 && return nothing end + return curblock end -function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any}) - newex = Expr(:foreigncall) - nccallargs = length(origex.args[3]::SimpleVector) - for i in 1:(6+nccallargs-1) - push!(newex.args, origex.args[i]) +function compute_value_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, use::Int) + dfu = find_def_for_use(ir, domtree, allblocks, fdu, use) + @assert dfu !== nothing "has_safe_def condition unsatisfied" + def, useblock, curblock = dfu + if def == 0 + if !haskey(phinodes, curblock) + # If this happens, we need to search the predecessors for defs. Which + # one doesn't matter - if it did, we'd have had a phinode + return compute_value_for_block(ir, domtree, allblocks, fdu, phinodes, fidx, first(ir.cfg.blocks[useblock].preds)) + end + # The use is the phinode + return phinodes[curblock] + else + return val_for_def_expr(ir, def, fidx) end - for i in (6+nccallargs):length(origex.args) - x = origex.args[i] - # don't need to preserve intermediaries - if isa(x, SSAValue) && x.id in intermediates - continue +end + +function compute_value_for_block(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, curblock::Int) + curblock = find_curblock(domtree, allblocks, curblock) + @assert curblock !== nothing "has_safe_def condition unsatisfied" + def = 0 + for stmt in fdu.defs + if block_for_inst(ir.cfg, stmt) == curblock + def = max(def, stmt) end - push!(newex.args, x) end - for i in 1:length(new_preserves) - push!(newex.args, new_preserves[i]) + return def == 0 ? 
phinodes[curblock] : val_for_def_expr(ir, def, fidx) +end + +function val_for_def_expr(ir::IRCode, def::Int, fidx::Int) + ex = ir[SSAValue(def)][:inst] + if isexpr(ex, :new) || is_known_call(ex, tuple, ir) + return ex.args[1+fidx] + else + @assert is_known_call(ex, setfield!, ir) "invalid load forwarding" + return ex.args[4] end - return newex +end + +function mark_dead_ssas!(ir::IRCode, deadssas::IdSet{SSAValue}, + revisit::Vector{Tuple{Vector{SSAValue},LivenessSet}}, eliminated::BitSet, + preserved::PreservedSets) + wset = BitSet(1:length(revisit)) + while !isempty(wset) + revisit_idx = pop!(wset) + mark_dead_ssas!(ir, deadssas, revisit, eliminated, preserved, wset, revisit_idx) + end +end + +function mark_dead_ssas!(ir::IRCode, deadssas::IdSet{SSAValue}, + revisit::Vector{Tuple{Vector{SSAValue},LivenessSet}}, eliminated::BitSet, + preserved::PreservedSets, wset::BitSet, revisit_idx::Int) + related, Liveness = revisit[revisit_idx] + eliminable = SSAValue[] + for livepc in Liveness + livepc in eliminated && @goto next_live + ssa = SSAValue(livepc) + stmt = ir[ssa][:inst] + if isexpr(stmt, :new) + ssa in deadssas && @goto next_live + for new_revisit_idx in wset + if ssa in revisit[new_revisit_idx][1] + delete!(wset, new_revisit_idx) + if mark_dead_ssas!(ir, deadssas, + revisit, eliminated, + preserved, wset, new_revisit_idx) + push!(eliminable, ssa) + @goto next_live + else + return false + end + end + end + return false + elseif is_known_call(stmt, setfield!, ir) + @assert length(stmt.args) ≥ 4 "invalid escape analysis" + obj = stmt.args[2] + val = stmt.args[4] + if isa(obj, SSAValue) + if obj in related + push!(eliminable, ssa) + @goto next_live + end + if isa(val, SSAValue) && val in related + if obj in deadssas + push!(eliminable, ssa) + @goto next_live + end + for new_revisit_idx in wset + if obj in revisit[new_revisit_idx][1] + delete!(wset, new_revisit_idx) + if mark_dead_ssas!(ir, deadssas, + revisit, eliminated, + preserved, wset, new_revisit_idx) + push!(eliminable, ssa) + @goto next_live + else + return false + end + end + end + end + end + return false + elseif isexpr(stmt, :foreigncall) + livepc in preserved && @goto next_live + return false + else + return false + end + @label next_live + end + for ssa in related; push!(deadssas, ssa); end + for ssa in eliminable; push!(deadssas, ssa); end + return true end """ @@ -1084,15 +1193,15 @@ In addition to a simple DCE for unused values and allocations, this pass also nullifies `typeassert` calls that can be proved to be no-op, in order to allow LLVM to emit simpler code down the road. -Note that this pass is more effective after SROA optimization (i.e. `sroa_pass!`), +Note that this pass is more effective after SROA optimization (i.e. `linear_pass!`), since SROA often allows this pass to: - eliminate allocation of object whose field references are all replaced with scalar values, and - nullify `typeassert` call whose first operand has been replaced with a scalar value (, which may have introduced new type information that inference did not understand) -Also note that currently this pass _needs_ to run after `sroa_pass!`, because +Also note that currently this pass _needs_ to run after `linear_pass!`, because the `typeassert` elimination depends on the transformation by `canonicalize_typeassert!` done -within `sroa_pass!` which redirects references of `typeassert`ed value to the corresponding `PiNode`. +within `linear_pass!` which redirects references of `typeassert`ed value to the corresponding `PiNode`. 
""" function adce_pass!(ir::IRCode) phi_uses = fill(0, length(ir.stmts) + length(ir.new_nodes)) diff --git a/test/compiler/EscapeAnalysis/EAUtils.jl b/test/compiler/EscapeAnalysis/EAUtils.jl index 3ae9b41a0ddac4..7ef50d5434932b 100644 --- a/test/compiler/EscapeAnalysis/EAUtils.jl +++ b/test/compiler/EscapeAnalysis/EAUtils.jl @@ -71,8 +71,8 @@ import Core: CodeInstance, MethodInstance, CodeInfo import .CC: InferenceResult, OptimizationState, IRCode, copy as cccopy, - @timeit, convert_to_ircode, slot2reg, compact!, ssa_inlining_pass!, sroa_pass!, - adce_pass!, type_lift_pass!, JLOptions, verify_ir, verify_linetable + @timeit, convert_to_ircode, slot2reg, compact!, ssa_inlining_pass!, linear_pass!, + memory_opt_pass!, adce_pass!, type_lift_pass!, JLOptions, verify_ir, verify_linetable import .EA: analyze_escapes, ArgEscapeCache, EscapeInfo, EscapeState, is_ipo_profitable # when working outside of Core.Compiler, @@ -227,6 +227,7 @@ function run_passes_with_ea(interp::EscapeAnalyzer, ci::CodeInfo, sv::Optimizati @timeit "Inlining" ir = ssa_inlining_pass!(ir, ir.linetable, sv.inlining, ci.propagate_inbounds) # @timeit "verify 2" verify_ir(ir) @timeit "compact 2" ir = compact!(ir) + @timeit "SROA" ir, _ = linear_pass!(ir) if caller.linfo.specTypes === interp.entry_tt && interp.optimize try @timeit "[Local EA]" state = analyze_escapes(ir, nargs, true, get_escape_cache(interp)) @@ -240,7 +241,6 @@ function run_passes_with_ea(interp::EscapeAnalyzer, ci::CodeInfo, sv::Optimizati interp.state = state interp.linfo = sv.linfo end - @timeit "SROA" ir = sroa_pass!(ir) @timeit "ADCE" ir = adce_pass!(ir) @timeit "type lift" ir = type_lift_pass!(ir) @timeit "compact 3" ir = compact!(ir) diff --git a/test/compiler/EscapeAnalysis/interprocedural.jl b/test/compiler/EscapeAnalysis/interprocedural.jl index eccdc710a6c120..42a2505e03c087 100644 --- a/test/compiler/EscapeAnalysis/interprocedural.jl +++ b/test/compiler/EscapeAnalysis/interprocedural.jl @@ -7,8 +7,6 @@ include(normpath(@__DIR__, "setup.jl")) # callsites # --------- -import .EA: ignore_argescape - noescape(a) = nothing noescape(a, b) = nothing function global_escape!(x) diff --git a/test/compiler/EscapeAnalysis/setup.jl b/test/compiler/EscapeAnalysis/setup.jl index 5123b18e2dfdd2..4e7d6fb5159aae 100644 --- a/test/compiler/EscapeAnalysis/setup.jl +++ b/test/compiler/EscapeAnalysis/setup.jl @@ -2,6 +2,7 @@ include(normpath(@__DIR__, "EAUtils.jl")) using Test, Core.Compiler.EscapeAnalysis, .EAUtils import Core: Argument, SSAValue, ReturnNode const EA = Core.Compiler.EscapeAnalysis +import .EA: ignore_argescape isT(T) = (@nospecialize x) -> x === T isreturn(@nospecialize x) = isa(x, Core.ReturnNode) && isdefined(x, :val) diff --git a/test/compiler/irpasses.jl b/test/compiler/irpasses.jl index 128fd6cc84b7b2..fc70bb1c3c9336 100644 --- a/test/compiler/irpasses.jl +++ b/test/compiler/irpasses.jl @@ -2,7 +2,9 @@ using Test using Base.Meta -using Core: PhiNode, SSAValue, GotoNode, PiNode, QuoteNode, ReturnNode, GotoIfNot +import Core: + CodeInfo, Argument, SSAValue, GotoNode, GotoIfNot, PiNode, PhiNode, + QuoteNode, ReturnNode include(normpath(@__DIR__, "irutils.jl")) @@ -12,7 +14,7 @@ include(normpath(@__DIR__, "irutils.jl")) ## Test that domsort doesn't mangle single-argument phis (#29262) let m = Meta.@lower 1 + 1 @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ # block 1 Expr(:call, :opaque), @@ -47,7 +49,7 @@ end # test that we don't stack-overflow in SNCA with large functions. 
let m = Meta.@lower 1 + 1 @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo code = Any[] N = 2^15 for i in 1:2:N @@ -73,30 +75,87 @@ end # SROA # ==== +import Core.Compiler: widenconst + +is_load_forwarded(src::CodeInfo) = !any(iscall((src, getfield)), src.code) +is_scalar_replaced(src::CodeInfo) = + is_load_forwarded(src) && !any(iscall((src, setfield!)), src.code) && !any(isnew, src.code) + +function is_load_forwarded(@nospecialize(T), src::CodeInfo) + for i in 1:length(src.code) + x = src.code[i] + if iscall((src, getfield), x) + widenconst(argextype(x.args[1], src)) <: T && return false + end + end + return true +end +function is_scalar_replaced(@nospecialize(T), src::CodeInfo) + is_load_forwarded(T, src) || return false + for i in 1:length(src.code) + x = src.code[i] + if iscall((src, setfield!), x) + widenconst(argextype(x.args[1], src)) <: T && return false + elseif isnew(x) + widenconst(argextype(SSAValue(i), src)) <: T && return false + end + end + return true +end + struct ImmutableXYZ; x; y; z; end mutable struct MutableXYZ; x; y; z; end +struct ImmutableOuter{T}; x::T; y::T; z::T; end +mutable struct MutableOuter{T}; x::T; y::T; z::T; end +struct ImmutableRef{T}; x::T; end +Base.getindex(r::ImmutableRef) = r.x +mutable struct SafeRef{T}; x::T; end +Base.getindex(s::SafeRef) = getfield(s, 1) +Base.setindex!(s::SafeRef, x) = setfield!(s, 1, x) + +# simple immutability +# ------------------- -# should optimize away very basic cases let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = (x, y, z) + xyz[1], xyz[2], xyz[3] + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end + +# simple mutability +# ----------------- + let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end end - -# should handle simple mutabilities let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) xyz.y = 42 xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=x=# Core.Argument(2), 42, #=x=# Core.Argument(4)] @@ -107,19 +166,23 @@ let src = code_typed1((Any,Any,Any)) do x, y, z xyz.x, xyz.z = xyz.z, xyz.x xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] end end -# circumvent uninitialized fields as far as there is a solid `setfield!` definition + +# uninitialized fields +# -------------------- + +# safe cases let src = code_typed1() do r = Ref{Any}() r[] = 42 return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,)) do cond r 
= Ref{Any}() @@ -131,7 +194,7 @@ let src = code_typed1((Bool,)) do cond return r[] end end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,)) do cond r = Ref{Any}() @@ -142,7 +205,7 @@ let src = code_typed1((Bool,)) do cond end return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,Bool,Any,Any,Any)) do c1, c2, x, y, z r = Ref{Any}() @@ -157,7 +220,16 @@ let src = code_typed1((Bool,Bool,Any,Any,Any)) do c1, c2, x, y, z end return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) +end + +# unsafe cases +let src = code_typed1() do + r = Ref{Any}() + return r[] + end + @test count(isnew, src.code) == 1 + @test count(iscall((src, getfield)), src.code) == 1 end let src = code_typed1((Bool,)) do cond r = Ref{Any}() @@ -167,7 +239,9 @@ let src = code_typed1((Bool,)) do cond return r[] end # N.B. `r` should be allocated since `cond` might be `false` and then it will be thrown - @test any(isnew, src.code) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 + @test count(iscall((src, getfield)), src.code) == 1 end let src = code_typed1((Bool,Bool,Any,Any)) do c1, c2, x, y r = Ref{Any}() @@ -181,12 +255,95 @@ let src = code_typed1((Bool,Bool,Any,Any)) do c1, c2, x, y return r[] end # N.B. `r` should be allocated since `c2` might be `false` and then it will be thrown - @test any(isnew, src.code) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 2 + @test count(iscall((src, getfield)), src.code) == 1 end -# should include a simple alias analysis -struct ImmutableOuter{T}; x::T; y::T; z::T; end -mutable struct MutableOuter{T}; x::T; y::T; z::T; end +# load forwarding +# --------------- +# even if allocation can't be eliminated + +# safe cases +for T in (ImmutableRef{Any}, Ref{Any}) + let src = @eval code_typed1((Bool,Any,)) do c, a + r = $T(a) + if c + return r[] + else + return r + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + end + let src = @eval code_typed1((Bool,String,)) do c, a + r = $T(a) + if c + return r[]::String # adce_pass! 
will further eliminate this type assert call also + else + return r + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test !any(iscall((src, typeassert)), src.code) + end + let src = @eval code_typed1((Bool,Any,)) do c, a + r = $T(a) + if c + return r[] + else + throw(r) + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + end +end +let src = code_typed1((Bool,Any,Any)) do c, a, b + r = Ref{Any}(a) + if c + return r[] + end + r[] = b + return r + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 + @test count(src.code) do @nospecialize x + isreturn(x) && x.val === Argument(3) # a + end == 1 +end + +# unsafe case +let src = code_typed1((Bool,Any,Any)) do c, a, b + r = Ref{Any}(a) + r[] = b + @noinline some_escape!(r) + return r[] + end + @test !is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 +end +let src = code_typed1((Bool,String,Regex)) do c, a, b + r1 = Ref{Any}(a) + r2 = Ref{Any}(b) + return ifelse(c, r1, r2)[] + end + r = only(findall(isreturn, src.code)) + v = (src.code[r]::Core.ReturnNode).val + @test v !== Argument(3) # a + @test v !== Argument(4) # b + @test_broken is_load_forwarded(src) # ideally +end + +# aliased load forwarding +# ----------------------- + +# OK: immutable(immutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = ImmutableOuter(xyz, xyz, xyz) @@ -214,22 +371,21 @@ let src = code_typed1((Any,Any,Any)) do x, y, z end end -# FIXME our analysis isn't yet so powerful at this moment: may be unable to handle nested objects well -# OK: mutable(immutable(...)) case +# OK: immutable(mutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) t = (xyz,) v = t[1].x v, v, v end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) outer = ImmutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] @@ -240,32 +396,500 @@ let # this is a simple end to end test case, which demonstrates allocation elimi # NOTE this test case isn't so robust and might be subject to future changes of the broadcasting implementation, # in that case you don't really need to stick to keeping this test case around simple_sroa(s) = broadcast(identity, Ref(s)) + let src = code_typed1(simple_sroa, (String,)) + @test is_scalar_replaced(src) + end s = Base.inferencebarrier("julia")::String simple_sroa(s) # NOTE don't hard-code `"julia"` in `@allocated` clause and make sure to execute the # compiled code for `simple_sroa`, otherwise everything can be folded even without SROA @test @allocated(simple_sroa(s)) == 0 end -# FIXME: immutable(mutable(...)) case +let # some insanely nested example + src = code_typed1((Int,)) do x + (Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref((x))))))))))))[][][][][][][][][][] + end + @test is_scalar_replaced(src) +end + +# OK: mutable(immutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do 
@nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((String,String,String)) do x, y, z + xyz = (x, y, z) + r = Ref(xyz) + return r[][3], r[][2], r[][1] + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end end -# FIXME: mutable(mutable(...)) case + +# OK: mutable(mutable(...)) case +# new chain let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = z, y, x + outer = MutableOuter(xyz, xyz, xyz) + outer.x.x, outer.y.y, outer.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = xyz.z, xyz.y, xyz.x + outer = MutableOuter(xyz, xyz, xyz) + outer.x.x, outer.y.y, outer.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + inner = MutableOuter(xyz, xyz, xyz) + outer = MutableOuter(inner, inner, inner) + outer.x.x.x, outer.y.y.y, outer.z.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = z, y, x + inner = MutableOuter(xyz, xyz, xyz) + outer = MutableOuter(inner, inner, inner) + outer.x.x.x, outer.y.y.y, outer.z.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end +# setfield! 
chain +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + outer = Ref{MutableXYZ}() + outer[] = xyz + return outer[].x, outer[].y, outer[].z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + outer = Ref{MutableXYZ}() + outer[] = xyz + xyz.z = 42 + return outer[].x, outer[].y, outer[].z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), 42] + end +end + +# ϕ-allocation elimination +# ------------------------ + +# safe cases +let src = code_typed1((Bool,Any,Any)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + end + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=y=# Core.Argument(4) in x.values + end == 1 +end +let src = code_typed1((Bool,Bool,Any,Any,Any)) do cond1, cond2, x, y, z + if cond1 + ϕ = Ref{Any}(x) + elseif cond2 + ϕ = Ref{Any}(y) + else + ϕ = Ref{Any}(z) + end + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(4) in x.values && + #=y=# Core.Argument(5) in x.values && + #=z=# Core.Argument(6) in x.values + end == 1 +end +let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + end + ϕ[] = z + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.ReturnNode) && + #=z=# Core.Argument(5) === x.val + end == 1 +end +let src = code_typed1((Bool,Any,Any,)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + out1 = ϕ[] + else + ϕ = Ref{Any}(y) + out1 = ϕ[] + end + out2 = ϕ[] + out1, out2 + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=y=# Core.Argument(4) in x.values + end == 2 +end +let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + ϕ[] = z + end + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=z=# Core.Argument(5) in x.values + end == 1 +end +let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z + if cond + ϕ = Ref{Any}(x) + out1 = ϕ[] + else + ϕ = Ref{Any}(y) + out1 = ϕ[] + ϕ[] = z + end + out2 = ϕ[] + out1, out2 + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=y=# Core.Argument(4) in x.values + end == 1 + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=z=# Core.Argument(5) in x.values + end == 1 +end +let src = code_typed1((Bool,Any,Any)) do cond, x, y + # these allocation form multiple ϕ-nodes + if cond + ϕ2 = ϕ1 = Ref{Any}(x) + else + ϕ2 = ϕ1 = Ref{Any}(y) + end + ϕ1[], ϕ2[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=y=# Core.Argument(4) in x.values + end == 1 +end +let src = code_typed1((Bool,String,)) do cond, x + # these allocation form multiple 
ϕ-nodes + if cond + ϕ2 = ϕ1 = Ref{Any}("foo") + else + ϕ2 = ϕ1 = Ref{Any}("bar") + end + ϕ2[] = x + y = ϕ1[] # => x + return y + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.ReturnNode) && + #=x=# x.val === Core.Argument(3) + end == 1 +end + +# unsafe cases +let src = code_typed1((Bool,Any,Any)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + end + some_escape!(ϕ) + ϕ[] + end + @test count(isnew, src.code) == 2 + @test count(iscall((src, getfield)), src.code) == 1 +end +let src = code_typed1((Bool,Any,Any)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + some_escape!(ϕ) + else + ϕ = Ref{Any}(y) + end + ϕ[] + end + @test count(isnew, src.code) == 2 + @test count(iscall((src, getfield)), src.code) == 1 +end +let src = code_typed1((Bool,Any,)) do cond, x + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}() + end + ϕ[] + end + @test count(isnew, src.code) == 2 + @test count(iscall((src, getfield)), src.code) == 1 +end +let src = code_typed1((Bool,Any)) do c, a + local r + if c + r = Ref{Any}(a) + end + (r::Base.RefValue{Any})[] + end + @test count(isnew, src.code) == 1 + @test count(iscall((src, getfield)), src.code) == 1 +end + +function mutable_ϕ_elim(x, xs) + r = Ref(x) + for x in xs + r = Ref(x) + end + return r[] +end +let src = code_typed1(mutable_ϕ_elim, (String, Vector{String})) + @test is_scalar_replaced(src) + + xs = String[string(gensym()) for _ in 1:100] + mutable_ϕ_elim("init", xs) + @test @allocated(mutable_ϕ_elim("init", xs)) == 0 +end + +# demonstrate the power of our field / alias analysis with realistic end to end examples +# adapted from http://wiki.luajit.org/Allocation-Sinking-Optimization#implementation%5B +abstract type AbstractPoint{T} end +struct Point{T} <: AbstractPoint{T} + x::T + y::T +end +mutable struct MPoint{T} <: AbstractPoint{T} + x::T + y::T +end +add(a::P, b::P) where P<:AbstractPoint = P(a.x + b.x, a.y + b.y) +function compute_point(T, n, ax, ay, bx, by) + a = T(ax, ay) + b = T(bx, by) + for i in 0:(n-1) + a = add(add(a, b), b) + end + a.x, a.y +end +function compute_point(n, a, b) + for i in 0:(n-1) + a = add(add(a, b), b) + end + a.x, a.y +end +function compute_point!(n, a, b) + for i in 0:(n-1) + a′ = add(add(a, b), b) + a.x = a′.x + a.y = a′.y + end end -let # should work with constant globals - # immutable case - # -------------- +let # immutable case + src = code_typed1((Int,)) do n + compute_point(Point, n, 1+.5, 2+.5, 2+.25, 4+.75) + end + @test is_scalar_replaced(Point, src) + src = code_typed1((Int,)) do n + compute_point(Point, n, 1+.5im, 2+.5im, 2+.25im, 4+.75im) + end + @test is_scalar_replaced(Point, src) + @test is_load_forwarded(ComplexF64, src) + @test !is_scalar_replaced(ComplexF64, src) + + # mutable case + src = code_typed1((Int,)) do n + compute_point(MPoint, n, 1+.5, 2+.5, 2+.25, 4+.75) + end + @test is_scalar_replaced(MPoint, src) + src = code_typed1((Int,)) do n + compute_point(MPoint, n, 1+.5im, 2+.5im, 2+.25im, 4+.75im) + end + @test is_scalar_replaced(MPoint, src) + @test is_load_forwarded(ComplexF64, src) + @test !is_scalar_replaced(ComplexF64, src) +end +compute_point(MPoint, 10, 1+.5, 2+.5, 2+.25, 4+.75) +compute_point(MPoint, 10, 1+.5im, 2+.5im, 2+.25im, 4+.75im) +@test @allocated(compute_point(MPoint, 10000, 1+.5, 2+.5, 2+.25, 4+.75)) == 0 +@test @allocated(compute_point(MPoint, 10000, 1+.5im, 2+.5im, 2+.25im, 4+.75im)) == 0 + +let # immutable case + src = code_typed1((Int,)) do n + compute_point(n, Point(1+.5, 2+.5), Point(2+.25, 4+.75)) + end + @test 
is_scalar_replaced(Point, src) + src = code_typed1((Int,)) do n + compute_point(n, Point(1+.5im, 2+.5im), Point(2+.25im, 4+.75im)) + end + @test is_scalar_replaced(Point, src) + @test is_load_forwarded(ComplexF64, src) + @test !is_scalar_replaced(ComplexF64, src) + + # mutable case + src = code_typed1((Int,)) do n + compute_point(n, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75)) + end + @test is_scalar_replaced(MPoint, src) + src = code_typed1((Int,)) do n + compute_point(n, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im)) + end + @test is_scalar_replaced(MPoint, src) + @test is_load_forwarded(ComplexF64, src) + @test !is_scalar_replaced(ComplexF64, src) +end +compute_point(10, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75)) +compute_point(10, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im)) +@test @allocated(compute_point(10000, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75))) == 0 +@test @allocated(compute_point(10000, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im))) == 0 + +let # mutable case + src = code_typed1(compute_point!, (Int,MPoint{Float64},MPoint{Float64})) + @test is_scalar_replaced(MPoint, src) + src = code_typed1(compute_point!, (Int,MPoint{ComplexF64},MPoint{ComplexF64})) + @test is_scalar_replaced(MPoint, src) + @test is_load_forwarded(ComplexF64, src) + @test !is_scalar_replaced(ComplexF64, src) +end +let + af, bf = MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75) + ac, bc = MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im) + compute_point!(10, af, bf) + compute_point!(10, ac, bc) + @test @allocated(compute_point!(10000, af, bf)) == 0 + @test @allocated(compute_point!(10000, ac, bc)) == 0 +end + +# isdefined elimination +# --------------------- + +let src = code_typed1((Any,)) do a + r = Ref{Any}() + r[] = a + if isassigned(r) + return r[] + end + return nothing + end + @test is_scalar_replaced(src) +end + +callit(f, args...) = f(args...) 
+function isdefined_elim() + local arr::Vector{Any} + callit() do + arr = Any[] + end + return arr +end +let src = code_typed1(isdefined_elim) + @test is_scalar_replaced(src) +end +@test isdefined_elim() == Any[] + +# preserve elimination +# -------------------- + +let src = code_typed1((String,)) do s + ccall(:some_ccall, Cint, (Ptr{String},), Ref(s)) + end + @test count(isnew, src.code) == 0 +end + +# if the mutable struct is directly used, we shouldn't eliminate it +let src = code_typed1() do + a = MutableXYZ(-512275808,882558299,-2133022131) + b = Int32(42) + ccall(:some_ccall, Cvoid, (MutableXYZ, Int32), a, b) + return a.x + end + @test count(isnew, src.code) == 1 +end + +# constant globals +# ---------------- + +let # immutable case src = @eval Module() begin const REF_FLD = :x struct ImmutableRef{T} @@ -282,7 +906,6 @@ let # should work with constant globals @test count(isnew, src.code) == 0 # mutable case - # ------------ src = @eval Module() begin const REF_FLD = :x code_typed() do @@ -295,25 +918,6 @@ let # should work with constant globals @test count(isnew, src.code) == 0 end -# should work nicely with inlining to optimize away a complicated case -# adapted from http://wiki.luajit.org/Allocation-Sinking-Optimization#implementation%5B -struct Point - x::Float64 - y::Float64 -end -#=@inline=# add(a::Point, b::Point) = Point(a.x + b.x, a.y + b.y) -function compute_points() - a = Point(1.5, 2.5) - b = Point(2.25, 4.75) - for i in 0:(100000000-1) - a = add(add(a, b), b) - end - a.x, a.y -end -let src = code_typed1(compute_points) - @test !any(isnew, src.code) -end - # comparison lifting # ================== @@ -454,7 +1058,7 @@ end # A SSAValue after the compaction line let m = Meta.@lower 1 + 1 @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ # block 1 nothing, @@ -492,7 +1096,7 @@ let m = Meta.@lower 1 + 1 src.ssaflags = fill(Int32(0), nstmts) ir = Core.Compiler.inflate_ir(src, Any[], Any[Any, Any]) @test Core.Compiler.verify_ir(ir) === nothing - ir = @test_nowarn Core.Compiler.sroa_pass!(ir) + ir, = @test_nowarn Core.Compiler.linear_pass!(ir) @test Core.Compiler.verify_ir(ir) === nothing end @@ -517,7 +1121,7 @@ end let m = Meta.@lower 1 + 1 # Test that CFG simplify combines redundant basic blocks @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ Core.Compiler.GotoNode(2), Core.Compiler.GotoNode(3), @@ -542,7 +1146,7 @@ end let m = Meta.@lower 1 + 1 # Test that CFG simplify doesn't mess up when chaining past return blocks @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ Core.Compiler.GotoIfNot(Core.Compiler.Argument(2), 3), Core.Compiler.GotoNode(4), @@ -572,7 +1176,7 @@ let m = Meta.@lower 1 + 1 # Test that CFG simplify doesn't try to merge every block in a loop into # its predecessor @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ # Block 1 Core.Compiler.GotoNode(2),