From cdef102caae5fa9f2793b9abf86ac0d7cfedef61 Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki
Date: Sat, 22 Jan 2022 03:12:25 +0900
Subject: [PATCH] optimizer: alias-aware SROA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enhances SROA of mutables using the novel Julia-level escape analysis (on top of #43800):
1. alias-aware SROA, mutable ϕ-node elimination
2. `isdefined` check elimination
3. load-forwarding for non-eliminable but analyzable mutables

---

1. alias-aware SROA, mutable ϕ-node elimination

EA's alias analysis allows this new SROA to handle nested mutable allocations
pretty well. Now we can eliminate the heap allocations completely from this
insanely nested example with a single analysis/optimization pass:

```julia
julia> function refs(x)
           (Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref((x))))))))))))[][][][][][][][][][]
       end
refs (generic function with 1 method)

julia> refs("julia"); @allocated refs("julia")
0
```

EA can also analyze the escape of a ϕ-node as well as its aliasing.
Mutable ϕ-nodes can be eliminated even in very tricky cases like:

```julia
julia> code_typed((Bool,String,)) do cond, x
           # these allocations form multiple ϕ-nodes
           if cond
               ϕ2 = ϕ1 = Ref{Any}("foo")
           else
               ϕ2 = ϕ1 = Ref{Any}("bar")
           end
           ϕ2[] = x
           y = ϕ1[] # => x
           return y
       end
1-element Vector{Any}:
 CodeInfo(
1 ─     goto #3 if not cond
2 ─     goto #4
3 ─     nothing::Nothing
4 ┄     return x
) => Any
```

Combined with the alias analysis and ϕ-node handling above, allocations in
the following "realistic" examples will be optimized:

```julia
julia> # demonstrate the power of our field / alias analysis with realistic end to end examples
       # adapted from http://wiki.luajit.org/Allocation-Sinking-Optimization#implementation%5B
       abstract type AbstractPoint{T} end

julia> struct Point{T} <: AbstractPoint{T}
           x::T
           y::T
       end

julia> mutable struct MPoint{T} <: AbstractPoint{T}
           x::T
           y::T
       end

julia> add(a::P, b::P) where P<:AbstractPoint = P(a.x + b.x, a.y + b.y);

julia> function compute_point(T, n, ax, ay, bx, by)
           a = T(ax, ay)
           b = T(bx, by)
           for i in 0:(n-1)
               a = add(add(a, b), b)
           end
           a.x, a.y
       end;

julia> function compute_point(n, a, b)
           for i in 0:(n-1)
               a = add(add(a, b), b)
           end
           a.x, a.y
       end;

julia> function compute_point!(n, a, b)
           for i in 0:(n-1)
               a′ = add(add(a, b), b)
               a.x = a′.x
               a.y = a′.y
           end
       end;

julia> compute_point(MPoint, 10, 1+.5, 2+.5, 2+.25, 4+.75);

julia> compute_point(MPoint, 10, 1+.5im, 2+.5im, 2+.25im, 4+.75im);

julia> @allocated compute_point(MPoint, 10000, 1+.5, 2+.5, 2+.25, 4+.75)
0

julia> @allocated compute_point(MPoint, 10000, 1+.5im, 2+.5im, 2+.25im, 4+.75im)
0

julia> compute_point(10, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75));

julia> compute_point(10, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im));

julia> @allocated compute_point(10000, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75))
0

julia> @allocated compute_point(10000, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im))
0

julia> af, bf = MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75);

julia> ac, bc = MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im);

julia> compute_point!(10, af, bf);

julia> compute_point!(10, ac, bc);

julia> @allocated compute_point!(10000, af, bf)
0

julia> @allocated compute_point!(10000, ac, bc)
0
```
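Allocations that feed a loop-carried ϕ-node are handled by the same machinery. Here is a small sketch of the expected behavior (`loop_refs` is just an illustrative name used in this message; the equivalent `mutable_ϕ_elim` case is exercised by the newly added tests):

```julia
julia> function loop_refs(x, xs)
           r = Ref(x)     # this allocation and ...
           for x in xs
               r = Ref(x) # ... these per-iteration allocations merge into one ϕ-node
           end
           return r[]
       end;

julia> xs = String[string(gensym()) for _ in 1:100];

julia> loop_refs("init", xs); @allocated loop_refs("init", xs)
0
```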
2. `isdefined` check elimination

This commit also implements a simple optimization to eliminate `isdefined`
calls by checking load-forwardability. This optimization may be especially
useful to eliminate the extra allocation involved with a capturing closure, e.g.:

```julia
julia> callit(f, args...) = f(args...);

julia> function isdefined_elim()
           local arr::Vector{Any}
           callit() do
               arr = Any[]
           end
           return arr
       end;

julia> code_typed(isdefined_elim)
1-element Vector{Any}:
 CodeInfo(
1 ─ %1 = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Vector{Any}, svec(Any, Int64), 0, :(:ccall), Vector{Any}, 0, 0))::Vector{Any}
└──      goto #3 if not true
2 ─      goto #4
3 ─      $(Expr(:throw_undef_if_not, :arr, false))::Any
4 ┄      return %1
) => Vector{Any}
```

3. load-forwarding for non-eliminable but analyzable mutables

EA also allows us to forward loads even when the mutable allocation itself
can't be eliminated, as long as its fields are known precisely.
The load forwarding might be useful since it may derive new type information
that succeeding optimization passes can use (or just because it allows simpler
code transformations down the road):

```julia
julia> code_typed((Bool,String,)) do c, s
           r = Ref{Any}(s)
           if c
               return r[]::String # adce_pass! will further eliminate this type assert call also
           else
               return r
           end
       end
1-element Vector{Any}:
 CodeInfo(
1 ─ %1 = %new(Base.RefValue{Any}, s)::Base.RefValue{Any}
└──      goto #3 if not c
2 ─      return s
3 ─      return %1
) => Union{Base.RefValue{Any}, String}
```

---

Please refer to the newly added test cases for more examples.
Also, EA's alias analysis already succeeds in reasoning about arrays, so this
EA-based SROA will hopefully be generalized for array SROA as well.
---
 base/compiler/bootstrap.jl              |   6 +-
 base/compiler/optimize.jl               |  17 +-
 base/compiler/ssair/passes.jl           | 848 ++++++++++++++----------
 test/compiler/EscapeAnalysis/EAUtils.jl |   6 +-
 test/compiler/irpasses.jl               | 781 ++++++++++++++++++++--
 5 files changed, 1250 insertions(+), 408 deletions(-)

diff --git a/base/compiler/bootstrap.jl b/base/compiler/bootstrap.jl
index f335cf31a8467..c1465113e98c2 100644
--- a/base/compiler/bootstrap.jl
+++ b/base/compiler/bootstrap.jl
@@ -11,7 +11,11 @@ let
     world = get_world_counter()
     interp = NativeInterpreter(world)
-    analyze_escapes_tt = Tuple{typeof(analyze_escapes), IRCode, Int, Bool, typeof(null_escape_cache)}
+    analyze_escapes_tt = Any[typeof(analyze_escapes), IRCode, Int, Bool,
+        # typeof(get_escape_cache(code_cache(interp))) # once we enable IPO EA
+        typeof(null_escape_cache)
+        ]
+    analyze_escapes_tt = Tuple{analyze_escapes_tt...}
     fs = Any[
         # we first create caches for the optimizer, because they contain many loop constructions
         # and they're better to not run in interpreter even during bootstrapping
diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl
index 2d9247420bd35..4aebc8ec4a17f 100644
--- a/base/compiler/optimize.jl
+++ b/base/compiler/optimize.jl
@@ -537,10 +537,25 @@ function run_passes(ci::CodeInfo, sv::OptimizationState, caller::InferenceResult
     @timeit "slot2reg" ir = slot2reg(ir, ci, sv)
     # TODO: Domsorting can produce an updated domtree - no need to recompute here
     @timeit "compact 1" ir = compact!(ir)
+    nargs = let def = sv.linfo.def; isa(def, Method) ? Int(def.nargs) : 0; end
+    # if is_ipo_profitable(ir, nargs)
+    #     @timeit "IPO EA" begin
+    #         state = analyze_escapes(ir,
+    #             nargs, #=call_resolved=#false, ipo_escape_cache(sv.inlining.mi_cache))
+    #         cache_escapes!(caller, state)
+    #     end
+    # end
     @timeit "Inlining" ir = ssa_inlining_pass!(ir, ir.linetable, sv.inlining, ci.propagate_inbounds)
     # @timeit "verify 2" verify_ir(ir)
     @timeit "compact 2" ir = compact!(ir)
-    @timeit "SROA" ir = sroa_pass!(ir)
+    @timeit "SROA" ir, memory_opt, get_domtree = linear_pass!(ir)
+    if memory_opt
+        @timeit "memory_opt_pass!" 
begin + @timeit "Local EA" estate = analyze_escapes(ir, + nargs, #=call_resolved=#true, null_escape_cache) + @timeit "memory_opt_pass!" ir = memory_opt_pass!(ir, estate, get_domtree) + end + end @timeit "ADCE" ir = adce_pass!(ir) @timeit "type lift" ir = type_lift_pass!(ir) @timeit "compact 3" ir = compact!(ir) diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl index 7fcaa79a468d5..c866657c68bd2 100644 --- a/base/compiler/ssair/passes.jl +++ b/base/compiler/ssair/passes.jl @@ -6,29 +6,6 @@ function is_known_call(@nospecialize(x), @nospecialize(func), ir::Union{IRCode,I return singleton_type(ft) === func end -""" - du::SSADefUse - -This struct keeps track of all uses of some mutable struct allocated in the current function: -- `du.uses::Vector{Int}` are all instances of `getfield` on the struct -- `du.defs::Vector{Int}` are all instances of `setfield!` on the struct -The terminology refers to the uses/defs of the "slot bundle" that the mutable struct represents. - -In addition we keep track of all instances of a `:foreigncall` that preserves of this mutable -struct in `du.ccall_preserve_uses`. Somewhat counterintuitively, we don't actually need to -make sure that the struct itself is live (or even allocated) at a `ccall` site. -If there are no other places where the struct escapes (and thus e.g. where its address is taken), -it need not be allocated. We do however, need to make sure to preserve any elements of this struct. -""" -struct SSADefUse - uses::Vector{Int} - defs::Vector{Int} - ccall_preserve_uses::Vector{Int} -end -SSADefUse() = SSADefUse(Int[], Int[], Int[]) - -compute_live_ins(cfg::CFG, du::SSADefUse) = compute_live_ins(cfg, du.defs, du.uses) - # assume `stmt == getfield(obj, field, ...)` or `stmt == setfield!(obj, field, val, ...)` try_compute_field_stmt(ir::Union{IncrementalCompact,IRCode}, stmt::Expr) = try_compute_field(ir, stmt.args[3]) @@ -55,112 +32,6 @@ function try_compute_fieldidx_stmt(ir::Union{IncrementalCompact,IRCode}, stmt::E return try_compute_fieldidx(typ, field) end -function find_curblock(domtree::DomTree, allblocks::Vector{Int}, curblock::Int) - # TODO: This can be much faster by looking at current level and only - # searching for those blocks in a sorted order - while !(curblock in allblocks) - curblock = domtree.idoms_bb[curblock] - end - return curblock -end - -function val_for_def_expr(ir::IRCode, def::Int, fidx::Int) - ex = ir[SSAValue(def)][:inst] - if isexpr(ex, :new) - return ex.args[1+fidx] - else - @assert isa(ex, Expr) - # The use is whatever the setfield was - return ex.args[4] - end -end - -function compute_value_for_block(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, curblock::Int) - curblock = find_curblock(domtree, allblocks, curblock) - def = 0 - for stmt in du.defs - if block_for_inst(ir.cfg, stmt) == curblock - def = max(def, stmt) - end - end - def == 0 ? phinodes[curblock] : val_for_def_expr(ir, def, fidx) -end - -function compute_value_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, use::Int) - def, useblock, curblock = find_def_for_use(ir, domtree, allblocks, du, use) - if def == 0 - if !haskey(phinodes, curblock) - # If this happens, we need to search the predecessors for defs. 
Which - # one doesn't matter - if it did, we'd have had a phinode - return compute_value_for_block(ir, domtree, allblocks, du, phinodes, fidx, first(ir.cfg.blocks[useblock].preds)) - end - # The use is the phinode - return phinodes[curblock] - else - return val_for_def_expr(ir, def, fidx) - end -end - -# even when the allocation contains an uninitialized field, we try an extra effort to check -# if this load at `idx` have any "safe" `setfield!` calls that define the field -function has_safe_def( - ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, - newidx::Int, idx::Int) - def, _, _ = find_def_for_use(ir, domtree, allblocks, du, idx) - # will throw since we already checked this `:new` site doesn't define this field - def == newidx && return false - # found a "safe" definition - def ≠ 0 && return true - # we may still be able to replace this load with `PhiNode` - # examine if all predecessors of `block` have any "safe" definition - block = block_for_inst(ir, idx) - seen = BitSet(block) - worklist = BitSet(ir.cfg.blocks[block].preds) - isempty(worklist) && return false - while !isempty(worklist) - pred = pop!(worklist) - # if this block has already been examined, bail out to avoid infinite cycles - pred in seen && return false - idx = last(ir.cfg.blocks[pred].stmts) - # NOTE `idx` isn't a load, thus we can use inclusive coondition within the `find_def_for_use` - def, _, _ = find_def_for_use(ir, domtree, allblocks, du, idx, true) - # will throw since we already checked this `:new` site doesn't define this field - def == newidx && return false - push!(seen, pred) - # found a "safe" definition for this predecessor - def ≠ 0 && continue - # check for the predecessors of this predecessor - for newpred in ir.cfg.blocks[pred].preds - push!(worklist, newpred) - end - end - return true -end - -# find the first dominating def for the given use -function find_def_for_use( - ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, du::SSADefUse, use::Int, inclusive::Bool=false) - useblock = block_for_inst(ir.cfg, use) - curblock = find_curblock(domtree, allblocks, useblock) - local def = 0 - for idx in du.defs - if block_for_inst(ir.cfg, idx) == curblock - if curblock != useblock - # Find the last def in this block - def = max(def, idx) - else - # Find the last def before our use - if inclusive - def = max(def, idx ≤ use ? idx : 0) - else - def = max(def, idx < use ? idx : 0) - end - end - end - end - return def, useblock, curblock -end - function collect_leaves(compact::IncrementalCompact, @nospecialize(val), @nospecialize(typeconstraint)) if isa(val, Union{OldSSAValue, SSAValue}) val, typeconstraint = simple_walk_constraint(compact, val, typeconstraint) @@ -693,26 +564,23 @@ end const SPCSet = IdSet{Int} """ - sroa_pass!(ir::IRCode) -> newir::IRCode - -`getfield` elimination pass, a.k.a. Scalar Replacements of Aggregates optimization. - -This pass is based on a local field analysis by def-use chain walking. -It looks for struct allocation sites ("definitions"), and `getfield` calls as well as -`:foreigncall`s that preserve the structs ("usages"). If "definitions" have enough information, -then this pass will replace corresponding usages with forwarded values. -`mutable struct`s require additional cares and need to be handled separately from immutables. -For `mutable struct`s, `setfield!` calls account for "definitions" also, and the pass should -give up the lifting conservatively when there are any "intermediate usages" that may escape -the mutable struct (e.g. 
non-inlined generic function call that takes the mutable struct as
-its argument).
-
-In a case when all usages are fully eliminated, `struct` allocation may also be erased as
-a result of succeeding dead code elimination.
+    linear_pass!(ir::IRCode) -> (newir::IRCode, memory_opt::Bool, get_domtree)
+
+This pass consists of the following optimizations that can be performed by
+a single linear traversal over IR statements:
+- load forwarding of immutables (`getfield` elimination): immutable allocations whose
+  loads are all eliminated by this pass may be erased entirely as a result of subsequent
+  dead code elimination (this allocation elimination is called "SROA", Scalar Replacement of Aggregates)
+- lifting of builtin comparisons: see [`lift_comparison!`](@ref)
+- canonicalization of `typeassert` calls: see [`canonicalize_typeassert!`](@ref)
+
+In addition to performing the optimizations above, the linear traversal also examines each
+statement and checks whether it may be profitable to run the [`memory_opt_pass!`](@ref) pass.
+In that case `memory_opt` is set to `true`, indicating that `ir` may be further optimized by
+running `memory_opt_pass!(ir, estate::EscapeState)`.
 """
-function sroa_pass!(ir::IRCode)
+function linear_pass!(ir::IRCode)
     compact = IncrementalCompact(ir)
-    defuses = nothing # will be initialized once we encounter mutability in order to reduce dynamic allocations
     lifting_cache = IdDict{Pair{AnySSAValue, Any}, AnySSAValue}()
     # initialization of domtree is delayed to avoid the expensive computation in many cases
     local domtree = nothing
@@ -722,17 +590,17 @@ function sroa_pass!(ir::IRCode)
         end
         return domtree
     end
+    local memory_opt = false # whether or not to run the memory_opt_pass! pass later
     for ((_, idx), stmt) in compact
-        # check whether this statement is `getfield` / `setfield!` (or other "interesting" statement)
         isa(stmt, Expr) || continue
-        is_setfield = false
         field_ordering = :unspecified
-        if is_known_call(stmt, setfield!, compact)
-            4 <= length(stmt.args) <= 5 || continue
-            is_setfield = true
-            if length(stmt.args) == 5
-                field_ordering = argextype(stmt.args[5], compact)
+        if isexpr(stmt, :new)
+            typ = unwrap_unionall(widenconst(argextype(SSAValue(idx), compact)))
+            if ismutabletype(typ)
+                # mutable SROA may eliminate this allocation, mark it now
+                memory_opt = true
             end
+            continue
         elseif is_known_call(stmt, getfield, compact)
             3 <= length(stmt.args) <= 5 || continue
             if length(stmt.args) == 5
@@ -748,40 +616,21 @@ function sroa_pass!(ir::IRCode)
             for pidx in (6+nccallargs):length(stmt.args)
                 preserved_arg = stmt.args[pidx]
                 isa(preserved_arg, SSAValue) || continue
-                let intermediaries = SPCSet()
-                    callback = function (@nospecialize(pi), @nospecialize(ssa))
-                        push!(intermediaries, ssa.id)
-                        return false
-                    end
-                    def = simple_walk(compact, preserved_arg, callback)
-                    isa(def, SSAValue) || continue
-                    defidx = def.id
-                    def = compact[defidx]
-                    if is_known_call(def, tuple, compact)
+                def = simple_walk(compact, preserved_arg)
+                isa(def, SSAValue) || continue
+                defidx = def.id
+                def = compact[defidx]
+                if is_known_call(def, tuple, compact)
+                    record_immutable_preserve!(new_preserves, def, compact)
+                    push!(preserved, preserved_arg.id)
+                elseif isexpr(def, :new)
+                    typ = unwrap_unionall(widenconst(argextype(SSAValue(defidx), compact)))
+                    if typ isa DataType
+                        ismutabletype(typ) && continue # mutable SROA is performed later
                         record_immutable_preserve!(new_preserves, def, compact)
                         push!(preserved, preserved_arg.id)
-                        continue
-                    elseif isexpr(def, :new)
-                        typ = 
widenconst(argextype(SSAValue(defidx), compact)) - if isa(typ, UnionAll) - typ = unwrap_unionall(typ) - end - if typ isa DataType && !ismutabletype(typ) - record_immutable_preserve!(new_preserves, def, compact) - push!(preserved, preserved_arg.id) - continue - end - else - continue end - if defuses === nothing - defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() - end - mid, defuse = get!(defuses, defidx, (SPCSet(), SSADefUse())) - push!(defuse.ccall_preserve_uses, idx) - union!(mid, intermediaries) end - continue end if !isempty(new_preserves) compact[idx] = nothing @@ -801,7 +650,7 @@ function sroa_pass!(ir::IRCode) continue end - # analyze this `getfield` / `setfield!` call + # analyze this `getfield` call field = try_compute_field_stmt(compact, stmt) field === nothing && continue @@ -819,32 +668,7 @@ function sroa_pass!(ir::IRCode) continue end - # analyze this mutable struct here for the later pass - if ismutabletype(struct_typ) - isa(val, SSAValue) || continue - let intermediaries = SPCSet() - callback = function (@nospecialize(pi), @nospecialize(ssa)) - push!(intermediaries, ssa.id) - return false - end - def = simple_walk(compact, val, callback) - # Mutable stuff here - isa(def, SSAValue) || continue - if defuses === nothing - defuses = IdDict{Int, Tuple{SPCSet, SSADefUse}}() - end - mid, defuse = get!(defuses, def.id, (SPCSet(), SSADefUse())) - if is_setfield - push!(defuse.defs, idx) - else - push!(defuse.uses, idx) - end - union!(mid, intermediaries) - end - continue - elseif is_setfield - continue # invalid `setfield!` call, but just ignore here - end + ismutabletype(struct_typ) && continue # mutable SROA is performed later # perform SROA on immutable structs here on @@ -882,173 +706,503 @@ function sroa_pass!(ir::IRCode) end non_dce_finish!(compact) - if defuses !== nothing - # now go through analyzed mutable structs and see which ones we can eliminate - # NOTE copy the use count here, because `simple_dce!` may modify it and we need it - # consistent with the state of the IR here (after tracking `PhiNode` arguments, - # but before the DCE) for our predicate within `sroa_mutables!`, but we also - # try an extra effort using a callback so that reference counts are updated - used_ssas = copy(compact.used_ssas) - simple_dce!(compact, (x::SSAValue) -> used_ssas[x.id] -= 1) - ir = complete(compact) - sroa_mutables!(ir, defuses, used_ssas, get_domtree) - return ir - else - simple_dce!(compact) - return complete(compact) - end + simple_dce!(compact) + ir = complete(compact) + return ir, memory_opt, get_domtree end -function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}, get_domtree) - for (idx, (intermediaries, defuse)) in defuses - intermediaries = collect(intermediaries) - # Check if there are any uses we did not account for. If so, the variable - # escapes and we cannot eliminate the allocation. This works, because we're guaranteed - # not to include any intermediaries that have dead uses. As a result, missing uses will only ever - # show up in the nuses_total count. 
-        nleaves = length(defuse.uses) + length(defuse.defs) + length(defuse.ccall_preserve_uses)
-        nuses = 0
-        for idx in intermediaries
-            nuses += used_ssas[idx]
+function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any})
+    newex = Expr(:foreigncall)
+    nccallargs = length(origex.args[3]::SimpleVector)
+    for i in 1:(6+nccallargs-1)
+        push!(newex.args, origex.args[i])
+    end
+    for i in (6+nccallargs):length(origex.args)
+        x = origex.args[i]
+        # don't need to preserve intermediaries
+        if isa(x, SSAValue) && x.id in intermediates
+            continue
         end
-        nuses_total = used_ssas[idx] + nuses - length(intermediaries)
-        nleaves == nuses_total || continue
-        # Find the type for this allocation
-        defexpr = ir[SSAValue(idx)][:inst]
-        isexpr(defexpr, :new) || continue
-        newidx = idx
-        typ = ir.stmts[newidx][:type]
-        if isa(typ, UnionAll)
-            typ = unwrap_unionall(typ)
+        push!(newex.args, x)
+    end
+    for i in 1:length(new_preserves)
+        push!(newex.args, new_preserves[i])
+    end
+    return newex
+end
+
+import .EscapeAnalysis:
+    EscapeState, EscapeInfo, IndexableFields, LivenessSet, getaliases, LocalUse, LocalDef
+
+"""
+    memory_opt_pass!(ir::IRCode, estate::EscapeState) -> newir::IRCode
+
+Performs memory optimizations using escape information analyzed by `EscapeAnalysis`.
+Specifically, this optimization pass does SROA of mutable allocations.
+
+`estate::EscapeState` is expected to be a result of `analyze_escapes(ir, ...)`.
+Since running `analyze_escapes` can be relatively expensive, it is recommended
+to run this pass "selectively", i.e. only when the memory optimizations seem profitable.
+"""
+function memory_opt_pass!(ir::IRCode, estate::EscapeState, @specialize(get_domtree))
+    # Compute domtree now, needed below, now that we have finished compacting the IR.
+    # This needs to be after we iterate through the IR with `IncrementalCompact`
+    # because removing dead blocks can invalidate the domtree.
+    # TODO initialization of the domtree can be delayed to avoid the expensive computation
+    # in cases when there are no loads to be forwarded
+    workingset = BitSet(1:length(ir.stmts)+length(ir.new_nodes.stmts))
+    eliminated = BitSet()
+    revisit = Tuple{#=related=#Vector{SSAValue}, #=Liveness=#LivenessSet}[]
+    allpreserved = true
+    newpreserves = nothing
+    while !isempty(workingset)
+        idx = pop!(workingset)
+        ssa = SSAValue(idx)
+        stmt = ir[ssa][:inst]
+        # NOTE `linear_pass!` can't eliminate immutables wrapped by mutables,
+        # but the EA-based alias analysis may be able to eliminate them also
+        isexpr(stmt, :new) || is_known_call(stmt, tuple, ir) || continue
+        einfo = estate[ssa]
+        is_load_forwardable(einfo) || continue
+        aliases = getaliases(ssa, estate)
+        if aliases === nothing
+            related = SSAValue[ssa]
+        else
+            related = SSAValue[]
+            for alias in aliases
+                @assert isa(alias, SSAValue) "invalid escape analysis"
+                push!(related, alias)
+                delete!(workingset, alias.id)
+            end
         end
-        # Could still end up here if we tried to setfield! on an immutable, which would
-        # error at runtime, but is not illegal to have in the IR.
-        ismutabletype(typ) || continue
-        typ = typ::DataType
-        # Partition defuses by field
-        fielddefuse = SSADefUse[SSADefUse() for _ = 1:fieldcount(typ)]
-        all_forwarded = true
-        for use in defuse.uses
-            stmt = ir[SSAValue(use)][:inst] # == `getfield` call
-            # We may have discovered above that this use is dead
-            # after the getfield elim of immutables. In that case,
-            # it would have been deleted. 
That's fine, just ignore - # the use in that case. - if stmt === nothing - all_forwarded = false - continue + finfos = (einfo.AliasInfo::IndexableFields).infos + nflds = length(finfos) + + # Partition defuses by field, and object identity + fdefuses = IdDict{Tuple{Int,SSAValue},FieldDefUse}() + for fidx = 1:nflds + finfo = finfos[fidx] + for fx in finfo + if isa(fx, LocalUse) + use = fx.idx + stmt = ir[SSAValue(use)][:inst] # use (getfield call) + @assert is_known_call(stmt, getfield, ir) + obj = stmt.args[2] + @assert isa(obj, SSAValue) + fdu = get!(()->FieldDefUse(), fdefuses, (fidx, obj)) + push!(fdu.uses, GetfieldLoad(use)) + elseif isa(fx, LocalDef) + def = fx.idx + obj = SSAValue(def) + stmt = ir[obj][:inst] # def (setfield! call, tuple call or :new expression) + for rel in related + if isexpr(stmt, :new) || is_known_call(stmt, tuple, ir) + relstmt = ir[rel][:inst] + if isexpr(relstmt, :new) || is_known_call(relstmt, tuple, ir) + rel !== obj && continue + end + end + fdu = get!(()->FieldDefUse(), fdefuses, (fidx, rel)) + push!(fdu.defs, def) + end + end end - field = try_compute_fieldidx_stmt(ir, stmt::Expr, typ) - field === nothing && @goto skip - push!(fielddefuse[field].uses, use) end - for def in defuse.defs - stmt = ir[SSAValue(def)][:inst]::Expr # == `setfield!` call - field = try_compute_fieldidx_stmt(ir, stmt, typ) - field === nothing && @goto skip - isconst(typ, field) && @goto skip # we discovered an attempt to mutate a const field, which must error - push!(fielddefuse[field].defs, def) + + Liveness = einfo.Liveness + for livepc in Liveness + livestmt = ir[SSAValue(livepc)][:inst] + if is_known_call(livestmt, Core.ifelse, ir) || + is_known_call(livestmt, tuple, ir) || + is_known_call(livestmt, arrayset, ir) + # TODO the succeeding domination analysis doesn't account for flow sensitivity + # introduced by those program constructs, just give up SROA for now + @goto next_itr + elseif is_known_call(livestmt, isdefined, ir) + args = livestmt.args + length(args) ≥ 3 || continue + obj = args[2] + isa(obj, SSAValue) || continue + obj in related || continue + fld = args[3] + fldval = try_compute_field(ir, fld) + fldval === nothing && continue + typ = unwrap_unionall(widenconst(argextype(obj, ir))) + isa(typ, DataType) || continue + fidx = try_compute_fieldidx(typ, fldval) + fidx === nothing && continue + fdu = get!(()->FieldDefUse(), fdefuses, (fidx, obj)) + push!(fdu.uses, IsdefinedUse(livepc)) + elseif isexpr(livestmt, :foreigncall) + # we shouldn't eliminate this use if it's used as a direct argument + args = livestmt.args + nccallargs = length(args[3]::SimpleVector) + for i = 6:(5+nccallargs) + arg = args[i] + isa(arg, SSAValue) && arg in related && @goto next_liveness + end + # this use is preserve, and may be eliminable + for i = (6+nccallargs):length(args) + arg = args[i] + if isa(arg, SSAValue) && arg in related + for fidx in 1:nflds + fdu = get!(()->FieldDefUse(), fdefuses, (fidx, arg)) + push!(fdu.uses, PreserveUse(livepc)) + end + end + end + end + @label next_liveness end - # Check that the defexpr has defined values for all the fields - # we're accessing. In the future, we may want to relax this, - # but we should come up with semantics for well defined semantics - # for uninitialized fields first. 
-        ndefuse = length(fielddefuse)
-        blocks = Vector{Tuple{#=phiblocks=# Vector{Int}, #=allblocks=# Vector{Int}}}(undef, ndefuse)
-        for fidx in 1:ndefuse
-            du = fielddefuse[fidx]
-            isempty(du.uses) && continue
-            push!(du.defs, newidx)
-            ldu = compute_live_ins(ir.cfg, du)
+
+    for ((fidx, objssa), fdu) in fdefuses
+        isempty(fdu.uses) && @goto next_field
+        # check if all uses have safe definitions first, otherwise we should bail out
+        # since then we may fail to form new ϕ-nodes
+        ldu = compute_live_ins(ir.cfg, fdu)
         if isempty(ldu.live_in_bbs)
             phiblocks = Int[]
         else
             phiblocks = iterated_dominance_frontier(ir.cfg, ldu, get_domtree())
         end
-            allblocks = sort(vcat(phiblocks, ldu.def_bbs))
-            blocks[fidx] = phiblocks, allblocks
-            if fidx + 1 > length(defexpr.args)
-                for use in du.uses
-                    has_safe_def(ir, get_domtree(), allblocks, du, newidx, use) || @goto skip
-                end
+        obj = ir[objssa][:inst]
+        if isa(obj, PhiNode)
+            push!(phiblocks, block_for_inst(ir, objssa.id))
         end
-        end
-        # Everything accounted for. Go field by field and perform idf:
-        # Compute domtree now, needed below, now that we have finished compacting the IR.
-        # This needs to be after we iterate through the IR with `IncrementalCompact`
-        # because removing dead blocks can invalidate the domtree.
-        domtree = get_domtree()
-        preserve_uses = isempty(defuse.ccall_preserve_uses) ? nothing :
-            IdDict{Int, Vector{Any}}((idx=>Any[] for idx in SPCSet(defuse.ccall_preserve_uses)))
-        for fidx in 1:ndefuse
-            du = fielddefuse[fidx]
-            ftyp = fieldtype(typ, fidx)
-            if !isempty(du.uses)
-                phiblocks, allblocks = blocks[fidx]
-                phinodes = IdDict{Int, SSAValue}()
-                for b in phiblocks
-                    phinodes[b] = insert_node!(ir, first(ir.cfg.blocks[b].stmts),
-                        NewInstruction(PhiNode(), ftyp))
+        allblocks = sort!(vcat(phiblocks, ldu.def_bbs))
+        for use in fdu.uses
+            isa(use, IsdefinedUse) && continue
+            if isa(use, PreserveUse) && isempty(fdu.defs)
+                # nothing to preserve, just ignore this use (may happen when there are uninitialized fields)
+                continue
                 end
-                # Now go through all uses and rewrite them
-                for stmt in du.uses
-                    ir[SSAValue(stmt)][:inst] = compute_value_for_use(ir, domtree, allblocks, du, phinodes, fidx, stmt)
+            if !has_safe_def(ir, get_domtree(), allblocks, fdu, getuseidx(use))
+                allpreserved = false
+                @goto next_field
                 end
-                if !isbitstype(ftyp)
-                    if preserve_uses !== nothing
-                        for (use, list) in preserve_uses
-                            push!(list, compute_value_for_use(ir, domtree, allblocks, du, phinodes, fidx, use))
+        end
+        phinodes = IdDict{Int, SSAValue}()
+        for b in phiblocks
+            phinodes[b] = insert_node!(ir, first(ir.cfg.blocks[b].stmts),
+                NewInstruction(PhiNode(), Any))
+        end
+        # Now go through all uses and rewrite them
+        domtree = get_domtree()
+        for use in fdu.uses
+            if isa(use, GetfieldLoad)
+                use = getuseidx(use)
+                ir[SSAValue(use)][:inst] = compute_value_for_use(
+                    ir, domtree, allblocks, fdu, phinodes, fidx, use)
+                push!(eliminated, use)
+            elseif isa(use, PreserveUse)
+                allpreserved || continue
+                isempty(fdu.defs) && continue # nothing to preserve (may happen when there are uninitialized fields)
+                # record this `use` as replaceable no matter if we preserve new value or not
+                use = getuseidx(use)
+                newval = compute_value_for_use(
+                    ir, domtree, allblocks, fdu, phinodes, fidx, use)
+                if !isbitstype(widenconst(argextype(newval, ir)))
+                    if newpreserves === nothing
+                        newpreserves = IdDict{Int,Vector{Any}}()
                         end
+                    newvalues = get!(()->Any[], newpreserves, use)
+                    push!(newvalues, newval)
                     end
+            elseif isa(use, IsdefinedUse)
+                use = getuseidx(use)
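+                # an `isdefined` check can be folded to `true` only when a "safe" field
+                # definition is found on every path to it, which is what `has_safe_def` verifies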
+                if has_safe_def(ir, domtree, allblocks, fdu, use)
+                    ir[SSAValue(use)][:inst] = true
+                    push!(eliminated, use)
                 end
+            else
+                throw("unexpected use")
             end
-            for b in phiblocks
-                n = ir[phinodes[b]][:inst]::PhiNode
+        end
+        for b in phiblocks
+            ϕssa = phinodes[b]
+            n = ir[ϕssa][:inst]::PhiNode
+            t = Bottom
+            if isa(obj, PhiNode)
+                for i = 1:length(obj.edges)
+                    isassigned(obj.edges, i) || continue
+                    p = obj.edges[i]
+                    push!(n.edges, p)
+                    v = compute_value_for_block(ir, domtree, allblocks,
+                        fdefuses[(fidx, obj.values[i]::SSAValue)], phinodes, fidx, Int(p))
+                    push!(n.values, v)
+                    if t !== Any
+                        t = tmerge(t, argextype(v, ir))
+                    end
+                end
+            else
                 for p in ir.cfg.blocks[b].preds
                     push!(n.edges, p)
-                    push!(n.values, compute_value_for_block(ir, domtree,
-                        allblocks, du, phinodes, fidx, p))
+                    v = compute_value_for_block(ir, domtree, allblocks, fdu, phinodes, fidx, p)
+                    push!(n.values, v)
+                    if t !== Any
+                        t = tmerge(t, argextype(v, ir))
+                    end
                 end
             end
+            ir[ϕssa][:type] = t
         end
-            for stmt in du.defs
-                stmt == newidx && continue
-                ir[SSAValue(stmt)][:inst] = nothing
-            end
+        @label next_field
         end
-        preserve_uses === nothing && continue
-        if all_forwarded
-            # this means all ccall preserves have been replaced with forwarded loads
-            # so we can potentially eliminate the allocation, otherwise we must preserve
-            # the whole allocation.
-            push!(intermediaries, newidx)
+        push!(revisit, (related, Liveness))
+        @label next_itr
+    end
+
+    # remove dead setfield! and :new allocs
+    deadssas = IdSet{SSAValue}()
+    if allpreserved && newpreserves !== nothing
+        preserved = keys(newpreserves)
+    else
+        preserved = EMPTY_PRESERVED_SSAS
+    end
+    mark_dead_ssas!(ir, deadssas, revisit, eliminated, preserved)
+    for ssa in deadssas
+        ir[ssa][:inst] = nothing
+    end
+    if allpreserved && newpreserves !== nothing
+        deadssas = Int[ssa.id for ssa in deadssas]
+        for (idx, newuses) in newpreserves
+            ir[SSAValue(idx)][:inst] = form_new_preserves(
+                ir[SSAValue(idx)][:inst]::Expr, deadssas, newuses)
         end
-        # Insert the new preserves
-        for (use, new_preserves) in preserve_uses
-            ir[SSAValue(use)][:inst] = form_new_preserves(ir[SSAValue(use)][:inst]::Expr, intermediaries, new_preserves)
+    end
+
+    return ir
+end
+
+const EMPTY_PRESERVED_SSAS = keys(IdDict{Int,Vector{Any}}())
+const PreservedSets = typeof(EMPTY_PRESERVED_SSAS)
+
+function is_load_forwardable(x::EscapeInfo)
+    AliasInfo = x.AliasInfo
+    return isa(AliasInfo, IndexableFields)
+end
+
+struct FieldDefUse
+    uses::Vector{Any}
+    defs::Vector{Int}
+end
+FieldDefUse() = FieldDefUse(Any[], Int[])
+struct GetfieldLoad
+    idx::Int
+end
+struct PreserveUse
+    idx::Int
+end
+struct IsdefinedUse
+    idx::Int
+end
+function getuseidx(@nospecialize use)
+    if isa(use, GetfieldLoad)
+        return use.idx
+    elseif isa(use, PreserveUse)
+        return use.idx
+    elseif isa(use, IsdefinedUse)
+        return use.idx
+    end
+    throw("getuseidx: unexpected use")
+end
+
+function compute_live_ins(cfg::CFG, fdu::FieldDefUse)
+    uses = Int[]
+    for use in fdu.uses
+        isa(use, IsdefinedUse) && continue
+        push!(uses, getuseidx(use))
+    end
+    return compute_live_ins(cfg, fdu.defs, uses)
+end
+
+# even when the allocation contains an uninitialized field, we try an extra effort to check
+# if the load at `use` has any "safe" `setfield!` calls that define the field
+function has_safe_def(ir::IRCode, domtree::DomTree, allblocks::Vector{Int},
+    fdu::FieldDefUse, use::Int)
+    dfu = find_def_for_use(ir, domtree, allblocks, fdu, use)
+    dfu === nothing && return false
+    def = dfu[1]
+    def ≠ 0 && return true # found a "safe" definition
+    # we may still be able to 
replace this load with `PhiNode` -- examine if all predecessors of + # this `block` have any "safe" definition + block = block_for_inst(ir, use) + seen = BitSet(block) + worklist = BitSet(ir.cfg.blocks[block].preds) + isempty(worklist) && return false + while !isempty(worklist) + pred = pop!(worklist) + # if this block has already been examined, bail out to avoid infinite cycles + pred in seen && return false + use = last(ir.cfg.blocks[pred].stmts) + # NOTE this `use` isn't a load, and so the inclusive condition can be used + dfu = find_def_for_use(ir, domtree, allblocks, fdu, use, true) + dfu === nothing && return false + def = dfu[1] + push!(seen, pred) + def ≠ 0 && continue # found a "safe" definition for this predecessor + # if not, check for the predecessors of this predecessor + for newpred in ir.cfg.blocks[pred].preds + push!(worklist, newpred) end + end + return true +end - @label skip +# find the first dominating def for the given use +function find_def_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, use::Int, inclusive::Bool=false) + useblock = block_for_inst(ir.cfg, use) + curblock = find_curblock(domtree, allblocks, useblock) + curblock === nothing && return nothing + local def = 0 + for idx in fdu.defs + if block_for_inst(ir.cfg, idx) == curblock + if curblock != useblock + # Find the last def in this block + def = max(def, idx) + else + # Find the last def before our use + if inclusive + def = max(def, idx ≤ use ? idx : 0) + else + def = max(def, idx < use ? idx : 0) + end + end + end end + return def, useblock, curblock end -function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any}) - newex = Expr(:foreigncall) - nccallargs = length(origex.args[3]::SimpleVector) - for i in 1:(6+nccallargs-1) - push!(newex.args, origex.args[i]) +function find_curblock(domtree::DomTree, allblocks::Vector{Int}, curblock::Int) + # TODO: This can be much faster by looking at current level and only + # searching for those blocks in a sorted order + while !(curblock in allblocks) + curblock = domtree.idoms_bb[curblock] + curblock == 0 && return nothing end - for i in (6+nccallargs):length(origex.args) - x = origex.args[i] - # don't need to preserve intermediaries - if isa(x, SSAValue) && x.id in intermediates - continue + return curblock +end + +function compute_value_for_use(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, use::Int) + dfu = find_def_for_use(ir, domtree, allblocks, fdu, use) + @assert dfu !== nothing "has_safe_def condition unsatisfied" + def, useblock, curblock = dfu + if def == 0 + if !haskey(phinodes, curblock) + # If this happens, we need to search the predecessors for defs. 
Which + # one doesn't matter - if it did, we'd have had a phinode + return compute_value_for_block(ir, domtree, allblocks, fdu, phinodes, fidx, first(ir.cfg.blocks[useblock].preds)) end - push!(newex.args, x) + # The use is the phinode + return phinodes[curblock] + else + return val_for_def_expr(ir, def, fidx) end - for i in 1:length(new_preserves) - push!(newex.args, new_preserves[i]) +end + +function compute_value_for_block(ir::IRCode, domtree::DomTree, allblocks::Vector{Int}, + fdu::FieldDefUse, phinodes::IdDict{Int, SSAValue}, fidx::Int, curblock::Int) + curblock = find_curblock(domtree, allblocks, curblock) + @assert curblock !== nothing "has_safe_def condition unsatisfied" + def = 0 + for stmt in fdu.defs + if block_for_inst(ir.cfg, stmt) == curblock + def = max(def, stmt) + end end - return newex + return def == 0 ? phinodes[curblock] : val_for_def_expr(ir, def, fidx) +end + +function val_for_def_expr(ir::IRCode, def::Int, fidx::Int) + ex = ir[SSAValue(def)][:inst] + if isexpr(ex, :new) || is_known_call(ex, tuple, ir) + return ex.args[1+fidx] + else + @assert is_known_call(ex, setfield!, ir) "invalid load forwarding" + return ex.args[4] + end +end + +function mark_dead_ssas!(ir::IRCode, deadssas::IdSet{SSAValue}, + revisit::Vector{Tuple{Vector{SSAValue},LivenessSet}}, eliminated::BitSet, + preserved::PreservedSets) + workingset = BitSet(1:length(revisit)) + while !isempty(workingset) + revisit_idx = pop!(workingset) + mark_dead_ssas!(ir, deadssas, revisit, eliminated, preserved, workingset, revisit_idx) + end +end + +function mark_dead_ssas!(ir::IRCode, deadssas::IdSet{SSAValue}, + revisit::Vector{Tuple{Vector{SSAValue},LivenessSet}}, eliminated::BitSet, + preserved::PreservedSets, workingset::BitSet, revisit_idx::Int) + related, Liveness = revisit[revisit_idx] + eliminable = SSAValue[] + for livepc in Liveness + livepc in eliminated && @goto next_live + ssa = SSAValue(livepc) + stmt = ir[ssa][:inst] + if isexpr(stmt, :new) + ssa in deadssas && @goto next_live + for new_revisit_idx in workingset + if ssa in revisit[new_revisit_idx][1] + delete!(workingset, new_revisit_idx) + if mark_dead_ssas!(ir, deadssas, + revisit, eliminated, + preserved, workingset, new_revisit_idx) + push!(eliminable, ssa) + @goto next_live + else + return false + end + end + end + return false + elseif is_known_call(stmt, setfield!, ir) + @assert length(stmt.args) ≥ 4 "invalid escape analysis" + obj = stmt.args[2] + val = stmt.args[4] + if isa(obj, SSAValue) + if obj in related + push!(eliminable, ssa) + @goto next_live + end + if isa(val, SSAValue) && val in related + if obj in deadssas + push!(eliminable, ssa) + @goto next_live + end + for new_revisit_idx in workingset + if obj in revisit[new_revisit_idx][1] + delete!(workingset, new_revisit_idx) + if mark_dead_ssas!(ir, deadssas, + revisit, eliminated, + preserved, workingset, new_revisit_idx) + push!(eliminable, ssa) + @goto next_live + else + return false + end + end + end + end + end + return false + elseif isexpr(stmt, :foreigncall) + livepc in preserved && @goto next_live + return false + else + return false + end + @label next_live + end + for ssa in related; push!(deadssas, ssa); end + for ssa in eliminable; push!(deadssas, ssa); end + return true end """ @@ -1125,15 +1279,15 @@ In addition to a simple DCE for unused values and allocations, this pass also nullifies `typeassert` calls that can be proved to be no-op, in order to allow LLVM to emit simpler code down the road. -Note that this pass is more effective after SROA optimization (i.e. 
`sroa_pass!`), +Note that this pass is more effective after SROA optimization (i.e. `linear_pass!`), since SROA often allows this pass to: - eliminate allocation of object whose field references are all replaced with scalar values, and - nullify `typeassert` call whose first operand has been replaced with a scalar value (, which may have introduced new type information that inference did not understand) -Also note that currently this pass _needs_ to run after `sroa_pass!`, because +Also note that currently this pass _needs_ to run after `linear_pass!`, because the `typeassert` elimination depends on the transformation by `canonicalize_typeassert!` done -within `sroa_pass!` which redirects references of `typeassert`ed value to the corresponding `PiNode`. +within `linear_pass!` which redirects references of `typeassert`ed value to the corresponding `PiNode`. """ function adce_pass!(ir::IRCode) phi_uses = fill(0, length(ir.stmts) + length(ir.new_nodes)) diff --git a/test/compiler/EscapeAnalysis/EAUtils.jl b/test/compiler/EscapeAnalysis/EAUtils.jl index 3ae9b41a0ddac..7ef50d5434932 100644 --- a/test/compiler/EscapeAnalysis/EAUtils.jl +++ b/test/compiler/EscapeAnalysis/EAUtils.jl @@ -71,8 +71,8 @@ import Core: CodeInstance, MethodInstance, CodeInfo import .CC: InferenceResult, OptimizationState, IRCode, copy as cccopy, - @timeit, convert_to_ircode, slot2reg, compact!, ssa_inlining_pass!, sroa_pass!, - adce_pass!, type_lift_pass!, JLOptions, verify_ir, verify_linetable + @timeit, convert_to_ircode, slot2reg, compact!, ssa_inlining_pass!, linear_pass!, + memory_opt_pass!, adce_pass!, type_lift_pass!, JLOptions, verify_ir, verify_linetable import .EA: analyze_escapes, ArgEscapeCache, EscapeInfo, EscapeState, is_ipo_profitable # when working outside of Core.Compiler, @@ -227,6 +227,7 @@ function run_passes_with_ea(interp::EscapeAnalyzer, ci::CodeInfo, sv::Optimizati @timeit "Inlining" ir = ssa_inlining_pass!(ir, ir.linetable, sv.inlining, ci.propagate_inbounds) # @timeit "verify 2" verify_ir(ir) @timeit "compact 2" ir = compact!(ir) + @timeit "SROA" ir, _ = linear_pass!(ir) if caller.linfo.specTypes === interp.entry_tt && interp.optimize try @timeit "[Local EA]" state = analyze_escapes(ir, nargs, true, get_escape_cache(interp)) @@ -240,7 +241,6 @@ function run_passes_with_ea(interp::EscapeAnalyzer, ci::CodeInfo, sv::Optimizati interp.state = state interp.linfo = sv.linfo end - @timeit "SROA" ir = sroa_pass!(ir) @timeit "ADCE" ir = adce_pass!(ir) @timeit "type lift" ir = type_lift_pass!(ir) @timeit "compact 3" ir = compact!(ir) diff --git a/test/compiler/irpasses.jl b/test/compiler/irpasses.jl index 045cf833944c2..820958b7e7df5 100644 --- a/test/compiler/irpasses.jl +++ b/test/compiler/irpasses.jl @@ -2,7 +2,9 @@ using Test using Base.Meta -using Core: PhiNode, SSAValue, GotoNode, PiNode, QuoteNode, ReturnNode, GotoIfNot +import Core: + CodeInfo, Argument, SSAValue, GotoNode, GotoIfNot, PiNode, PhiNode, + QuoteNode, ReturnNode include(normpath(@__DIR__, "irutils.jl")) @@ -12,7 +14,7 @@ include(normpath(@__DIR__, "irutils.jl")) ## Test that domsort doesn't mangle single-argument phis (#29262) let m = Meta.@lower 1 + 1 @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo src.code = Any[ # block 1 Expr(:call, :opaque), @@ -47,7 +49,7 @@ end # test that we don't stack-overflow in SNCA with large functions. 
let m = Meta.@lower 1 + 1 @assert Meta.isexpr(m, :thunk) - src = m.args[1]::Core.CodeInfo + src = m.args[1]::CodeInfo code = Any[] N = 2^15 for i in 1:2:N @@ -73,30 +75,87 @@ end # SROA # ==== +import Core.Compiler: widenconst + +is_load_forwarded(src::CodeInfo) = !any(iscall((src, getfield)), src.code) +is_scalar_replaced(src::CodeInfo) = + is_load_forwarded(src) && !any(iscall((src, setfield!)), src.code) && !any(isnew, src.code) + +function is_load_forwarded(@nospecialize(T), src::CodeInfo) + for i in 1:length(src.code) + x = src.code[i] + if iscall((src, getfield), x) + widenconst(argextype(x.args[1], src)) <: T && return false + end + end + return true +end +function is_scalar_replaced(@nospecialize(T), src::CodeInfo) + is_load_forwarded(T, src) || return false + for i in 1:length(src.code) + x = src.code[i] + if iscall((src, setfield!), x) + widenconst(argextype(x.args[1], src)) <: T && return false + elseif isnew(x) + widenconst(argextype(SSAValue(i), src)) <: T && return false + end + end + return true +end + struct ImmutableXYZ; x; y; z; end mutable struct MutableXYZ; x; y; z; end +struct ImmutableOuter{T}; x::T; y::T; z::T; end +mutable struct MutableOuter{T}; x::T; y::T; z::T; end +struct ImmutableRef{T}; x::T; end +Base.getindex(r::ImmutableRef) = r.x +mutable struct SafeRef{T}; x::T; end +Base.getindex(s::SafeRef) = getfield(s, 1) +Base.setindex!(s::SafeRef, x) = setfield!(s, 1, x) + +# simple immutability +# ------------------- -# should optimize away very basic cases let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = (x, y, z) + xyz[1], xyz[2], xyz[3] + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end + +# simple mutability +# ----------------- + let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end end - -# should handle simple mutabilities let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) xyz.y = 42 xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=x=# Core.Argument(2), 42, #=x=# Core.Argument(4)] @@ -107,19 +166,23 @@ let src = code_typed1((Any,Any,Any)) do x, y, z xyz.x, xyz.z = xyz.z, xyz.x xyz.x, xyz.y, xyz.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] end end -# circumvent uninitialized fields as far as there is a solid `setfield!` definition + +# uninitialized fields +# -------------------- + +# safe cases let src = code_typed1() do r = Ref{Any}() r[] = 42 return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,)) do cond r 
= Ref{Any}() @@ -131,7 +194,7 @@ let src = code_typed1((Bool,)) do cond return r[] end end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,)) do cond r = Ref{Any}() @@ -142,7 +205,7 @@ let src = code_typed1((Bool,)) do cond end return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Bool,Bool,Any,Any,Any)) do c1, c2, x, y, z r = Ref{Any}() @@ -157,7 +220,16 @@ let src = code_typed1((Bool,Bool,Any,Any,Any)) do c1, c2, x, y, z end return r[] end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) +end + +# unsafe cases +let src = code_typed1() do + r = Ref{Any}() + return r[] + end + @test count(isnew, src.code) == 1 + @test count(iscall((src, getfield)), src.code) == 1 end let src = code_typed1((Bool,)) do cond r = Ref{Any}() @@ -167,7 +239,9 @@ let src = code_typed1((Bool,)) do cond return r[] end # N.B. `r` should be allocated since `cond` might be `false` and then it will be thrown - @test any(isnew, src.code) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 + @test count(iscall((src, getfield)), src.code) == 1 end let src = code_typed1((Bool,Bool,Any,Any)) do c1, c2, x, y r = Ref{Any}() @@ -181,12 +255,119 @@ let src = code_typed1((Bool,Bool,Any,Any)) do c1, c2, x, y return r[] end # N.B. `r` should be allocated since `c2` might be `false` and then it will be thrown - @test any(isnew, src.code) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 2 + @test count(iscall((src, getfield)), src.code) == 1 end -# should include a simple alias analysis -struct ImmutableOuter{T}; x::T; y::T; z::T; end -mutable struct MutableOuter{T}; x::T; y::T; z::T; end +# load forwarding +# --------------- +# even if allocation can't be eliminated + +# safe cases +for T in (ImmutableRef{Any}, Ref{Any}) + let src = @eval code_typed1((Bool,Any,)) do c, a + r = $T(a) + if c + return r[] + else + return r + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + end + let src = @eval code_typed1((Bool,String,)) do c, a + r = $T(a) + if c + return r[]::String # adce_pass! 
will further eliminate this type assert call also + else + return r + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test !any(iscall((src, typeassert)), src.code) + end + let src = @eval code_typed1((Bool,Any,)) do c, a + r = $T(a) + if c + return r[] + else + throw(r) + end + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + end +end +let src = code_typed1((Bool,Any,Any)) do c, a, b + r = Ref{Any}(a) + if c + return r[] + end + r[] = b + return r + end + @test is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 + @test count(src.code) do @nospecialize x + isreturn(x) && x.val === Argument(3) # a + end == 1 +end + +# unsafe case +let src = code_typed1((Bool,Any,Any)) do c, a, b + r = Ref{Any}(a) + r[] = b + @noinline some_escape!(r) + return r[] + end + @test !is_load_forwarded(src) + @test count(isnew, src.code) == 1 + @test count(iscall((src, setfield!)), src.code) == 1 +end +let src = code_typed1((Bool,String,Regex)) do c, a, b + r1 = Ref{Any}(a) + r2 = Ref{Any}(b) + return ifelse(c, r1, r2)[] + end + r = only(findall(isreturn, src.code)) + v = (src.code[r]::Core.ReturnNode).val + @test v !== Argument(3) # a + @test v !== Argument(4) # b + @test_broken is_scalar_replaced(src) # ideally +end +let src = code_typed1((Bool,String,Regex)) do c, a, b + r1 = Ref{Any}(a) + r2 = Ref{Any}(b) + t = (r1, r2) + return t[c ? 1 : 2][] + end + r = only(findall(isreturn, src.code)) + v = (src.code[r]::Core.ReturnNode).val + @test v !== Argument(3) # a + @test v !== Argument(4) # b + @test_broken is_scalar_replaced(src) # ideally +end +let src = code_typed1((Bool,String,Regex)) do c, a, b + r1 = Ref{Any}(a) + r2 = Ref{Any}(b) + a = [r1, r2] + return a[c ? 
1 : 2][] + end + r = only(findall(isreturn, src.code)) + v = (src.code[r]::Core.ReturnNode).val + @test v !== Argument(3) # a + @test v !== Argument(4) # b + @test_broken is_scalar_replaced(src) # ideally +end + +# aliased load forwarding +# ----------------------- + +# OK: immutable(immutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = ImmutableOuter(xyz, xyz, xyz) @@ -214,22 +395,21 @@ let src = code_typed1((Any,Any,Any)) do x, y, z end end -# FIXME our analysis isn't yet so powerful at this moment: may be unable to handle nested objects well -# OK: mutable(immutable(...)) case +# OK: immutable(mutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) t = (xyz,) v = t[1].x v, v, v end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) end let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) outer = ImmutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test !any(isnew, src.code) + @test is_scalar_replaced(src) @test any(src.code) do @nospecialize x iscall((src, tuple), x) && x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=y=# Core.Argument(4)] @@ -240,32 +420,541 @@ let # this is a simple end to end test case, which demonstrates allocation elimi # NOTE this test case isn't so robust and might be subject to future changes of the broadcasting implementation, # in that case you don't really need to stick to keeping this test case around simple_sroa(s) = broadcast(identity, Ref(s)) + let src = code_typed1(simple_sroa, (String,)) + @test is_scalar_replaced(src) + end s = Base.inferencebarrier("julia")::String simple_sroa(s) # NOTE don't hard-code `"julia"` in `@allocated` clause and make sure to execute the # compiled code for `simple_sroa`, otherwise everything can be folded even without SROA @test @allocated(simple_sroa(s)) == 0 end -# FIXME: immutable(mutable(...)) case +let # some insanely nested example + src = code_typed1((Int,)) do x + (Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref(Ref((x))))))))))))[][][][][][][][][][] + end + @test is_scalar_replaced(src) +end + +# OK: mutable(immutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = ImmutableXYZ(x, y, z) outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken !any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((String,String,String)) do x, y, z + xyz = (x, y, z) + r = Ref(xyz) + return r[][3], r[][2], r[][1] + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end + +# OK: mutable(mutable(...)) case +# new chain +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + outer = MutableOuter(xyz, xyz, xyz) + outer.x.x, outer.y.y, outer.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end end -# FIXME: mutable(mutable(...)) case let src = code_typed1((Any,Any,Any)) do x, y, z xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = z, y, x outer = MutableOuter(xyz, xyz, xyz) outer.x.x, outer.y.y, outer.z.z end - @test_broken 
!any(isnew, src.code) + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = xyz.z, xyz.y, xyz.x + outer = MutableOuter(xyz, xyz, xyz) + outer.x.x, outer.y.y, outer.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + inner = MutableOuter(xyz, xyz, xyz) + outer = MutableOuter(inner, inner, inner) + outer.x.x.x, outer.y.y.y, outer.z.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + xyz.x, xyz.y, xyz.z = z, y, x + inner = MutableOuter(xyz, xyz, xyz) + outer = MutableOuter(inner, inner, inner) + outer.x.x.x, outer.y.y.y, outer.z.z.z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=z=# Core.Argument(4), #=y=# Core.Argument(3), #=x=# Core.Argument(2)] + end +end +# setfield! chain +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + outer = Ref{MutableXYZ}() + outer[] = xyz + return outer[].x, outer[].y, outer[].z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), #=z=# Core.Argument(4)] + end +end +let src = code_typed1((Any,Any,Any)) do x, y, z + xyz = MutableXYZ(x, y, z) + outer = Ref{MutableXYZ}() + outer[] = xyz + xyz.z = 42 + return outer[].x, outer[].y, outer[].z + end + @test is_scalar_replaced(src) + @test any(src.code) do @nospecialize x + iscall((src, tuple), x) && + x.args[2:end] == Any[#=x=# Core.Argument(2), #=y=# Core.Argument(3), 42] + end +end + +# ϕ-allocation elimination +# ------------------------ -let # should work with constant globals - # immutable case - # -------------- +# safe cases +let src = code_typed1((Bool,Any,Any)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + end + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(3) in x.values && + #=y=# Core.Argument(4) in x.values + end == 1 +end +let src = code_typed1((Bool,Bool,Any,Any,Any)) do cond1, cond2, x, y, z + if cond1 + ϕ = Ref{Any}(x) + elseif cond2 + ϕ = Ref{Any}(y) + else + ϕ = Ref{Any}(z) + end + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.PhiNode) && + #=x=# Core.Argument(4) in x.values && + #=y=# Core.Argument(5) in x.values && + #=z=# Core.Argument(6) in x.values + end == 1 +end +let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z + if cond + ϕ = Ref{Any}(x) + else + ϕ = Ref{Any}(y) + end + ϕ[] = z + ϕ[] + end + @test is_scalar_replaced(src) + @test count(src.code) do @nospecialize x + isa(x, Core.ReturnNode) && + #=z=# Core.Argument(5) === x.val + end == 1 +end +let src = code_typed1((Bool,Any,Any,)) do cond, x, y + if cond + ϕ = Ref{Any}(x) + out1 = ϕ[] + else + ϕ = Ref{Any}(y) 
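+            # forwarded to `y` within this branch; the `out2` load after the join is lifted to a ϕ-node of `x` and `y`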
+let src = code_typed1((Bool,Any,Any,)) do cond, x, y
+        if cond
+            ϕ = Ref{Any}(x)
+            out1 = ϕ[]
+        else
+            ϕ = Ref{Any}(y)
+            out1 = ϕ[]
+        end
+        out2 = ϕ[]
+        out1, out2
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=y=# Core.Argument(4) in x.values
+    end == 2
+end
+let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z
+        if cond
+            ϕ = Ref{Any}(x)
+        else
+            ϕ = Ref{Any}(y)
+            ϕ[] = z
+        end
+        ϕ[]
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=z=# Core.Argument(5) in x.values
+    end == 1
+end
+let src = code_typed1((Bool,Any,Any,Any)) do cond, x, y, z
+        if cond
+            ϕ = Ref{Any}(x)
+            out1 = ϕ[]
+        else
+            ϕ = Ref{Any}(y)
+            out1 = ϕ[]
+            ϕ[] = z
+        end
+        out2 = ϕ[]
+        out1, out2
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=y=# Core.Argument(4) in x.values
+    end == 1
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=z=# Core.Argument(5) in x.values
+    end == 1
+end
+let src = code_typed1((Bool,Any,Any)) do cond, x, y
+        # these allocations form multiple ϕ-nodes
+        if cond
+            ϕ2 = ϕ1 = Ref{Any}(x)
+        else
+            ϕ2 = ϕ1 = Ref{Any}(y)
+        end
+        ϕ1[], ϕ2[]
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=y=# Core.Argument(4) in x.values
+    end == 2
+end
+let src = code_typed1((Bool,String,)) do cond, x
+        # these allocations form multiple ϕ-nodes
+        if cond
+            ϕ2 = ϕ1 = Ref{Any}("foo")
+        else
+            ϕ2 = ϕ1 = Ref{Any}("bar")
+        end
+        ϕ2[] = x
+        y = ϕ1[] # => x
+        return y
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.ReturnNode) &&
+        #=x=# x.val === Core.Argument(3)
+    end == 1
+end
+let src = code_typed1((Bool,Any,Any,)) do cond, x, y
+        x′ = Ref{Any}(x)
+        y′ = Ref{Any}(y)
+        if cond
+            ϕ = x′
+        else
+            ϕ = y′
+        end
+        ϕ[]
+    end
+    @test is_scalar_replaced(src)
+    @test count(src.code) do @nospecialize x
+        isa(x, Core.PhiNode) &&
+        #=x=# Core.Argument(3) in x.values &&
+        #=y=# Core.Argument(4) in x.values
+    end == 1
+end
+
+# unsafe cases
+let src = code_typed1((Bool,Any,Any)) do cond, x, y
+        if cond
+            ϕ = Ref{Any}(x)
+        else
+            ϕ = Ref{Any}(y)
+        end
+        some_escape!(ϕ)
+        ϕ[]
+    end
+    @test count(isnew, src.code) == 2
+    @test count(iscall((src, getfield)), src.code) == 1
+end
+let src = code_typed1((Bool,Any,Any)) do cond, x, y
+        if cond
+            ϕ = Ref{Any}(x)
+            some_escape!(ϕ)
+        else
+            ϕ = Ref{Any}(y)
+        end
+        ϕ[]
+    end
+    @test count(isnew, src.code) == 2
+    @test count(iscall((src, getfield)), src.code) == 1
+end
+let src = code_typed1((Bool,Any,)) do cond, x
+        if cond
+            ϕ = Ref{Any}(x)
+        else
+            ϕ = Ref{Any}()
+        end
+        ϕ[]
+    end
+    @test count(isnew, src.code) == 2
+    @test count(iscall((src, getfield)), src.code) == 1
+end
+let src = code_typed1((Bool,Any)) do c, a
+        local r
+        if c
+            r = Ref{Any}(a)
+        end
+        (r::Base.RefValue{Any})[]
+    end
+    @test count(isnew, src.code) == 1
+    @test count(iscall((src, getfield)), src.code) == 1
+end
+
+function mutable_ϕ_elim(x, xs)
+    r = Ref(x)
+    for x in xs
+        r = Ref(x)
+    end
+    return r[]
+end
+let src = code_typed1(mutable_ϕ_elim, (String, Vector{String}))
+    @test is_scalar_replaced(src)
+
+    xs = String[string(gensym()) for _ in 1:100]
+    mutable_ϕ_elim("init", xs)
+    @test @allocated(mutable_ϕ_elim("init", xs)) == 0
+end
+
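+# the forwarded loads must stay alias-aware: the two `Ref`s below wrap distinct
+# arrays, so SROA must not forward the same value into both call arguments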
+@noinline mightaliase_noinline(a, b) = Base.mightalias(a, b)
+function assert_no_alias!(a, b, c)
+    x = Ref(a)
+    y = Ref(b)
+    @assert !mightaliase_noinline(x[], y[]) # shouldn't be transformed to `mightaliase_noinline(b, b)`
+    z = c ? x : y
+    z
+end
+let src = code_typed1(assert_no_alias!, (Vector{Any}, Vector{Any}, Bool,))
+    @test count(src.code) do @nospecialize x
+        if isinvoke(:mightaliase_noinline, x)
+            if x.args[3] === Argument(2) # a
+                if x.args[4] === Argument(3) # b
+                    return true
+                end
+            end
+        end
+        return false
+    end == 1
+    a = Any[1,2,3]
+    b = Any[1,2,3]
+    @test assert_no_alias!(a, b, true)[] === a
+end
+
+# demonstrate the power of our field / alias analysis with realistic end-to-end examples
+# adapted from http://wiki.luajit.org/Allocation-Sinking-Optimization#implementation%5B
+abstract type AbstractPoint{T} end
+struct Point{T} <: AbstractPoint{T}
+    x::T
+    y::T
+end
+mutable struct MPoint{T} <: AbstractPoint{T}
+    x::T
+    y::T
+end
+add(a::P, b::P) where P<:AbstractPoint = P(a.x + b.x, a.y + b.y)
+function compute_point(T, n, ax, ay, bx, by)
+    a = T(ax, ay)
+    b = T(bx, by)
+    for i in 0:(n-1)
+        a = add(add(a, b), b)
+    end
+    a.x, a.y
+end
+function compute_point(n, a, b)
+    for i in 0:(n-1)
+        a = add(add(a, b), b)
+    end
+    a.x, a.y
+end
+function compute_point!(n, a, b)
+    for i in 0:(n-1)
+        a′ = add(add(a, b), b)
+        a.x = a′.x
+        a.y = a′.y
+    end
+end
+
+let # immutable case
+    src = code_typed1((Int,)) do n
+        compute_point(Point, n, 1+.5, 2+.5, 2+.25, 4+.75)
+    end
+    @test is_scalar_replaced(Point, src)
+    src = code_typed1((Int,)) do n
+        compute_point(Point, n, 1+.5im, 2+.5im, 2+.25im, 4+.75im)
+    end
+    @test is_scalar_replaced(Point, src)
+    @test is_load_forwarded(ComplexF64, src)
+    @test !is_scalar_replaced(ComplexF64, src)
+
+    # mutable case
+    src = code_typed1((Int,)) do n
+        compute_point(MPoint, n, 1+.5, 2+.5, 2+.25, 4+.75)
+    end
+    @test is_scalar_replaced(MPoint, src)
+    src = code_typed1((Int,)) do n
+        compute_point(MPoint, n, 1+.5im, 2+.5im, 2+.25im, 4+.75im)
+    end
+    @test is_scalar_replaced(MPoint, src)
+    @test is_load_forwarded(ComplexF64, src)
+    @test !is_scalar_replaced(ComplexF64, src)
+end
+compute_point(MPoint, 10, 1+.5, 2+.5, 2+.25, 4+.75)
+compute_point(MPoint, 10, 1+.5im, 2+.5im, 2+.25im, 4+.75im)
+@test @allocated(compute_point(MPoint, 10000, 1+.5, 2+.5, 2+.25, 4+.75)) == 0
+@test @allocated(compute_point(MPoint, 10000, 1+.5im, 2+.5im, 2+.25im, 4+.75im)) == 0
+
+let # immutable case
+    src = code_typed1((Int,)) do n
+        compute_point(n, Point(1+.5, 2+.5), Point(2+.25, 4+.75))
+    end
+    @test is_scalar_replaced(Point, src)
+    src = code_typed1((Int,)) do n
+        compute_point(n, Point(1+.5im, 2+.5im), Point(2+.25im, 4+.75im))
+    end
+    @test is_scalar_replaced(Point, src)
+    @test is_load_forwarded(ComplexF64, src)
+    @test !is_scalar_replaced(ComplexF64, src)
+
+    # mutable case
+    src = code_typed1((Int,)) do n
+        compute_point(n, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75))
+    end
+    @test is_scalar_replaced(MPoint, src)
+    src = code_typed1((Int,)) do n
+        compute_point(n, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im))
+    end
+    @test is_scalar_replaced(MPoint, src)
+    @test is_load_forwarded(ComplexF64, src)
+    @test !is_scalar_replaced(ComplexF64, src)
+end
+compute_point(10, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75))
+compute_point(10, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im))
+@test @allocated(compute_point(10000, MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75))) == 0
+@test @allocated(compute_point(10000, MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im))) == 0
+
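+# in-place variant: the `MPoint` temporaries created inside the loop should be
+# eliminated even though the results are written back into the caller-provided `a`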
+let # mutable case
+    src = code_typed1(compute_point!, (Int,MPoint{Float64},MPoint{Float64}))
+    @test is_scalar_replaced(MPoint, src)
+    src = code_typed1(compute_point!, (Int,MPoint{ComplexF64},MPoint{ComplexF64}))
+    @test is_scalar_replaced(MPoint, src)
+    @test is_load_forwarded(ComplexF64, src)
+    @test !is_scalar_replaced(ComplexF64, src)
+end
+let
+    af, bf = MPoint(1+.5, 2+.5), MPoint(2+.25, 4+.75)
+    ac, bc = MPoint(1+.5im, 2+.5im), MPoint(2+.25im, 4+.75im)
+    compute_point!(10, af, bf)
+    compute_point!(10, ac, bc)
+    @test @allocated(compute_point!(10000, af, bf)) == 0
+    @test @allocated(compute_point!(10000, ac, bc)) == 0
+end
+
+# isdefined elimination
+# ---------------------
+
+let src = code_typed1((Any,)) do a
+        r = Ref{Any}()
+        r[] = a
+        if isassigned(r)
+            return r[]
+        end
+        return nothing
+    end
+    @test is_scalar_replaced(src)
+end
+
+callit(f, args...) = f(args...)
+function isdefined_elim()
+    local arr::Vector{Any}
+    callit() do
+        arr = Any[]
+    end
+    return arr
+end
+let src = code_typed1(isdefined_elim)
+    @test is_scalar_replaced(src)
+end
+@test isdefined_elim() == Any[]
+
+# preserve elimination
+# --------------------
+
+let src = code_typed1((String,)) do s
+        ccall(:some_ccall, Cint, (Ptr{String},), Ref(s))
+    end
+    @test count(isnew, src.code) == 0
+end
+
+# if the mutable struct is directly used, we shouldn't eliminate it
+let src = code_typed1() do
+        a = MutableXYZ(-512275808,882558299,-2133022131)
+        b = Int32(42)
+        ccall(:some_ccall, Cvoid, (MutableXYZ, Int32), a, b)
+        return a.x
+    end
+    @test count(isnew, src.code) == 1
+end
+
+# constant globals
+# ----------------
+
+let # immutable case
     src = @eval Module() begin
         const REF_FLD = :x
         struct ImmutableRef{T}
@@ -282,7 +971,6 @@ let # should work with constant globals
     @test count(isnew, src.code) == 0
 
     # mutable case
-    # ------------
     src = @eval Module() begin
         const REF_FLD = :x
         code_typed() do
@@ -295,25 +983,6 @@ let # should work with constant globals
     @test count(isnew, src.code) == 0
 end
 
-# should work nicely with inlining to optimize away a complicated case
-# adapted from http://wiki.luajit.org/Allocation-Sinking-Optimization#implementation%5B
-struct Point
-    x::Float64
-    y::Float64
-end
-#=@inline=# add(a::Point, b::Point) = Point(a.x + b.x, a.y + b.y)
-function compute_points()
-    a = Point(1.5, 2.5)
-    b = Point(2.25, 4.75)
-    for i in 0:(100000000-1)
-        a = add(add(a, b), b)
-    end
-    a.x, a.y
-end
-let src = code_typed1(compute_points)
-    @test !any(isnew, src.code)
-end
-
 # comparison lifting
 # ==================
 
@@ -454,7 +1123,7 @@ end
 # A SSAValue after the compaction line
 let m = Meta.@lower 1 + 1
     @assert Meta.isexpr(m, :thunk)
-    src = m.args[1]::Core.CodeInfo
+    src = m.args[1]::CodeInfo
     src.code = Any[
         # block 1
         nothing,
@@ -492,7 +1161,7 @@ let m = Meta.@lower 1 + 1
     src.ssaflags = fill(Int32(0), nstmts)
     ir = Core.Compiler.inflate_ir(src, Any[], Any[Any, Any])
    @test Core.Compiler.verify_ir(ir) === nothing
-    ir = @test_nowarn Core.Compiler.sroa_pass!(ir)
+    ir, = @test_nowarn Core.Compiler.linear_pass!(ir)
     @test Core.Compiler.verify_ir(ir) === nothing
 end
 
@@ -517,7 +1186,7 @@ end
 let m = Meta.@lower 1 + 1
     # Test that CFG simplify combines redundant basic blocks
    @assert Meta.isexpr(m, :thunk)
-    src = m.args[1]::Core.CodeInfo
+    src = m.args[1]::CodeInfo
     src.code = Any[
         Core.Compiler.GotoNode(2),
         Core.Compiler.GotoNode(3),
@@ -542,7 +1211,7 @@ end
 let m = Meta.@lower 1 + 1
     # Test that CFG simplify doesn't mess up when chaining past return blocks
     @assert Meta.isexpr(m, :thunk)
-    src = m.args[1]::Core.CodeInfo
+    src = m.args[1]::CodeInfo
     src.code = Any[
         Core.Compiler.GotoIfNot(Core.Compiler.Argument(2), 3),
         Core.Compiler.GotoNode(4),
@@ -572,7 +1241,7 @@ let m = Meta.@lower 1 + 1
     # Test that CFG simplify doesn't try to merge every block in a loop into
     # its predecessor
     @assert Meta.isexpr(m, :thunk)
-    src = m.args[1]::Core.CodeInfo
+    src = m.args[1]::CodeInfo
     src.code = Any[
         # Block 1
         Core.Compiler.GotoNode(2),