diff --git a/base/broadcast.jl b/base/broadcast.jl
index 36426840f5610..0d2e37f3451a0 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -828,12 +828,25 @@ preprocess_args(dest, args::Tuple{}) = ()
         end
     end
     bc′ = preprocess(dest, bc)
-    for I in eachindex(bc′)
-        @inbounds dest[I] = bc′[I]
+    if _is_simd_safe(dest, bc)
+        @inbounds @simd for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
+    else
+        @inbounds for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
     end
     return dest
 end

+_is_simd_safe(::Any, ::Any) = false
+@inline _is_simd_safe(::Array, bc::Broadcasted) = _args_are_simd_safe(bc)
+_args_are_simd_safe() = true
+_args_are_simd_safe(::Any, args...) = false
+@inline _args_are_simd_safe(::Union{Array, Number}, args...) = _args_are_simd_safe(args...)
+@inline _args_are_simd_safe(bc::Broadcasted, args...) = Base.simdable(bc.f) isa Base.SIMDableFunction && _args_are_simd_safe(bc.args..., args...)
+
 # Performance optimization: for BitArray outputs, we cache the result
 # in a "small" Vector{Bool}, and then copy in chunks into the output
 @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 708709d952679..9d297a1509ca9 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1470,6 +1470,27 @@ end
     end
     return B
 end
+# When both arrays are ::Array the SIMD transform is safe
+@noinline function extrema!(B::Array, A::Array)
+    sA = size(A)
+    sB = size(B)
+    for I in CartesianIndices(sB)
+        AI = A[I]
+        B[I] = (AI, AI)
+    end
+    Bmax = CartesianIndex(sB)
+    @inbounds @simd for I in CartesianIndices(sA)
+        J = min(Bmax,I)
+        BJ = B[J]
+        AI = A[I]
+        if AI < BJ[1]
+            B[J] = (AI, BJ[2])
+        elseif AI > BJ[2]
+            B[J] = (BJ[1], AI)
+        end
+    end
+    return B
+end

 # Show for pairs() with Cartesian indicies. Needs to be here rather than show.jl for bootstrap order
 function Base.showarg(io::IO, r::Iterators.Pairs{<:Integer, <:Any, <:Any, T}, toplevel) where T <: Union{AbstractVector, Tuple}
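For reviewers: `_is_simd_safe`/`_args_are_simd_safe` above is just a recursive trait walk over the broadcast tree — the destination must be an `Array`, every leaf argument an `Array` or `Number`, and every function on the `simdable` whitelist. A stripped-down standalone sketch of the same walk (the names `ToyBroadcasted`, `safe_f`, `leaf_ok`, and `args_ok` are illustrative only, not the definitions in the patch):

```julia
# Toy illustration of the recursive safety walk: a nested "broadcast" node is
# SIMD-safe only if its function is whitelisted and every leaf argument is a
# dense Array or a Number.
struct ToyBroadcasted{F,Args}
    f::F
    args::Args
end

safe_f(f) = f in (+, -, *, /, identity)   # stand-in for the simdable() whitelist
leaf_ok(::Union{Array,Number}) = true     # dense leaves only: no views, no BitArrays
leaf_ok(::Any) = false

args_ok() = true
args_ok(x, rest...) = leaf_ok(x) && args_ok(rest...)
args_ok(bc::ToyBroadcasted, rest...) = safe_f(bc.f) && args_ok(bc.args...) && args_ok(rest...)

a, b = rand(8), rand(8)
args_ok(ToyBroadcasted(+, (a, ToyBroadcasted(*, (b, 2)))))          # a .+ b .* 2           -> true
args_ok(ToyBroadcasted(+, (a, ToyBroadcasted(*, (view(b, :), 2))))) # a .+ view(b, :) .* 2  -> false
```

Anything that is not a plain `Array` or `Number` — a `BitArray`, a view, a custom `AbstractArray` — falls through to the `false` method and keeps the non-`@simd` path.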
diff --git a/base/reduce.jl b/base/reduce.jl
index b1607e39de906..a0cf96614fc96 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -187,14 +187,7 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
         return mapreduce_first(f, op, a1)
     elseif ifirst + blksize > ilast
         # sequential portion
-        @inbounds a1 = A[ifirst]
-        @inbounds a2 = A[ifirst+1]
-        v = op(f(a1), f(a2))
-        for i = ifirst + 2 : ilast
-            @inbounds ai = A[i]
-            v = op(v, f(ai))
-        end
-        return v
+        return _mapreduce_impl_loop(simdable(f), simdable(op), A, ifirst, ilast)
     else
         # pairwise portion
         imid = (ifirst + ilast) >> 1
@@ -204,6 +197,32 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
     end
 end

+# The @simd transformation is only valid in limited situations
+struct SIMDableFunction{f}; end
+(::SIMDableFunction{f})(args...) where {f} = f(args...)
+simdable(f::Union{map(typeof, (+, *, &, |, add_sum, mul_prod, -, /, ^, identity))...}) = SIMDableFunction{f}()
+simdable(f) = f
+function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    @simd for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+
 mapreduce_impl(f, op, A::AbstractArray, ifirst::Integer, ilast::Integer) =
     mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 7801863e922b4..9051c26f14c11 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -222,24 +222,55 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
     if reducedim1(R, A)
         # keep the accumulator as a local variable when reducing along the first dimension
         i1 = first(indices1(R))
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
-            for i in axes(A, 1)
-                r = op(r, f(A[i, IA]))
-            end
-            R[i1,IR] = r
+            _mapreducedim_loop1!(simdable(f), simdable(op), R, A, IR, IA, i1)
         end
     else
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            for i in axes(A, 1)
-                R[i,IR] = op(R[i,IR], f(A[i,IA]))
-            end
+            _mapreducedim_loop!(simdable(f), simdable(op), R, A, IR, IA)
+        end
+    end
+    return R
+end
+
+# The innermost loops are split out to allow for @simd in known safe cases
+# add a few more simd-safe functions that were not available earlier in bootstrap
+simdable(f::Union{map(typeof, (abs, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
+@inline function _mapreducedim_loop1!(f, op, R, A, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
         end
+        R[i1,IR] = r
     end
     return R
 end
+@inline function _mapreducedim_loop1!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        @simd for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
+        end
+        R[i1,IR] = r
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f, op, R, A, IR, IA)
+    @inbounds for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA)
+    @inbounds @simd for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) =
     (_mapreducedim!(f, op, R, A); R)
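The `SIMDableFunction` machinery is a value-parameterized functor used purely to steer dispatch: `simdable` wraps whitelisted functions, and the `@simd` method of the inner loop is selected only when both the mapped function and the reducer come back wrapped. A minimal standalone sketch of the pattern (the names `KnownSafe`, `mark`, and `inner_sum` are illustrative, not the Base definitions above):

```julia
# Dispatch-on-a-whitelist via a value-parameterized functor: `KnownSafe{f}`
# carries the function `f` as a type parameter and forwards calls to it.
struct KnownSafe{f} end
(::KnownSafe{f})(args...) where {f} = f(args...)

mark(f::Union{typeof(identity), typeof(+), typeof(*)}) = KnownSafe{f}()  # whitelist
mark(f) = f                                                              # everything else passes through untouched

# Generic fallback: no @simd, works for any f/op/container.
function inner_sum(f, op, A, v)
    for a in A
        v = op(v, f(a))
    end
    return v
end

# Chosen by dispatch only when both functions were whitelisted and A is a dense Array.
function inner_sum(f::KnownSafe, op::KnownSafe, A::Array, v)
    @inbounds @simd for i in eachindex(A)
        v = op(v, f(A[i]))
    end
    return v
end

A = rand(1024)
inner_sum(mark(abs), mark(+), A, 0.0)       # abs not whitelisted here -> plain loop
inner_sum(mark(identity), mark(+), A, 0.0)  # both wrapped             -> @simd loop
```

This also explains the second `simdable` method added in reducedim.jl: `abs`, `sqrt`, and the `log` variants are not yet defined when reduce.jl is loaded during bootstrap, so they join the whitelist later.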
diff --git a/base/simdloop.jl b/base/simdloop.jl
index 42ec45815100f..8c9d4c6fa3741 100644
--- a/base/simdloop.jl
+++ b/base/simdloop.jl
@@ -94,7 +94,7 @@ Annotate a `for` loop to allow the compiler to take extra liberties to allow loo
     This feature is experimental and could change or disappear in future versions of Julia.
     Incorrect use of the `@simd` macro may cause unexpected results.

-The object iterated over in a `@simd for` loop should be a one-dimensional range.
+The object iterated over in a `@simd for` loop should be a one-dimensional range or a CartesianIndices iterator.
 By using `@simd`, you are asserting several properties of the loop:

 * It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md
index aba5ba560c0f8..006aa91d808a7 100644
--- a/doc/src/manual/performance-tips.md
+++ b/doc/src/manual/performance-tips.md
@@ -1144,7 +1144,7 @@ Sometimes you can enable better optimization by promising certain program proper
 like allowing floating-point re-associativity and ignoring dependent memory accesses. Again, be very
 careful when asserting `@simd` as erroneously annotating a loop with dependent iterations may result
 in unexpected results. In particular, note that `setindex!` on some `AbstractArray` subtypes is
-enherently dependent upon iteration order. **This feature is experimental**
+inherently dependent upon iteration order. **This feature is experimental**
 and could change or disappear in future versions of Julia. The common idiom of using 1:n to index
 into an AbstractArray is not safe if the Array uses unconventional indexing,
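For reference, a small self-contained example of the pattern the updated docstring now covers (a toy function, not part of the patch): a `@simd`-annotated reduction over a `CartesianIndices` iterator. Because `@simd` permits reassociation of the floating-point additions, the result may differ from the serial sum in the last few bits.

```julia
# A @simd reduction over CartesianIndices: sums a dense multidimensional array.
# The only cross-iteration dependency is the reduction variable `s`, so the
# reordering (and reassociation) that @simd permits is safe here.
function cartesian_sum(A::Array{Float64})
    s = 0.0
    @inbounds @simd for I in CartesianIndices(A)
        s += A[I]
    end
    return s
end

A = rand(32, 32)
cartesian_sum(A) ≈ sum(A)   # true, up to floating-point reassociation
```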