diff --git a/base/broadcast.jl b/base/broadcast.jl
index 36426840f5610..0d2e37f3451a0 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -828,12 +828,25 @@ preprocess_args(dest, args::Tuple{}) = ()
         end
     end
     bc′ = preprocess(dest, bc)
-    for I in eachindex(bc′)
-        @inbounds dest[I] = bc′[I]
+    if _is_simd_safe(dest, bc)
+        @inbounds @simd for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
+    else
+        @inbounds for I in eachindex(bc′)
+            dest[I] = bc′[I]
+        end
     end
     return dest
 end

+_is_simd_safe(::Any, ::Any) = false
+@inline _is_simd_safe(::Array, bc::Broadcasted) = _args_are_simd_safe(bc)
+_args_are_simd_safe() = true
+_args_are_simd_safe(::Any, args...) = false
+@inline _args_are_simd_safe(::Union{Array, Number}, args...) = _args_are_simd_safe(args...)
+@inline _args_are_simd_safe(bc::Broadcasted, args...) = Base.simdable(bc.f) isa Base.SIMDableFunction && _args_are_simd_safe(bc.args..., args...)
+
 # Performance optimization: for BitArray outputs, we cache the result
 # in a "small" Vector{Bool}, and then copy in chunks into the output
 @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 708709d952679..9d297a1509ca9 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1470,6 +1470,27 @@ end
     end
     return B
 end
+# When both arrays are ::Array the SIMD transform is safe
+@noinline function extrema!(B::Array, A::Array)
+    sA = size(A)
+    sB = size(B)
+    for I in CartesianIndices(sB)
+        AI = A[I]
+        B[I] = (AI, AI)
+    end
+    Bmax = CartesianIndex(sB)
+    @inbounds @simd for I in CartesianIndices(sA)
+        J = min(Bmax,I)
+        BJ = B[J]
+        AI = A[I]
+        if AI < BJ[1]
+            B[J] = (AI, BJ[2])
+        elseif AI > BJ[2]
+            B[J] = (BJ[1], AI)
+        end
+    end
+    return B
+end

 # Show for pairs() with Cartesian indicies. Needs to be here rather than show.jl for bootstrap order
 function Base.showarg(io::IO, r::Iterators.Pairs{<:Integer, <:Any, <:Any, T}, toplevel) where T <: Union{AbstractVector, Tuple}
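For reviewers: `_is_simd_safe`/`_args_are_simd_safe` above is just a recursive trait walk over the broadcast tree — the destination must be an `Array`, every leaf argument an `Array` or `Number`, and every function on the `simdable` whitelist. A stripped-down standalone sketch of the same walk (the names `ToyBroadcasted`, `safe_f`, `leaf_ok`, and `args_ok` are illustrative only, not the definitions in the patch):

```julia
# Toy illustration of the recursive safety walk: a nested "broadcast" node is
# SIMD-safe only if its function is whitelisted and every leaf argument is a
# dense Array or a Number.
struct ToyBroadcasted{F,Args}
    f::F
    args::Args
end

safe_f(f) = f in (+, -, *, /, identity)   # stand-in for the simdable() whitelist
leaf_ok(::Union{Array,Number}) = true     # dense leaves only: no views, no BitArrays
leaf_ok(::Any) = false

args_ok() = true
args_ok(x, rest...) = leaf_ok(x) && args_ok(rest...)
args_ok(bc::ToyBroadcasted, rest...) = safe_f(bc.f) && args_ok(bc.args...) && args_ok(rest...)

a, b = rand(8), rand(8)
args_ok(ToyBroadcasted(+, (a, ToyBroadcasted(*, (b, 2)))))          # a .+ b .* 2           -> true
args_ok(ToyBroadcasted(+, (a, ToyBroadcasted(*, (view(b, :), 2))))) # a .+ view(b, :) .* 2  -> false
```

Anything that is not a plain `Array` or `Number` — a `BitArray`, a view, a custom `AbstractArray` — falls through to the `false` method and keeps the non-`@simd` path.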
diff --git a/base/reduce.jl b/base/reduce.jl
index b1607e39de906..a0cf96614fc96 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -187,14 +187,7 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
         return mapreduce_first(f, op, a1)
     elseif ifirst + blksize > ilast
         # sequential portion
-        @inbounds a1 = A[ifirst]
-        @inbounds a2 = A[ifirst+1]
-        v = op(f(a1), f(a2))
-        for i = ifirst + 2 : ilast
-            @inbounds ai = A[i]
-            v = op(v, f(ai))
-        end
-        return v
+        return _mapreduce_impl_loop(simdable(f), simdable(op), A, ifirst, ilast)
     else
         # pairwise portion
         imid = (ifirst + ilast) >> 1
@@ -204,6 +197,32 @@ foldr(op, itr) = mapfoldr(identity, op, itr)
     end
 end

+# The @simd transformation is only valid in limited situations
+struct SIMDableFunction{f}; end
+(::SIMDableFunction{f})(args...) where {f} = f(args...)
+simdable(f::Union{map(typeof, (+, *, &, |, add_sum, mul_prod, -, /, ^, identity))...}) = SIMDableFunction{f}()
+simdable(f) = f
+function _mapreduce_impl_loop(f::SIMDableFunction, op::SIMDableFunction, A::Array, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    @simd for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+function _mapreduce_impl_loop(f, op, A, ifirst, ilast)
+    @inbounds a1 = A[ifirst]
+    @inbounds a2 = A[ifirst+1]
+    v = op(f(a1), f(a2))
+    for i = ifirst + 2 : ilast
+        @inbounds ai = A[i]
+        v = op(v, f(ai))
+    end
+    return v
+end
+
 mapreduce_impl(f, op, A::AbstractArray, ifirst::Integer, ilast::Integer) =
     mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 7801863e922b4..9051c26f14c11 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -222,24 +222,55 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
     if reducedim1(R, A)
         # keep the accumulator as a local variable when reducing along the first dimension
         i1 = first(indices1(R))
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
-            for i in axes(A, 1)
-                r = op(r, f(A[i, IA]))
-            end
-            R[i1,IR] = r
+            _mapreducedim_loop1!(simdable(f), simdable(op), R, A, IR, IA, i1)
         end
     else
-        @inbounds for IA in CartesianIndices(indsAt)
+        for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            for i in axes(A, 1)
-                R[i,IR] = op(R[i,IR], f(A[i,IA]))
-            end
+            _mapreducedim_loop!(simdable(f), simdable(op), R, A, IR, IA)
+        end
+    end
+    return R
+end
+
+# The innermost loops are split out to allow for @simd in known safe cases
+# add a few more simd-safe functions that were not available earlier in bootstrap
+simdable(f::Union{map(typeof, (abs, sqrt, log, log10, log2))...}) = SIMDableFunction{f}()
+@inline function _mapreducedim_loop1!(f, op, R, A, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
         end
+        R[i1,IR] = r
     end
     return R
 end
+@inline function _mapreducedim_loop1!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA, i1)
+    @inbounds begin
+        r = R[i1,IR]
+        @simd for i in axes(A, 1)
+            r = op(r, f(A[i, IA]))
+        end
+        R[i1,IR] = r
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f, op, R, A, IR, IA)
+    @inbounds for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+@inline function _mapreducedim_loop!(f::SIMDableFunction, op::SIMDableFunction, R::Array, A::Array, IR, IA)
+    @inbounds @simd for i in axes(A, 1)
+        R[i,IR] = op(R[i,IR], f(A[i,IA]))
+    end
+    return R
+end
+
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) =
     (_mapreducedim!(f, op, R, A); R)
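The `SIMDableFunction` machinery is a value-parameterized functor used purely to steer dispatch: `simdable` wraps whitelisted functions, and the `@simd` method of the inner loop is selected only when both the mapped function and the reducer come back wrapped. A minimal standalone sketch of the pattern (the names `KnownSafe`, `mark`, and `inner_sum` are illustrative, not the Base definitions above):

```julia
# Dispatch-on-a-whitelist via a value-parameterized functor: `KnownSafe{f}`
# carries the function `f` as a type parameter and forwards calls to it.
struct KnownSafe{f} end
(::KnownSafe{f})(args...) where {f} = f(args...)

mark(f::Union{typeof(identity), typeof(+), typeof(*)}) = KnownSafe{f}()  # whitelist
mark(f) = f                                                              # everything else passes through untouched

# Generic fallback: no @simd, works for any f/op/container.
function inner_sum(f, op, A, v)
    for a in A
        v = op(v, f(a))
    end
    return v
end

# Chosen by dispatch only when both functions were whitelisted and A is a dense Array.
function inner_sum(f::KnownSafe, op::KnownSafe, A::Array, v)
    @inbounds @simd for i in eachindex(A)
        v = op(v, f(A[i]))
    end
    return v
end

A = rand(1024)
inner_sum(mark(abs), mark(+), A, 0.0)       # abs not whitelisted here -> plain loop
inner_sum(mark(identity), mark(+), A, 0.0)  # both wrapped             -> @simd loop
```

This also explains the second `simdable` method added in reducedim.jl: `abs`, `sqrt`, and the `log` variants are not yet defined when reduce.jl is loaded during bootstrap, so they join the whitelist later.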
diff --git a/base/simdloop.jl b/base/simdloop.jl
index 42ec45815100f..8c9d4c6fa3741 100644
--- a/base/simdloop.jl
+++ b/base/simdloop.jl
@@ -94,7 +94,7 @@ Annotate a `for` loop to allow the compiler to take extra liberties to allow loo
     This feature is experimental and could change or disappear in future versions of Julia.
     Incorrect use of the `@simd` macro may cause unexpected results.

-The object iterated over in a `@simd for` loop should be a one-dimensional range.
+The object iterated over in a `@simd for` loop should be a one-dimensional range or a CartesianIndices iterator.
 By using `@simd`, you are asserting several properties of the loop:

 * It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md
index aba5ba560c0f8..006aa91d808a7 100644
--- a/doc/src/manual/performance-tips.md
+++ b/doc/src/manual/performance-tips.md
@@ -1144,7 +1144,7 @@ Sometimes you can enable better optimization by promising certain program proper
 like allowing floating-point re-associativity and ignoring dependent memory accesses. Again, be very
 careful when asserting `@simd` as erroneously annotating a loop with dependent iterations may result
 in unexpected results. In particular, note that `setindex!` on some `AbstractArray` subtypes is
-enherently dependent upon iteration order. **This feature is experimental**
+inherently dependent upon iteration order. **This feature is experimental**
 and could change or disappear in future versions of Julia. The common idiom of using 1:n to index
 into an AbstractArray is not safe if the Array uses unconventional indexing,
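For reference, a small self-contained example of the pattern the updated docstring now covers (a toy function, not part of the patch): a `@simd`-annotated reduction over a `CartesianIndices` iterator. Because `@simd` permits reassociation of the floating-point additions, the result may differ from the serial sum in the last few bits.

```julia
# A @simd reduction over CartesianIndices: sums a dense multidimensional array.
# The only cross-iteration dependency is the reduction variable `s`, so the
# reordering (and reassociation) that @simd permits is safe here.
function cartesian_sum(A::Array{Float64})
    s = 0.0
    @inbounds @simd for I in CartesianIndices(A)
        s += A[I]
    end
    return s
end

A = rand(32, 32)
cartesian_sum(A) ≈ sum(A)   # true, up to floating-point reassociation
```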