diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl index a9b185027e..20cc9d7178 100644 --- a/ext/cuda/data_layouts.jl +++ b/ext/cuda/data_layouts.jl @@ -27,121 +27,9 @@ Base.similar( include("data_layouts_fill.jl") include("data_layouts_copyto.jl") +include("data_layouts_fused_copyto.jl") include("data_layouts_mapreduce.jl") -Base.@propagate_inbounds function rcopyto_at!( - pair::Pair{<:AbstractData, <:Any}, - I, - v, -) - dest, bc = pair.first, pair.second - if 1 ≤ v <= size(dest, 4) - dest[I] = isascalar(bc) ? bc[] : bc[I] - end - return nothing -end -Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v) - dest, bc = pair.first, pair.second - if 1 ≤ v <= size(dest, 4) - bcI = isascalar(bc) ? bc[] : bc[I] - dest[] = bcI - end - return nothing -end -Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v) - rcopyto_at!(first(pairs), I, v) - rcopyto_at!(Base.tail(pairs), I, v) -end -Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, v) = - rcopyto_at!(first(pairs), I, v) -@inline rcopyto_at!(pairs::Tuple{}, I, v) = nothing - -function knl_fused_copyto!(fmbc::FusedMultiBroadcast) - - @inbounds begin - i = CUDA.threadIdx().x - j = CUDA.threadIdx().y - - h = CUDA.blockIdx().x - v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z - (; pairs) = fmbc - I = CartesianIndex((i, j, 1, v, h)) - rcopyto_at!(pairs, I, v) - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::VIJFH{S, Nv, Nij}, - ::ToCUDA, -) where {S, Nv, Nij} - _, _, _, _, Nh = size(dest1) - if Nv > 0 && Nh > 0 - Nv_per_block = min(Nv, fld(256, Nij * Nij)) - Nv_blocks = cld(Nv, Nv_per_block) - auto_launch!( - knl_fused_copyto!, - (fmbc,), - dest1; - threads_s = (Nij, Nij, Nv_per_block), - blocks_s = (Nh, Nv_blocks), - ) - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::IJFH{S, Nij}, - ::ToCUDA, -) where {S, Nij} - _, _, _, _, Nh = size(dest1) - if Nh > 0 - auto_launch!( - knl_fused_copyto!, - (fmbc,), - dest1; - threads_s = (Nij, Nij), - blocks_s = (Nh, 1), - ) - end - return nothing -end -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::VF{S, Nv}, - ::ToCUDA, -) where {S, Nv} - _, _, _, _, Nh = size(dest1) - if Nv > 0 && Nh > 0 - auto_launch!( - knl_fused_copyto!, - (fmbc,), - dest1; - threads_s = (1, 1), - blocks_s = (Nh, Nv), - ) - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::DataF{S}, - ::ToCUDA, -) where {S} - auto_launch!( - knl_fused_copyto!, - (fmbc,), - dest1; - threads_s = (1, 1), - blocks_s = (1, 1), - ) - return nothing -end - - adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) diff --git a/ext/cuda/data_layouts_fused_copyto.jl b/ext/cuda/data_layouts_fused_copyto.jl new file mode 100644 index 0000000000..d6390cd36b --- /dev/null +++ b/ext/cuda/data_layouts_fused_copyto.jl @@ -0,0 +1,111 @@ +Base.@propagate_inbounds function rcopyto_at!( + pair::Pair{<:AbstractData, <:Any}, + I, + v, +) + dest, bc = pair.first, pair.second + if 1 ≤ v <= size(dest, 4) + dest[I] = isascalar(bc) ? bc[] : bc[I] + end + return nothing +end +Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v) + dest, bc = pair.first, pair.second + if 1 ≤ v <= size(dest, 4) + bcI = isascalar(bc) ? bc[] : bc[I] + dest[] = bcI + end + return nothing +end +Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v) + rcopyto_at!(first(pairs), I, v) + rcopyto_at!(Base.tail(pairs), I, v) +end +Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, v) = + rcopyto_at!(first(pairs), I, v) +@inline rcopyto_at!(pairs::Tuple{}, I, v) = nothing + +function knl_fused_copyto!(fmbc::FusedMultiBroadcast) + + @inbounds begin + i = CUDA.threadIdx().x + j = CUDA.threadIdx().y + + h = CUDA.blockIdx().x + v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z + (; pairs) = fmbc + I = CartesianIndex((i, j, 1, v, h)) + rcopyto_at!(pairs, I, v) + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::VIJFH{S, Nv, Nij}, + ::ToCUDA, +) where {S, Nv, Nij} + _, _, _, _, Nh = size(dest1) + if Nv > 0 && Nh > 0 + Nv_per_block = min(Nv, fld(256, Nij * Nij)) + Nv_blocks = cld(Nv, Nv_per_block) + auto_launch!( + knl_fused_copyto!, + (fmbc,), + dest1; + threads_s = (Nij, Nij, Nv_per_block), + blocks_s = (Nh, Nv_blocks), + ) + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::IJFH{S, Nij}, + ::ToCUDA, +) where {S, Nij} + _, _, _, _, Nh = size(dest1) + if Nh > 0 + auto_launch!( + knl_fused_copyto!, + (fmbc,), + dest1; + threads_s = (Nij, Nij), + blocks_s = (Nh, 1), + ) + end + return nothing +end +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::VF{S, Nv}, + ::ToCUDA, +) where {S, Nv} + _, _, _, _, Nh = size(dest1) + if Nv > 0 && Nh > 0 + auto_launch!( + knl_fused_copyto!, + (fmbc,), + dest1; + threads_s = (1, 1), + blocks_s = (Nh, Nv), + ) + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::DataF{S}, + ::ToCUDA, +) where {S} + auto_launch!( + knl_fused_copyto!, + (fmbc,), + dest1; + threads_s = (1, 1), + blocks_s = (1, 1), + ) + return nothing +end diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl index fd6c817eff..cba9ffeeeb 100644 --- a/src/DataLayouts/DataLayouts.jl +++ b/src/DataLayouts/DataLayouts.jl @@ -176,11 +176,6 @@ Base.parent(data::AbstractData) = getfield(data, :array) Base.similar(data::AbstractData{S}) where {S} = similar(data, S) -function Base.copyto!(dest::D, src::D) where {D <: AbstractData} - copyto!(parent(dest), parent(src)) - return dest -end - @inline function ncomponents(data::AbstractData{S}) where {S} typesize(eltype(parent(data)), S) end @@ -330,15 +325,6 @@ function Base.size(data::IJFH{S, Nij}) where {S, Nij} (Nij, Nij, 1, Nv, Nh) end -function Base.fill!(data::IJFH, val, ::ToCPU) - (_, _, _, _, Nh) = size(data) - @inbounds for h in 1:Nh - fill!(slab(data, h), val) - end - return data -end - - function IJFH{S, Nij}(::Type{ArrayType}, nelements) where {S, Nij, ArrayType} T = eltype(ArrayType) IJFH{S, Nij}(ArrayType(undef, Nij, Nij, typesize(T, S), nelements)) @@ -499,14 +485,6 @@ function Base.size(data::IFH{S, Ni}) where {S, Ni} (Ni, 1, 1, Nv, Nh) end -function Base.fill!(data::IFH, val, ::ToCPU) - (_, _, _, _, Nh) = size(data) - @inbounds for h in 1:Nh - fill!(slab(data, h), val) - end - return data -end - @inline function slab(data::IFH{S, Ni}, h::Integer) where {S, Ni} @boundscheck (1 <= h <= size(parent(data), 3)) || throw(BoundsError(data, (h,))) @@ -623,10 +601,6 @@ function DataF(x::T) where {T} end -function Base.fill!(data::DataF, val, ::ToCPU) - @inbounds data[] = val - return data -end @inline function Base.getproperty(data::DataF{S}, i::Integer) where {S} array = parent(data) T = eltype(array) @@ -751,12 +725,6 @@ end function Base.size(data::IJF{S, Nij}) where {S, Nij} return (Nij, Nij, 1, 1, 1) end -function Base.fill!(data::IJF{S, Nij}, val, ::ToCPU) where {S, Nij} - @inbounds for j in 1:Nij, i in 1:Nij - data[i, j] = val - end - return data -end @generated function _property_view( data::IJF{S, Nij, A}, @@ -889,14 +857,6 @@ function replace_basetype(data::IF{S, Ni}, ::Type{T}) where {S, Ni, T} return IF{S′, Ni}(similar(array, T)) end -function Base.fill!(data::IF{S, Ni}, val, ::ToCPU) where {S, Ni} - @inbounds for i in 1:Ni - data[i] = val - end - return data -end - - @generated function _property_view( data::IF{S, Ni, A}, ::Val{Idx}, @@ -1003,14 +963,6 @@ Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, 1, Nv, 1) nlevels(::VF{S, Nv}) where {S, Nv} = Nv -function Base.fill!(data::VF, val, ::ToCPU) - Nv = nlevels(data) - @inbounds for v in 1:Nv - data[v] = val - end - return data -end - @generated function _property_view( data::VF{S, Nv, A}, ::Val{Idx}, @@ -1128,14 +1080,6 @@ function Base.length(data::VIJFH) size(parent(data), 1) * size(parent(data), 5) end -function Base.fill!(data::VIJFH, val, ::ToCPU) - (Ni, Nj, _, Nv, Nh) = size(data) - @inbounds for h in 1:Nh, v in 1:Nv - fill!(slab(data, v, h), val) - end - return data -end - @generated function _property_view( data::VIJFH{S, Nv, Nij, A}, ::Val{Idx}, @@ -1295,13 +1239,6 @@ end function Base.length(data::VIFH) nlevels(data) * size(parent(data), 4) end -function Base.fill!(data::VIFH, val, ::ToCPU) - (Ni, _, _, Nv, Nh) = size(data) - @inbounds for h in 1:Nh, v in 1:Nv - fill!(slab(data, v, h), val) - end - return data -end @propagate_inbounds function Base.getindex( data::VIFH{S}, @@ -1630,7 +1567,9 @@ _device_dispatch(x::AbstractData) = _device_dispatch(parent(x)) _device_dispatch(x::SArray) = ToCPU() _device_dispatch(x::MArray) = ToCPU() -Base.fill!(dest::AbstractData, val) = - Base.fill!(dest, val, device_dispatch(dest)) +include("copyto.jl") +include("fused_copyto.jl") +include("fill.jl") +include("mapreduce.jl") end # module diff --git a/src/DataLayouts/broadcast.jl b/src/DataLayouts/broadcast.jl index c5821c41ad..35ef0422bc 100644 --- a/src/DataLayouts/broadcast.jl +++ b/src/DataLayouts/broadcast.jl @@ -377,252 +377,6 @@ function Base.similar( return VIJFH{Eltype, newNv, Nij}(array) end -# This is only defined for testing. -function mapreduce_cuda end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionDataF{<:Any, A}, -) where {F, Op, A} - mapreduce(op, 1) do v - Base.@_inline_meta - @inbounds fn(bc[]) - end -end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionIJFH{<:Any, Nij, A}, -) where {F, Op, Nij, A} - # mapreduce across DataSlab2D - _, _, _, _, Nh = size(bc) - mapreduce(op, 1:Nh) do h - Base.@_inline_meta - slabview = @inbounds slab(bc, h) - mapreduce(fn, op, slabview) - end -end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionIFH{<:Any, Ni, A}, -) where {F, Op, Ni, A} - # mapreduce across DataSlab1D - _, _, _, _, Nh = size(bc) - mapreduce(op, 1:Nh) do h - Base.@_inline_meta - slabview = @inbounds slab(bc, h) - mapreduce(fn, op, slabview) - end -end - -function Base.mapreduce(fn::F, op::Op, bc::IJF{S, Nij}) where {F, Op, S, Nij} - # mapreduce across DataSlab2D nodes - mapreduce(op, Iterators.product(1:Nij, 1:Nij)) do (i, j) - Base.@_inline_meta - idx = CartesianIndex(i, j, 1, 1, 1) - node = @inbounds bc[idx] - fn(node) - end -end - -function Base.mapreduce(fn::F, op::Op, bc::IF{S, Ni}) where {F, Op, S, Ni} - # mapreduce across DataSlab1D nodes - mapreduce(op, 1:Ni) do i - Base.@_inline_meta - idx = CartesianIndex(i, 1, 1, 1, 1) - node = @inbounds bc[idx] - fn(node) - end -end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionVF{<:Any, Nv, A}, -) where {F, Op, Nv, A} - # mapreduce across DataColumn levels - mapreduce(op, 1:Nv) do v - Base.@_inline_meta - idx = CartesianIndex(1, 1, 1, v, 1) - level = @inbounds bc[idx] - fn(level) - end -end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionVIFH{<:Any, Nv, Ni, A}, -) where {F, Op, Nv, Ni, A} - # mapreduce across columns - _, _, _, _, Nh = size(bc) - mapreduce(op, Iterators.product(1:Ni, 1:Nh)) do (i, h) - Base.@_inline_meta - columnview = @inbounds column(bc, i, h) - mapreduce(fn, op, columnview) - end -end - -function Base.mapreduce( - fn::F, - op::Op, - bc::BroadcastedUnionVIJFH{<:Any, Nv, Nij, A}, -) where {F, Op, Nv, Nij, A} - # mapreduce across columns - _, _, _, _, Nh = size(bc) - mapreduce(op, Iterators.product(1:Nij, 1:Nij, 1:Nh)) do (i, j, h) - Base.@_inline_meta - columnview = @inbounds column(bc, i, j, h) - mapreduce(fn, op, columnview) - end -end - -Base.copyto!( - dest::AbstractData, - bc::Union{AbstractData, Base.Broadcast.Broadcasted}, -) = Base.copyto!(dest, bc, device_dispatch(dest)) - -# broadcasting scalar assignment -# Performance optimization for the common identity scalar case: dest .= val -# And this is valid for the CPU or GPU, since the broadcasted object -# is a scalar type. -function Base.copyto!( - dest::AbstractData, - bc::Base.Broadcast.Broadcasted{Style}, - ::AbstractDispatchToDevice, -) where { - Style <: - Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}}, -} - bc = Base.Broadcast.instantiate( - Base.Broadcast.Broadcasted{Style}(bc.f, bc.args, ()), - ) - @inbounds bc0 = bc[] - fill!(dest, bc0) -end - -function Base.copyto!( - dest::DataF{S}, - bc::BroadcastedUnionDataF{S, A}, - ::ToCPU, -) where {S, A} - @inbounds dest[] = convert(S, bc[]) - return dest -end - -function Base.copyto!( - dest::IJFH{S, Nij}, - bc::BroadcastedUnionIJFH{S, Nij}, - ::ToCPU, -) where {S, Nij} - _, _, _, _, Nh = size(bc) - @inbounds for h in 1:Nh - slab_dest = slab(dest, h) - slab_bc = slab(bc, h) - copyto!(slab_dest, slab_bc) - end - return dest -end - -function Base.copyto!( - dest::IFH{S, Ni}, - bc::BroadcastedUnionIFH{S, Ni}, - ::ToCPU, -) where {S, Ni} - _, _, _, _, Nh = size(bc) - @inbounds for h in 1:Nh - slab_dest = slab(dest, h) - slab_bc = slab(bc, h) - copyto!(slab_dest, slab_bc) - end - return dest -end - -# inline inner slab(::DataSlab2D) copy -function Base.copyto!( - dest::IJF{S, Nij}, - bc::BroadcastedUnionIJF{S, Nij, A}, - ::ToCPU, -) where {S, Nij, A} - @inbounds for j in 1:Nij, i in 1:Nij - idx = CartesianIndex(i, j, 1, 1, 1) - dest[idx] = convert(S, bc[idx]) - end - return dest -end - -function Base.copyto!( - dest::IF{S, Ni}, - bc::BroadcastedUnionIF{S, Ni, A}, - ::ToCPU, -) where {S, Ni, A} - @inbounds for i in 1:Ni - idx = CartesianIndex(i, 1, 1, 1, 1) - dest[idx] = convert(S, bc[idx]) - end - return dest -end - -# inline inner slab(::DataSlab1D) copy -function Base.copyto!( - dest::IF{S, Ni}, - bc::Base.Broadcast.Broadcasted{IFStyle{Ni, A}}, - ::ToCPU, -) where {S, Ni, A} - @inbounds for i in 1:Ni - idx = CartesianIndex(i, 1, 1, 1, 1) - dest[idx] = convert(S, bc[idx]) - end - return dest -end - -# inline inner column(::DataColumn) copy -function Base.copyto!( - dest::VF{S, Nv}, - bc::BroadcastedUnionVF{S, Nv, A}, - ::ToCPU, -) where {S, Nv, A} - @inbounds for v in 1:Nv - idx = CartesianIndex(1, 1, 1, v, 1) - dest[idx] = convert(S, bc[idx]) - end - return dest -end - -function Base.copyto!( - dest::VIFH{S, Nv, Ni}, - bc::BroadcastedUnionVIFH{S, Nv, Ni}, - ::ToCPU, -) where {S, Nv, Ni} - (_, _, _, _, Nh) = size(bc) - # copy contiguous columns - @inbounds for h in 1:Nh, i in 1:Ni - col_dest = column(dest, i, h) - col_bc = column(bc, i, h) - copyto!(col_dest, col_bc) - end - return dest -end - -function Base.copyto!( - dest::VIJFH{S, Nv, Nij}, - bc::BroadcastedUnionVIJFH{S, Nv, Nij}, - ::ToCPU, -) where {S, Nv, Nij} - # copy contiguous columns - _, _, _, _, Nh = size(dest) - @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij - col_dest = column(dest, i, j, h) - col_bc = column(bc, i, j, h) - copyto!(col_dest, col_bc) - end - return dest -end - # ============= FusedMultiBroadcast isascalar( @@ -632,126 +386,3 @@ isascalar( Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}}, } = true isascalar(bc) = false - -# Fused multi-broadcast entry point for DataLayouts -function Base.copyto!( - fmbc::FusedMultiBroadcast{T}, -) where {N, T <: NTuple{N, Pair{<:AbstractData, <:Any}}} - dest1 = first(fmbc.pairs).first - fmb_inst = FusedMultiBroadcast( - map(fmbc.pairs) do pair - bc = pair.second - bc′ = if isascalar(bc) - Base.Broadcast.instantiate( - Base.Broadcast.Broadcasted(bc.style, bc.f, bc.args, ()), - ) - else - bc - end - Pair(pair.first, bc′) - end, - ) - # check_fused_broadcast_axes(fmbc) # we should already have checked the axes - fused_copyto!(fmb_inst, dest1, device_dispatch(dest1)) -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::VIJFH{S1, Nv1, Nij}, - ::ToCPU, -) where {S1, Nv1, Nij} - _, _, _, _, Nh = size(dest1) - for (dest, bc) in fmbc.pairs - # Base.copyto!(dest, bc) # we can just fall back like this - @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij, v in 1:Nv1 - I = CartesianIndex(i, j, 1, v, h) - bcI = isascalar(bc) ? bc[] : bc[I] - dest[I] = convert(eltype(dest), bcI) - end - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::IJFH{S, Nij}, - ::ToCPU, -) where {S, Nij} - # copy contiguous columns - _, _, _, Nv, Nh = size(dest1) - for (dest, bc) in fmbc.pairs - @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij - I = CartesianIndex(i, j, 1, 1, h) - bcI = isascalar(bc) ? bc[] : bc[I] - dest[I] = convert(eltype(dest), bcI) - end - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::VIFH{S, Nv1, Ni}, - ::ToCPU, -) where {S, Nv1, Ni} - # copy contiguous columns - _, _, _, _, Nh = size(dest1) - for (dest, bc) in fmbc.pairs - @inbounds for h in 1:Nh, i in 1:Ni, v in 1:Nv1 - I = CartesianIndex(i, 1, 1, v, h) - bcI = isascalar(bc) ? bc[] : bc[I] - dest[I] = convert(eltype(dest), bcI) - end - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest1::VF{S1, Nv1}, - ::ToCPU, -) where {S1, Nv1} - for (dest, bc) in fmbc.pairs - @inbounds for v in 1:Nv1 - I = CartesianIndex(1, 1, 1, v, 1) - bcI = isascalar(bc) ? bc[] : bc[I] - dest[I] = convert(eltype(dest), bcI) - end - end - return nothing -end - -function fused_copyto!( - fmbc::FusedMultiBroadcast, - dest::DataF{S}, - ::ToCPU, -) where {S} - for (dest, bc) in fmbc.pairs - @inbounds dest[] = convert(S, bc[]) - end - return dest -end - -# we've already diagonalized dest, so we only need to make -# sure that all the broadcast axes are compatible. -# Logic here is similar to Base.Broadcast.instantiate -@inline function _check_fused_broadcast_axes(bc1, bc2) - axes = Base.Broadcast.combine_axes(bc1.args..., bc2.args...) - if !(axes isa Nothing) - Base.Broadcast.check_broadcast_axes(axes, bc1.args...) - Base.Broadcast.check_broadcast_axes(axes, bc2.args...) - end -end - -@inline check_fused_broadcast_axes(fmbc::FusedMultiBroadcast) = - check_fused_broadcast_axes( - map(x -> x.second, fmbc.pairs), - first(fmbc.pairs).second, - ) -@inline check_fused_broadcast_axes(bcs::Tuple{<:Any}, bc1) = - _check_fused_broadcast_axes(first(bcs), bc1) -@inline check_fused_broadcast_axes(bcs::Tuple{}, bc1) = nothing -@inline function check_fused_broadcast_axes(bcs::Tuple, bc1) - _check_fused_broadcast_axes(first(bcs), bc1) - check_fused_broadcast_axes(Base.tail(bcs), bc1) -end diff --git a/src/DataLayouts/copyto.jl b/src/DataLayouts/copyto.jl new file mode 100644 index 0000000000..eae87bc1c9 --- /dev/null +++ b/src/DataLayouts/copyto.jl @@ -0,0 +1,155 @@ +##### +##### Dispatching and edge cases +##### + +Base.copyto!( + dest::AbstractData, + bc::Union{AbstractData, Base.Broadcast.Broadcasted}, +) = Base.copyto!(dest, bc, device_dispatch(dest)) + +# Specialize on non-Broadcasted objects +function Base.copyto!(dest::D, src::D) where {D <: AbstractData} + copyto!(parent(dest), parent(src)) + return dest +end + +# broadcasting scalar assignment +# Performance optimization for the common identity scalar case: dest .= val +# And this is valid for the CPU or GPU, since the broadcasted object +# is a scalar type. +function Base.copyto!( + dest::AbstractData, + bc::Base.Broadcast.Broadcasted{Style}, + ::AbstractDispatchToDevice, +) where { + Style <: + Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}}, +} + bc = Base.Broadcast.instantiate( + Base.Broadcast.Broadcasted{Style}(bc.f, bc.args, ()), + ) + @inbounds bc0 = bc[] + fill!(dest, bc0) +end + +##### +##### DataLayouts +##### + +function Base.copyto!( + dest::DataF{S}, + bc::BroadcastedUnionDataF{S, A}, + ::ToCPU, +) where {S, A} + @inbounds dest[] = convert(S, bc[]) + return dest +end + +function Base.copyto!( + dest::IJFH{S, Nij}, + bc::BroadcastedUnionIJFH{S, Nij}, + ::ToCPU, +) where {S, Nij} + _, _, _, _, Nh = size(bc) + @inbounds for h in 1:Nh + slab_dest = slab(dest, h) + slab_bc = slab(bc, h) + copyto!(slab_dest, slab_bc) + end + return dest +end + +function Base.copyto!( + dest::IFH{S, Ni}, + bc::BroadcastedUnionIFH{S, Ni}, + ::ToCPU, +) where {S, Ni} + _, _, _, _, Nh = size(bc) + @inbounds for h in 1:Nh + slab_dest = slab(dest, h) + slab_bc = slab(bc, h) + copyto!(slab_dest, slab_bc) + end + return dest +end + +# inline inner slab(::DataSlab2D) copy +function Base.copyto!( + dest::IJF{S, Nij}, + bc::BroadcastedUnionIJF{S, Nij, A}, + ::ToCPU, +) where {S, Nij, A} + @inbounds for j in 1:Nij, i in 1:Nij + idx = CartesianIndex(i, j, 1, 1, 1) + dest[idx] = convert(S, bc[idx]) + end + return dest +end + +function Base.copyto!( + dest::IF{S, Ni}, + bc::BroadcastedUnionIF{S, Ni, A}, + ::ToCPU, +) where {S, Ni, A} + @inbounds for i in 1:Ni + idx = CartesianIndex(i, 1, 1, 1, 1) + dest[idx] = convert(S, bc[idx]) + end + return dest +end + +# inline inner slab(::DataSlab1D) copy +function Base.copyto!( + dest::IF{S, Ni}, + bc::Base.Broadcast.Broadcasted{IFStyle{Ni, A}}, + ::ToCPU, +) where {S, Ni, A} + @inbounds for i in 1:Ni + idx = CartesianIndex(i, 1, 1, 1, 1) + dest[idx] = convert(S, bc[idx]) + end + return dest +end + +# inline inner column(::DataColumn) copy +function Base.copyto!( + dest::VF{S, Nv}, + bc::BroadcastedUnionVF{S, Nv, A}, + ::ToCPU, +) where {S, Nv, A} + @inbounds for v in 1:Nv + idx = CartesianIndex(1, 1, 1, v, 1) + dest[idx] = convert(S, bc[idx]) + end + return dest +end + +function Base.copyto!( + dest::VIFH{S, Nv, Ni}, + bc::BroadcastedUnionVIFH{S, Nv, Ni}, + ::ToCPU, +) where {S, Nv, Ni} + (_, _, _, _, Nh) = size(bc) + # copy contiguous columns + @inbounds for h in 1:Nh, i in 1:Ni + col_dest = column(dest, i, h) + col_bc = column(bc, i, h) + copyto!(col_dest, col_bc) + end + return dest +end + +function Base.copyto!( + dest::VIJFH{S, Nv, Nij}, + bc::BroadcastedUnionVIJFH{S, Nv, Nij}, + ::ToCPU, +) where {S, Nv, Nij} + # copy contiguous columns + _, _, _, _, Nh = size(dest) + @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij + col_dest = column(dest, i, j, h) + col_bc = column(bc, i, j, h) + copyto!(col_dest, col_bc) + end + return dest +end diff --git a/src/DataLayouts/fill.jl b/src/DataLayouts/fill.jl new file mode 100644 index 0000000000..1e3105810e --- /dev/null +++ b/src/DataLayouts/fill.jl @@ -0,0 +1,61 @@ +function Base.fill!(data::IJFH, val, ::ToCPU) + (_, _, _, _, Nh) = size(data) + @inbounds for h in 1:Nh + fill!(slab(data, h), val) + end + return data +end + +function Base.fill!(data::IFH, val, ::ToCPU) + (_, _, _, _, Nh) = size(data) + @inbounds for h in 1:Nh + fill!(slab(data, h), val) + end + return data +end + +function Base.fill!(data::DataF, val, ::ToCPU) + @inbounds data[] = val + return data +end + +function Base.fill!(data::IJF{S, Nij}, val, ::ToCPU) where {S, Nij} + @inbounds for j in 1:Nij, i in 1:Nij + data[i, j] = val + end + return data +end + +function Base.fill!(data::IF{S, Ni}, val, ::ToCPU) where {S, Ni} + @inbounds for i in 1:Ni + data[i] = val + end + return data +end + +function Base.fill!(data::VF, val, ::ToCPU) + Nv = nlevels(data) + @inbounds for v in 1:Nv + data[v] = val + end + return data +end + +function Base.fill!(data::VIJFH, val, ::ToCPU) + (Ni, Nj, _, Nv, Nh) = size(data) + @inbounds for h in 1:Nh, v in 1:Nv + fill!(slab(data, v, h), val) + end + return data +end + +function Base.fill!(data::VIFH, val, ::ToCPU) + (Ni, _, _, Nv, Nh) = size(data) + @inbounds for h in 1:Nh, v in 1:Nv + fill!(slab(data, v, h), val) + end + return data +end + +Base.fill!(dest::AbstractData, val) = + Base.fill!(dest, val, device_dispatch(dest)) diff --git a/src/DataLayouts/fused_copyto.jl b/src/DataLayouts/fused_copyto.jl new file mode 100644 index 0000000000..8589811792 --- /dev/null +++ b/src/DataLayouts/fused_copyto.jl @@ -0,0 +1,123 @@ + +# Fused multi-broadcast entry point for DataLayouts +function Base.copyto!( + fmbc::FusedMultiBroadcast{T}, +) where {N, T <: NTuple{N, Pair{<:AbstractData, <:Any}}} + dest1 = first(fmbc.pairs).first + fmb_inst = FusedMultiBroadcast( + map(fmbc.pairs) do pair + bc = pair.second + bc′ = if isascalar(bc) + Base.Broadcast.instantiate( + Base.Broadcast.Broadcasted(bc.style, bc.f, bc.args, ()), + ) + else + bc + end + Pair(pair.first, bc′) + end, + ) + # check_fused_broadcast_axes(fmbc) # we should already have checked the axes + fused_copyto!(fmb_inst, dest1, device_dispatch(dest1)) +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::VIJFH{S1, Nv1, Nij}, + ::ToCPU, +) where {S1, Nv1, Nij} + _, _, _, _, Nh = size(dest1) + for (dest, bc) in fmbc.pairs + # Base.copyto!(dest, bc) # we can just fall back like this + @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij, v in 1:Nv1 + I = CartesianIndex(i, j, 1, v, h) + bcI = isascalar(bc) ? bc[] : bc[I] + dest[I] = convert(eltype(dest), bcI) + end + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::IJFH{S, Nij}, + ::ToCPU, +) where {S, Nij} + # copy contiguous columns + _, _, _, Nv, Nh = size(dest1) + for (dest, bc) in fmbc.pairs + @inbounds for h in 1:Nh, j in 1:Nij, i in 1:Nij + I = CartesianIndex(i, j, 1, 1, h) + bcI = isascalar(bc) ? bc[] : bc[I] + dest[I] = convert(eltype(dest), bcI) + end + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::VIFH{S, Nv1, Ni}, + ::ToCPU, +) where {S, Nv1, Ni} + # copy contiguous columns + _, _, _, _, Nh = size(dest1) + for (dest, bc) in fmbc.pairs + @inbounds for h in 1:Nh, i in 1:Ni, v in 1:Nv1 + I = CartesianIndex(i, 1, 1, v, h) + bcI = isascalar(bc) ? bc[] : bc[I] + dest[I] = convert(eltype(dest), bcI) + end + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest1::VF{S1, Nv1}, + ::ToCPU, +) where {S1, Nv1} + for (dest, bc) in fmbc.pairs + @inbounds for v in 1:Nv1 + I = CartesianIndex(1, 1, 1, v, 1) + bcI = isascalar(bc) ? bc[] : bc[I] + dest[I] = convert(eltype(dest), bcI) + end + end + return nothing +end + +function fused_copyto!( + fmbc::FusedMultiBroadcast, + dest::DataF{S}, + ::ToCPU, +) where {S} + for (dest, bc) in fmbc.pairs + @inbounds dest[] = convert(S, bc[]) + end + return dest +end + +# we've already diagonalized dest, so we only need to make +# sure that all the broadcast axes are compatible. +# Logic here is similar to Base.Broadcast.instantiate +@inline function _check_fused_broadcast_axes(bc1, bc2) + axes = Base.Broadcast.combine_axes(bc1.args..., bc2.args...) + if !(axes isa Nothing) + Base.Broadcast.check_broadcast_axes(axes, bc1.args...) + Base.Broadcast.check_broadcast_axes(axes, bc2.args...) + end +end + +@inline check_fused_broadcast_axes(fmbc::FusedMultiBroadcast) = + check_fused_broadcast_axes( + map(x -> x.second, fmbc.pairs), + first(fmbc.pairs).second, + ) +@inline check_fused_broadcast_axes(bcs::Tuple{<:Any}, bc1) = + _check_fused_broadcast_axes(first(bcs), bc1) +@inline check_fused_broadcast_axes(bcs::Tuple{}, bc1) = nothing +@inline function check_fused_broadcast_axes(bcs::Tuple, bc1) + _check_fused_broadcast_axes(first(bcs), bc1) + check_fused_broadcast_axes(Base.tail(bcs), bc1) +end diff --git a/src/DataLayouts/mapreduce.jl b/src/DataLayouts/mapreduce.jl new file mode 100644 index 0000000000..5e63477c09 --- /dev/null +++ b/src/DataLayouts/mapreduce.jl @@ -0,0 +1,103 @@ +# This is only defined for testing. +function mapreduce_cuda end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionDataF{<:Any, A}, +) where {F, Op, A} + mapreduce(op, 1) do v + Base.@_inline_meta + @inbounds fn(bc[]) + end +end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionIJFH{<:Any, Nij, A}, +) where {F, Op, Nij, A} + # mapreduce across DataSlab2D + _, _, _, _, Nh = size(bc) + mapreduce(op, 1:Nh) do h + Base.@_inline_meta + slabview = @inbounds slab(bc, h) + mapreduce(fn, op, slabview) + end +end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionIFH{<:Any, Ni, A}, +) where {F, Op, Ni, A} + # mapreduce across DataSlab1D + _, _, _, _, Nh = size(bc) + mapreduce(op, 1:Nh) do h + Base.@_inline_meta + slabview = @inbounds slab(bc, h) + mapreduce(fn, op, slabview) + end +end + +function Base.mapreduce(fn::F, op::Op, bc::IJF{S, Nij}) where {F, Op, S, Nij} + # mapreduce across DataSlab2D nodes + mapreduce(op, Iterators.product(1:Nij, 1:Nij)) do (i, j) + Base.@_inline_meta + idx = CartesianIndex(i, j, 1, 1, 1) + node = @inbounds bc[idx] + fn(node) + end +end + +function Base.mapreduce(fn::F, op::Op, bc::IF{S, Ni}) where {F, Op, S, Ni} + # mapreduce across DataSlab1D nodes + mapreduce(op, 1:Ni) do i + Base.@_inline_meta + idx = CartesianIndex(i, 1, 1, 1, 1) + node = @inbounds bc[idx] + fn(node) + end +end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionVF{<:Any, Nv, A}, +) where {F, Op, Nv, A} + # mapreduce across DataColumn levels + mapreduce(op, 1:Nv) do v + Base.@_inline_meta + idx = CartesianIndex(1, 1, 1, v, 1) + level = @inbounds bc[idx] + fn(level) + end +end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionVIFH{<:Any, Nv, Ni, A}, +) where {F, Op, Nv, Ni, A} + # mapreduce across columns + _, _, _, _, Nh = size(bc) + mapreduce(op, Iterators.product(1:Ni, 1:Nh)) do (i, h) + Base.@_inline_meta + columnview = @inbounds column(bc, i, h) + mapreduce(fn, op, columnview) + end +end + +function Base.mapreduce( + fn::F, + op::Op, + bc::BroadcastedUnionVIJFH{<:Any, Nv, Nij, A}, +) where {F, Op, Nv, Nij, A} + # mapreduce across columns + _, _, _, _, Nh = size(bc) + mapreduce(op, Iterators.product(1:Nij, 1:Nij, 1:Nh)) do (i, j, h) + Base.@_inline_meta + columnview = @inbounds column(bc, i, j, h) + mapreduce(fn, op, columnview) + end +end