Skip to content

Commit

Permalink
Merge pull request #1869 from CliMA/ck/reorganize
Browse files Browse the repository at this point in the history
Move copyto, fill and mapreduce to their own files
  • Loading branch information
charleskawczynski authored Jul 5, 2024
2 parents f9d110a + 88cf5e6 commit 4bd8ade
Show file tree
Hide file tree
Showing 8 changed files with 558 additions and 547 deletions.
114 changes: 1 addition & 113 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,121 +27,9 @@ Base.similar(

include("data_layouts_fill.jl")
include("data_layouts_copyto.jl")
include("data_layouts_fused_copyto.jl")
include("data_layouts_mapreduce.jl")

# Copy a single element of the broadcast `bc` into `dest` at index `I`.
#
# `pair` is `dest => bc`. The write only happens when `v` lies in
# `1:size(dest, 4)`; for any other `v` the call is a no-op. A broadcast for
# which `isascalar(bc)` holds is read with `bc[]`, otherwise with `bc[I]`.
# Always returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(
    pair::Pair{<:AbstractData, <:Any},
    I,
    v,
)
    dest, bc = pair.first, pair.second
    # Chained comparison: the operator between `1` and `v` was missing,
    # which made this line unparseable.
    if 1 <= v <= size(dest, 4)
        dest[I] = isascalar(bc) ? bc[] : bc[I]
    end
    return nothing
end
# `DataF` destination variant: evaluate the broadcast `bc` (via `bc[]` when
# `isascalar(bc)`, otherwise `bc[I]`) and store it with `dest[] = ...`.
#
# The store is skipped unless `v` lies in `1:size(dest, 4)`.
# Always returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v)
    dest, bc = pair.first, pair.second
    # Chained comparison: the operator between `1` and `v` was missing,
    # which made this line unparseable.
    if 1 <= v <= size(dest, 4)
        bcI = isascalar(bc) ? bc[] : bc[I]
        dest[] = bcI
    end
    return nothing
end
# Tuple recursion: apply `rcopyto_at!` to the first entry of `pairs`, then
# recurse on the remaining entries. The result of the recursive call is the
# value of the function.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v)
    head = first(pairs)
    rest = Base.tail(pairs)
    rcopyto_at!(head, I, v)
    return rcopyto_at!(rest, I, v)
end
# Single-entry tuple: forward to the method for that entry.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple{<:Any}, I, v)
    return rcopyto_at!(first(pairs), I, v)
end
# Empty tuple: nothing left to copy.
@inline function rcopyto_at!(pairs::Tuple{}, I, v)
    return nothing
end

# CUDA kernel body for a fused multi-broadcast.
#
# Index mapping used below:
#   i, j <- thread x / y
#   h    <- block x
#   v    <- blockDim.z * (block y - 1) + thread z
# The index `(i, j, 1, v, h)` is handed to `rcopyto_at!` together with `v`
# for every pair stored in `fmbc.pairs`. Returns `nothing`.
function knl_fused_copyto!(fmbc::FusedMultiBroadcast)
    @inbounds begin
        tid = CUDA.threadIdx()
        bid = CUDA.blockIdx()

        i, j = tid.x, tid.y
        h = bid.x
        v = CUDA.blockDim().z * (bid.y - 1) + tid.z

        I = CartesianIndex((i, j, 1, v, h))
        rcopyto_at!(fmbc.pairs, I, v)
    end
    return nothing
end

# Launch `knl_fused_copyto!` for a `VIJFH` destination.
#
# Threads are laid out as `(Nij, Nij, Nv_per_block)` and blocks as
# `(Nh, Nv_blocks)`, where `Nv_per_block = min(Nv, fld(256, Nij * Nij))`.
# Nothing is launched when `Nv` or `Nh` is zero. Returns `nothing`.
#
# NOTE(review): if `Nij * Nij > 256`, `fld(256, Nij * Nij)` is 0 and
# `cld(Nv, 0)` throws a `DivideError` -- confirm callers keep `Nij <= 16`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::VIJFH{S, Nv, Nij},
    ::ToCUDA,
) where {S, Nv, Nij}
    Nh = size(dest1)[5]
    (Nv > 0 && Nh > 0) || return nothing

    levels_cap = fld(256, Nij * Nij)
    Nv_per_block = min(Nv, levels_cap)
    Nv_blocks = cld(Nv, Nv_per_block)
    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (Nij, Nij, Nv_per_block),
        blocks_s = (Nh, Nv_blocks),
    )
    return nothing
end

# Launch `knl_fused_copyto!` for an `IJFH` destination with `(Nij, Nij)`
# threads and `(Nh, 1)` blocks, where `Nh` is the fifth entry of
# `size(dest1)`. Nothing is launched when `Nh` is zero. Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::IJFH{S, Nij},
    ::ToCUDA,
) where {S, Nij}
    Nh = size(dest1)[5]
    Nh > 0 || return nothing

    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (Nij, Nij),
        blocks_s = (Nh, 1),
    )
    return nothing
end
# Launch `knl_fused_copyto!` for a `VF` destination with `(1, 1)` threads
# and `(Nh, Nv)` blocks, where `Nh` is the fifth entry of `size(dest1)`.
# Nothing is launched when `Nv` or `Nh` is zero. Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::VF{S, Nv},
    ::ToCUDA,
) where {S, Nv}
    Nh = size(dest1)[5]
    (Nv > 0 && Nh > 0) || return nothing

    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (1, 1),
        blocks_s = (Nh, Nv),
    )
    return nothing
end

# Launch `knl_fused_copyto!` for a `DataF` destination using a single
# `(1, 1)` thread layout in a single `(1, 1)` block layout.
# Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::DataF{S},
    ::ToCUDA,
) where {S}
    kernel_args = (fmbc,)
    auto_launch!(
        knl_fused_copyto!,
        kernel_args,
        dest1;
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return nothing
end


# Adapt a callable `f` for the target `to` via `Adapt.adapt`.
function adapt_f(to, f::F) where {F}
    return Adapt.adapt(to, f)
end
# When the "function" is a type `F`, return a closure that forwards its
# arguments to the constructor `F(...)`; `to` is not used in this case.
function adapt_f(to, ::Type{F}) where {F}
    return (args...) -> F(args...)
end

Expand Down
111 changes: 111 additions & 0 deletions ext/cuda/data_layouts_fused_copyto.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copy a single element of the broadcast `bc` into `dest` at index `I`.
#
# `pair` is `dest => bc`. The write only happens when `v` lies in
# `1:size(dest, 4)`; for any other `v` the call is a no-op. A broadcast for
# which `isascalar(bc)` holds is read with `bc[]`, otherwise with `bc[I]`.
# Always returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(
    pair::Pair{<:AbstractData, <:Any},
    I,
    v,
)
    dest, bc = pair.first, pair.second
    # Chained comparison: the operator between `1` and `v` was missing,
    # which made this line unparseable.
    if 1 <= v <= size(dest, 4)
        dest[I] = isascalar(bc) ? bc[] : bc[I]
    end
    return nothing
end
# `DataF` destination variant: evaluate the broadcast `bc` (via `bc[]` when
# `isascalar(bc)`, otherwise `bc[I]`) and store it with `dest[] = ...`.
#
# The store is skipped unless `v` lies in `1:size(dest, 4)`.
# Always returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v)
    dest, bc = pair.first, pair.second
    # Chained comparison: the operator between `1` and `v` was missing,
    # which made this line unparseable.
    if 1 <= v <= size(dest, 4)
        bcI = isascalar(bc) ? bc[] : bc[I]
        dest[] = bcI
    end
    return nothing
end
# Tuple recursion: apply `rcopyto_at!` to the first entry of `pairs`, then
# recurse on the remaining entries. The result of the recursive call is the
# value of the function.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v)
    head = first(pairs)
    rest = Base.tail(pairs)
    rcopyto_at!(head, I, v)
    return rcopyto_at!(rest, I, v)
end
# Single-entry tuple: forward to the method for that entry.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple{<:Any}, I, v)
    return rcopyto_at!(first(pairs), I, v)
end
# Empty tuple: nothing left to copy.
@inline function rcopyto_at!(pairs::Tuple{}, I, v)
    return nothing
end

# CUDA kernel body for a fused multi-broadcast.
#
# Index mapping used below:
#   i, j <- thread x / y
#   h    <- block x
#   v    <- blockDim.z * (block y - 1) + thread z
# The index `(i, j, 1, v, h)` is handed to `rcopyto_at!` together with `v`
# for every pair stored in `fmbc.pairs`. Returns `nothing`.
function knl_fused_copyto!(fmbc::FusedMultiBroadcast)
    @inbounds begin
        tid = CUDA.threadIdx()
        bid = CUDA.blockIdx()

        i, j = tid.x, tid.y
        h = bid.x
        v = CUDA.blockDim().z * (bid.y - 1) + tid.z

        I = CartesianIndex((i, j, 1, v, h))
        rcopyto_at!(fmbc.pairs, I, v)
    end
    return nothing
end

# Launch `knl_fused_copyto!` for a `VIJFH` destination.
#
# Threads are laid out as `(Nij, Nij, Nv_per_block)` and blocks as
# `(Nh, Nv_blocks)`, where `Nv_per_block = min(Nv, fld(256, Nij * Nij))`.
# Nothing is launched when `Nv` or `Nh` is zero. Returns `nothing`.
#
# NOTE(review): if `Nij * Nij > 256`, `fld(256, Nij * Nij)` is 0 and
# `cld(Nv, 0)` throws a `DivideError` -- confirm callers keep `Nij <= 16`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::VIJFH{S, Nv, Nij},
    ::ToCUDA,
) where {S, Nv, Nij}
    Nh = size(dest1)[5]
    (Nv > 0 && Nh > 0) || return nothing

    levels_cap = fld(256, Nij * Nij)
    Nv_per_block = min(Nv, levels_cap)
    Nv_blocks = cld(Nv, Nv_per_block)
    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (Nij, Nij, Nv_per_block),
        blocks_s = (Nh, Nv_blocks),
    )
    return nothing
end

# Launch `knl_fused_copyto!` for an `IJFH` destination with `(Nij, Nij)`
# threads and `(Nh, 1)` blocks, where `Nh` is the fifth entry of
# `size(dest1)`. Nothing is launched when `Nh` is zero. Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::IJFH{S, Nij},
    ::ToCUDA,
) where {S, Nij}
    Nh = size(dest1)[5]
    Nh > 0 || return nothing

    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (Nij, Nij),
        blocks_s = (Nh, 1),
    )
    return nothing
end
# Launch `knl_fused_copyto!` for a `VF` destination with `(1, 1)` threads
# and `(Nh, Nv)` blocks, where `Nh` is the fifth entry of `size(dest1)`.
# Nothing is launched when `Nv` or `Nh` is zero. Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::VF{S, Nv},
    ::ToCUDA,
) where {S, Nv}
    Nh = size(dest1)[5]
    (Nv > 0 && Nh > 0) || return nothing

    auto_launch!(
        knl_fused_copyto!,
        (fmbc,),
        dest1;
        threads_s = (1, 1),
        blocks_s = (Nh, Nv),
    )
    return nothing
end

# Launch `knl_fused_copyto!` for a `DataF` destination using a single
# `(1, 1)` thread layout in a single `(1, 1)` block layout.
# Returns `nothing`.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::DataF{S},
    ::ToCUDA,
) where {S}
    kernel_args = (fmbc,)
    auto_launch!(
        knl_fused_copyto!,
        kernel_args,
        dest1;
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return nothing
end
69 changes: 4 additions & 65 deletions src/DataLayouts/DataLayouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,6 @@ Base.parent(data::AbstractData) = getfield(data, :array)

Base.similar(data::AbstractData{S}) where {S} = similar(data, S)

# Copy between two data layouts of the same type `D` by copying the parent
# array of `src` into the parent array of `dest`. Returns `dest`.
function Base.copyto!(dest::D, src::D) where {D <: AbstractData}
    dest_array = parent(dest)
    src_array = parent(src)
    copyto!(dest_array, src_array)
    return dest
end

# Return `typesize(T, S)`, where `T` is the element type of the parent
# array of `data` and `S` is the layout's type parameter.
@inline function ncomponents(data::AbstractData{S}) where {S}
    T = eltype(parent(data))
    return typesize(T, S)
end
Expand Down Expand Up @@ -330,15 +325,6 @@ function Base.size(data::IJFH{S, Nij}) where {S, Nij}
(Nij, Nij, 1, Nv, Nh)
end

# CPU fill for `IJFH`: fill `slab(data, h)` with `val` for each `h` in
# `1:Nh`, where `Nh` is the fifth entry of `size(data)`. Returns `data`.
function Base.fill!(data::IJFH, val, ::ToCPU)
    Nh = size(data)[5]
    @inbounds for h in 1:Nh
        data_slab = slab(data, h)
        fill!(data_slab, val)
    end
    return data
end


function IJFH{S, Nij}(::Type{ArrayType}, nelements) where {S, Nij, ArrayType}
T = eltype(ArrayType)
IJFH{S, Nij}(ArrayType(undef, Nij, Nij, typesize(T, S), nelements))
Expand Down Expand Up @@ -499,14 +485,6 @@ function Base.size(data::IFH{S, Ni}) where {S, Ni}
(Ni, 1, 1, Nv, Nh)
end

# CPU fill for `IFH`: fill `slab(data, h)` with `val` for each `h` in
# `1:Nh`, where `Nh` is the fifth entry of `size(data)`. Returns `data`.
function Base.fill!(data::IFH, val, ::ToCPU)
    Nh = size(data)[5]
    @inbounds for h in 1:Nh
        data_slab = slab(data, h)
        fill!(data_slab, val)
    end
    return data
end

@inline function slab(data::IFH{S, Ni}, h::Integer) where {S, Ni}
@boundscheck (1 <= h <= size(parent(data), 3)) ||
throw(BoundsError(data, (h,)))
Expand Down Expand Up @@ -623,10 +601,6 @@ function DataF(x::T) where {T}
end


# CPU fill for `DataF`: store `val` with a zero-index `setindex!`
# (the call form of `data[] = val`). Returns `data`.
function Base.fill!(data::DataF, val, ::ToCPU)
    @inbounds setindex!(data, val)
    return data
end
@inline function Base.getproperty(data::DataF{S}, i::Integer) where {S}
array = parent(data)
T = eltype(array)
Expand Down Expand Up @@ -751,12 +725,6 @@ end
# Size of an `IJF` layout as a 5-tuple: `Nij` in the first two slots and 1
# in the remaining three.
Base.size(data::IJF{S, Nij}) where {S, Nij} = (Nij, Nij, 1, 1, 1)
# CPU fill for `IJF`: assign `val` to `data[i, j]` for every
# `i, j` in `1:Nij`, with `i` varying fastest. Returns `data`.
function Base.fill!(data::IJF{S, Nij}, val, ::ToCPU) where {S, Nij}
    @inbounds for j in 1:Nij
        for i in 1:Nij
            data[i, j] = val
        end
    end
    return data
end

@generated function _property_view(
data::IJF{S, Nij, A},
Expand Down Expand Up @@ -889,14 +857,6 @@ function replace_basetype(data::IF{S, Ni}, ::Type{T}) where {S, Ni, T}
return IF{S′, Ni}(similar(array, T))
end

# CPU fill for `IF`: assign `val` to `data[i]` for every `i` in `1:Ni`.
# Returns `data`.
function Base.fill!(data::IF{S, Ni}, val, ::ToCPU) where {S, Ni}
    node_range = 1:Ni
    @inbounds for node in node_range
        data[node] = val
    end
    return data
end


@generated function _property_view(
data::IF{S, Ni, A},
::Val{Idx},
Expand Down Expand Up @@ -1003,14 +963,6 @@ Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, 1, Nv, 1)

# Return the `Nv` type parameter of a `VF` layout.
function nlevels(::VF{S, Nv}) where {S, Nv}
    return Nv
end

# CPU fill for `VF`: assign `val` to `data[v]` for every `v` in
# `1:nlevels(data)`. Returns `data`.
function Base.fill!(data::VF, val, ::ToCPU)
    level_range = 1:nlevels(data)
    @inbounds for level in level_range
        data[level] = val
    end
    return data
end

@generated function _property_view(
data::VF{S, Nv, A},
::Val{Idx},
Expand Down Expand Up @@ -1128,14 +1080,6 @@ function Base.length(data::VIJFH)
size(parent(data), 1) * size(parent(data), 5)
end

# CPU fill for `VIJFH`: fill `slab(data, v, h)` with `val` for every
# `v` in `1:Nv` and `h` in `1:Nh` (fourth and fifth entries of
# `size(data)`), with `v` varying fastest. Returns `data`.
function Base.fill!(data::VIJFH, val, ::ToCPU)
    # Only the last two extents are needed; the first three are discarded
    # instead of being bound to unused locals.
    (_, _, _, Nv, Nh) = size(data)
    @inbounds for h in 1:Nh, v in 1:Nv
        fill!(slab(data, v, h), val)
    end
    return data
end

@generated function _property_view(
data::VIJFH{S, Nv, Nij, A},
::Val{Idx},
Expand Down Expand Up @@ -1295,13 +1239,6 @@ end
# Length of a `VIFH` layout: `nlevels(data)` multiplied by the extent of
# the parent array along its fourth dimension.
function Base.length(data::VIFH)
    n_levels = nlevels(data)
    n_dim4 = size(parent(data), 4)
    return n_levels * n_dim4
end
# CPU fill for `VIFH`: fill `slab(data, v, h)` with `val` for every
# `v` in `1:Nv` and `h` in `1:Nh` (fourth and fifth entries of
# `size(data)`), with `v` varying fastest. Returns `data`.
function Base.fill!(data::VIFH, val, ::ToCPU)
    # Only the last two extents are needed; the first three are discarded
    # instead of being bound to an unused local.
    (_, _, _, Nv, Nh) = size(data)
    @inbounds for h in 1:Nh, v in 1:Nv
        fill!(slab(data, v, h), val)
    end
    return data
end

@propagate_inbounds function Base.getindex(
data::VIFH{S},
Expand Down Expand Up @@ -1630,7 +1567,9 @@ _device_dispatch(x::AbstractData) = _device_dispatch(parent(x))
# `SArray` and `MArray` values dispatch to the CPU code path.
function _device_dispatch(x::SArray)
    return ToCPU()
end
function _device_dispatch(x::MArray)
    return ToCPU()
end

# Two-argument `fill!` entry point: look up the device tag with
# `device_dispatch(dest)` and forward to the three-argument method.
function Base.fill!(dest::AbstractData, val)
    device = device_dispatch(dest)
    return Base.fill!(dest, val, device)
end
include("copyto.jl")
include("fused_copyto.jl")
include("fill.jl")
include("mapreduce.jl")

end # module
Loading

0 comments on commit 4bd8ade

Please sign in to comment.