Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use prescribed thread-block configurations #1969

Merged
merged 1 commit into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
import ClimaCore.RecursiveApply:
⊞, ⊠, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
import ClimaCore.DataLayouts: UniversalSize

include(joinpath("cuda", "cuda_utils.jl"))
include(joinpath("cuda", "data_layouts.jl"))
Expand Down
6 changes: 6 additions & 0 deletions ext/cuda/cuda_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ function auto_launch!(
return nothing
end

"""
    threads_via_occupancy(f!, args)

Compile the kernel `f!` for `args` (without launching it) and return the
number of threads per block suggested by CUDA's occupancy API for it.
"""
function threads_via_occupancy(f!::F!, args) where {F!}
    # `launch = false` compiles the kernel so we can query its launch config.
    kern = CUDA.@cuda always_inline = true launch = false f!(args...)
    cfg = CUDA.launch_configuration(kern.fun)
    return cfg.threads
end

"""
thread_index()
Expand Down
1 change: 1 addition & 0 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ include("data_layouts_fill.jl")
include("data_layouts_copyto.jl")
include("data_layouts_fused_copyto.jl")
include("data_layouts_mapreduce.jl")
include("data_layouts_threadblock.jl")

# Adapt a broadcast function for device use: plain callables go through
# `Adapt.adapt`, while a bare type `F` is wrapped in a constructor closure
# so that `Adapt` does not attempt to adapt the type object itself.
adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
Expand Down
107 changes: 17 additions & 90 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
@@ -1,111 +1,38 @@
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

function knl_copyto!(dest, src)

i = CUDA.threadIdx().x
j = CUDA.threadIdx().y

h = CUDA.blockIdx().x
v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z

if v <= size(dest, 4)
I = CartesianIndex((i, j, 1, v, h))
# Device kernel: copy `src[I]` into `dest[I]` at this thread's universal
# index; threads whose index falls outside `us` do nothing.
function knl_copyto!(dest, src, us)
    I = universal_index(dest)
    is_valid_index(dest, I, us) && (@inbounds dest[I] = src[I])
    return nothing
end

# Copy a broadcasted object into an IJFH layout on the GPU.
# Launch shape: (Nij, Nij) threads per block (one thread per nodal point)
# and (Nh, 1) blocks (one block per element). The Nh > 0 guard avoids
# launching with an empty grid.
function Base.copyto!(
    dest::IJFH{S, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
    ::ToCUDA,
) where {S, Nij, Nh}
    if Nh > 0
        auto_launch!(
            knl_copyto!,
            (dest, bc);
            threads_s = (Nij, Nij),
            blocks_s = (Nh, 1),
        )
    end
    return dest
end

# Copy a broadcasted object into a VIJFH layout on the GPU.
# Packs Nv_per_block vertical levels together with the (Nij, Nij) nodal
# points in each thread block, and splits the vertical dimension across
# Nv_blocks grid blocks.
function Base.copyto!(
    dest::VIJFH{S, Nv, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
    ::ToCUDA,
) where {S, Nv, Nij, Nh}
    if Nv > 0 && Nh > 0
        # fld(256, Nij * Nij) keeps Nij * Nij * Nv_per_block ≤ 256 threads.
        Nv_per_block = min(Nv, fld(256, Nij * Nij))
        Nv_blocks = cld(Nv, Nv_per_block)
        auto_launch!(
            knl_copyto!,
            (dest, bc);
            threads_s = (Nij, Nij, Nv_per_block),
            blocks_s = (Nh, Nv_blocks),
        )
    end
    return dest
end

# Copy a broadcasted object into a VF (single-column) layout on the GPU:
# one grid block per vertical level, a single thread each. The Nv > 0
# guard avoids an empty launch.
function Base.copyto!(
    dest::VF{S, Nv},
    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
    ::ToCUDA,
) where {S, Nv}
    if Nv > 0
        auto_launch!(
            knl_copyto!,
            (dest, bc);
            threads_s = (1, 1),
            blocks_s = (1, Nv),
        )
    end
    return dest
end

# Copy a broadcasted object into a DataF (single value) on the GPU:
# a 1-thread, 1-block launch suffices.
function Base.copyto!(
    dest::DataF{S},
    bc::DataLayouts.BroadcastedUnionDataF{S},
    ::ToCUDA,
) where {S}
    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
    return dest
end

import ClimaCore.DataLayouts: isascalar
# Flat-indexed device copy kernel: each thread maps its linear thread index
# to a Cartesian index over `dest` and copies one element from `bc`.
# NOTE: the `≤` in the bounds check was dropped by the page scrape and is
# restored here.
function knl_copyto_flat!(dest::AbstractData, bc, us)
    @inbounds begin
        tidx = thread_index()
        # Guard against threads beyond the total item count N.
        if tidx ≤ get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
            dest[I] = bc[I]
        end
    end
    return nothing
end

# Generic CUDA copyto! driver: derive a thread/block partition from the
# kernel's occupancy-suggested thread count and launch `knl_copyto!`.
# (The diff scrape had interleaved the removed `knl_copyto_flat!` launch
# lines into this body; they are dropped here.)
function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
    if Nv > 0 && Nh > 0
        args = (dest, bc, us)
        # Never request more threads than there are items to copy.
        threads = threads_via_occupancy(knl_copyto!, args)
        n_max_threads = min(threads, get_N(us))
        p = partition(dest, n_max_threads)
        auto_launch!(
            knl_copyto!,
            args;
            threads_s = p.threads,
            blocks_s = p.blocks,
        )
    end
    return dest
end

# TODO: can we use CUDA's launch configuration for all data layouts?
# Currently, it seems to have a slight performance degradation.
# All layouts dispatch to the generic occupancy-based `cuda_copyto!`.
# (Stale commented-out duplicates from the diff have been removed, and the
# "luanch" typo fixed.)
#! format: off
Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
#! format: on
37 changes: 16 additions & 21 deletions ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
@@ -1,32 +1,27 @@
function knl_fill_flat!(dest::AbstractData, val, us)
@inbounds begin
tidx = thread_index()
if tidx get_N(us)
n = size(dest)
I = kernel_indexes(tidx, n)
@inbounds dest[I] = val
end
# Device kernel: write `val` into `dest` at this thread's universal index;
# out-of-range threads (per `us`) are a no-op.
function knl_fill!(dest, val, us)
    I = universal_index(dest)
    is_valid_index(dest, I, us) && (@inbounds dest[I] = val)
    return nothing
end

# Generic CUDA fill! driver, mirroring `cuda_copyto!`: partition `dest` by
# the occupancy-suggested thread count and launch `knl_fill!`.
# (The scrape interleaved two signatures — `val` and `bc` — and the removed
# `knl_fill_flat!` launch lines; the argument is a fill value, so it is
# named `val`, matching the `Base.fill!` forwarder below.)
function cuda_fill!(dest::AbstractData, val)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
    if Nv > 0 && Nh > 0
        args = (dest, val, us)
        # Never request more threads than there are items to fill.
        threads = threads_via_occupancy(knl_fill!, args)
        n_max_threads = min(threads, get_N(us))
        p = partition(dest, n_max_threads)
        auto_launch!(
            knl_fill!,
            args;
            threads_s = p.threads,
            blocks_s = p.blocks,
        )
    end
    return dest
end

# All concrete data layouts share the same CUDA fill path; the per-layout
# one-liners this diff removed are superseded by this single method.
Base.fill!(dest::AbstractData, val, ::ToCUDA) = cuda_fill!(dest, val)
101 changes: 27 additions & 74 deletions ext/cuda/data_layouts_fused_copyto.jl
Original file line number Diff line number Diff line change
@@ -1,106 +1,59 @@
# Recursively copy each (dest, bc) pair of a fused multi-broadcast at index
# `I`, bounds-checking `I` against each destination via `us`.
# (The scrape merged the removed `v`-based variants into the new `us`-based
# ones — the first method's parameter list read `I, v, us` — reconstructed
# here as the added side of the diff.)
Base.@propagate_inbounds function rcopyto_at!(
    pair::Pair{<:AbstractData, <:Any},
    I,
    us,
)
    dest, bc = pair.first, pair.second
    if is_valid_index(dest, I, us)
        # A scalar broadcasted object is read without an index.
        dest[I] = isascalar(bc) ? bc[] : bc[I]
    end
    return nothing
end
# DataF destinations hold a single value, so write through `dest[]`.
Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
    dest, bc = pair.first, pair.second
    if is_valid_index(dest, I, us)
        bcI = isascalar(bc) ? bc[] : bc[I]
        dest[] = bcI
    end
    return nothing
end
# Recurse over the tuple of pairs; the empty tuple terminates the recursion.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
    rcopyto_at!(first(pairs), I, us)
    rcopyto_at!(Base.tail(pairs), I, us)
end
Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
    rcopyto_at!(first(pairs), I, us)
@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing

# Device kernel for a fused multi-broadcast: each thread derives its
# universal index from `dest1` (the representative destination) and, if it
# is in range per `us`, copies every pair at that index.
# (The removed threadIdx/blockIdx indexing lines the scrape interleaved
# here are dropped.)
function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
    @inbounds begin
        I = universal_index(dest1)
        if is_valid_index(dest1, I, us)
            (; pairs) = fmbc
            rcopyto_at!(pairs, I, us)
        end
    end
    return nothing
end

# Host driver for fused multi-broadcast on the GPU: a single generic method
# for all data layouts, partitioning `dest1` by the kernel's
# occupancy-suggested thread count.
# (The scrape interleaved the three removed per-layout methods — VIJFH,
# IJFH, VF — with this added generic method; reconstructed as the added
# side of the diff.)
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::DataLayouts.AbstractData,
    ::ToCUDA,
)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest1)
    if Nv > 0 && Nh > 0
        us = DataLayouts.UniversalSize(dest1)
        args = (fmbc, dest1, us)
        # Never request more threads than there are items to process.
        threads = threads_via_occupancy(knl_fused_copyto!, args)
        n_max_threads = min(threads, get_N(us))
        p = partition(dest1, n_max_threads)
        auto_launch!(
            knl_fused_copyto!,
            args;
            threads_s = p.threads,
            blocks_s = p.blocks,
        )
    end
    return nothing
end

# Fused copy into a DataF (single value) destination: a 1-thread, 1-block
# launch suffices. NOTE(review): shown as deleted context in this diff —
# superseded by the generic AbstractData method.
function fused_copyto!(
    fmbc::FusedMultiBroadcast,
    dest1::DataF{S},
    ::ToCUDA,
) where {S}
    auto_launch!(
        knl_fused_copyto!,
        (fmbc,);
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return nothing
end
Loading
Loading