Skip to content

Commit

Permalink
Use prescribed thread-block configurations
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Sep 6, 2024
1 parent add3b1b commit 78058a0
Show file tree
Hide file tree
Showing 11 changed files with 330 additions and 170 deletions.
1 change: 1 addition & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
import ClimaCore.RecursiveApply:
⊞, ⊠, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
import ClimaCore.DataLayouts: UniversalSize

include(joinpath("cuda", "cuda_utils.jl"))
include(joinpath("cuda", "data_layouts.jl"))
Expand Down
6 changes: 6 additions & 0 deletions ext/cuda/cuda_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ function auto_launch!(
return nothing
end

"""
    threads_via_occupancy(f!, args)

Compile the kernel call `f!(args...)` with `launch = false` (so nothing is
executed) and return the `threads` field of the `CUDA.launch_configuration`
result for the compiled kernel.
"""
function threads_via_occupancy(f!::F!, args) where {F!}
    compiled = CUDA.@cuda always_inline = true launch = false f!(args...)
    return CUDA.launch_configuration(compiled.fun).threads
end

"""
thread_index()
Expand Down
1 change: 1 addition & 0 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ include("data_layouts_fill.jl")
include("data_layouts_copyto.jl")
include("data_layouts_fused_copyto.jl")
include("data_layouts_mapreduce.jl")
include("data_layouts_threadblock.jl")

# Adapt the value `f` for the target `to` by delegating to `Adapt.adapt`.
adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
# When a type (rather than an instance) is passed, return a closure that
# forwards its arguments to the constructor `F`; `to` is unused by this method.
adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
Expand Down
107 changes: 17 additions & 90 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
@@ -1,111 +1,38 @@
# `_device_dispatch` returns a `ToCUDA()` instance for any `CUDA.CuArray` argument.
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

"""
    knl_copyto!(dest, src)

Kernel that assigns `dest[I] = src[I]` for a single index `I` per thread.

The index is built from the launch geometry: `i` and `j` come from the thread's
`x`/`y` position, `h` from the block's `x` position, and `v` from the block's
`y` position combined with the thread's `z` position. Threads for which
`v > size(dest, 4)` do nothing. Always returns `nothing`.
"""
function knl_copyto!(dest, src)
    i = CUDA.threadIdx().x
    j = CUDA.threadIdx().y

    h = CUDA.blockIdx().x
    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z

    # The last block along `y` may extend past the fourth dimension of `dest`.
    if v <= size(dest, 4)
        I = CartesianIndex((i, j, 1, v, h))
        @inbounds dest[I] = src[I]
    end
    return nothing
end

"""
    knl_copyto!(dest, src, us)

Kernel that assigns `dest[I] = src[I]` for the index `I` returned by
`universal_index(dest)`, provided `is_valid_index(dest, I, us)` holds.
Always returns `nothing`.

NOTE(review): `universal_index` and `is_valid_index` are defined outside this
file; presumably they map the calling thread to an index and bounds-check it
against `us` — confirm against their definitions.
"""
function knl_copyto!(dest, src, us)
    I = universal_index(dest)
    if is_valid_index(dest, I, us)
        @inbounds dest[I] = src[I]
    end
    return nothing
end

"""
    copyto!(dest::IJFH, bc::DataLayouts.BroadcastedUnionIJFH, ::ToCUDA)

Launch `knl_copyto!` on `(dest, bc)` with `(Nij, Nij)` threads per block and
`(Nh, 1)` blocks. No kernel is launched unless `Nh > 0`. Returns `dest`.
"""
function Base.copyto!(
    dest::IJFH{S, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
    ::ToCUDA,
) where {S, Nij, Nh}
    # Skip the launch entirely when `Nh` is not positive.
    Nh > 0 || return dest
    kernel_args = (dest, bc)
    auto_launch!(
        knl_copyto!,
        kernel_args;
        threads_s = (Nij, Nij),
        blocks_s = (Nh, 1),
    )
    return dest
end

"""
    copyto!(dest::VIJFH, bc::DataLayouts.BroadcastedUnionVIJFH, ::ToCUDA)

Launch `knl_copyto!` on `(dest, bc)` with a three-dimensional thread block
`(Nij, Nij, v_per_block)` and `(Nh, v_blocks)` blocks. No kernel is launched
unless both `Nv > 0` and `Nh > 0`. Returns `dest`.
"""
function Base.copyto!(
    dest::VIJFH{S, Nv, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
    ::ToCUDA,
) where {S, Nv, Nij, Nh}
    (Nv > 0 && Nh > 0) || return dest
    # `fld(256, Nij * Nij)` bounds the third thread dimension so that
    # `Nij * Nij * v_per_block` does not exceed 256.
    v_per_block = min(Nv, fld(256, Nij * Nij))
    # Enough blocks along the second grid dimension to cover all `Nv` entries.
    v_blocks = cld(Nv, v_per_block)
    kernel_args = (dest, bc)
    auto_launch!(
        knl_copyto!,
        kernel_args;
        threads_s = (Nij, Nij, v_per_block),
        blocks_s = (Nh, v_blocks),
    )
    return dest
end

"""
    copyto!(dest::VF, bc::DataLayouts.BroadcastedUnionVF, ::ToCUDA)

Launch `knl_copyto!` on `(dest, bc)` with `(1, 1)` threads per block and
`(1, Nv)` blocks. No kernel is launched unless `Nv > 0`. Returns `dest`.
"""
function Base.copyto!(
    dest::VF{S, Nv},
    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
    ::ToCUDA,
) where {S, Nv}
    # Skip the launch entirely when `Nv` is not positive.
    Nv > 0 || return dest
    kernel_args = (dest, bc)
    auto_launch!(
        knl_copyto!,
        kernel_args;
        threads_s = (1, 1),
        blocks_s = (1, Nv),
    )
    return dest
end

"""
    copyto!(dest::DataF, bc::DataLayouts.BroadcastedUnionDataF, ::ToCUDA)

Launch `knl_copyto!` on `(dest, bc)` with `(1, 1)` threads per block and
`(1, 1)` blocks, unconditionally. Returns `dest`.
"""
function Base.copyto!(
    dest::DataF{S},
    bc::DataLayouts.BroadcastedUnionDataF{S},
    ::ToCUDA,
) where {S}
    kernel_args = (dest, bc)
    auto_launch!(
        knl_copyto!,
        kernel_args;
        threads_s = (1, 1),
        blocks_s = (1, 1),
    )
    return dest
end

import ClimaCore.DataLayouts: isascalar
"""
    knl_copyto_flat!(dest::AbstractData, bc, us)

Kernel that assigns `dest[I] = bc[I]` for one index per thread, where `I` is
obtained from `kernel_indexes(thread_index(), size(dest))`. Threads whose
`thread_index()` exceeds `get_N(us)` do nothing. Always returns `nothing`.

NOTE(review): `get_N(us)` is presumably the total number of items to process —
confirm against `DataLayouts.UniversalSize`.
"""
function knl_copyto_flat!(dest::AbstractData, bc, us)
    @inbounds begin
        tidx = thread_index()
        # Guard against threads past the end of the data.
        if tidx ≤ get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
            dest[I] = bc[I]
        end
    end
    return nothing
end

"""
    cuda_copyto!(dest::AbstractData, bc)

Copy `bc` into `dest` with a single launch of `knl_copyto!`, and return `dest`.

The thread count is the smaller of `threads_via_occupancy(knl_copyto!, args)`
and `get_N(us)`; `partition(dest, n_max_threads)` then supplies the `threads`
and `blocks` passed to `auto_launch!`. Nothing is launched unless the `Nv` and
`Nh` entries of `DataLayouts.universal_size(dest)` are both positive.
"""
function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
    if Nv > 0 && Nh > 0
        args = (dest, bc, us)
        threads = threads_via_occupancy(knl_copyto!, args)
        # Never request more threads than `get_N(us)`.
        n_max_threads = min(threads, get_N(us))
        p = partition(dest, n_max_threads)
        auto_launch!(
            knl_copyto!,
            args;
            threads_s = p.threads,
            blocks_s = p.blocks,
        )
    end
    return dest
end

# TODO: can we use CUDA's launch configuration for all data layouts?
# Currently, it seems to have a slight performance degradation.
# Each method below forwards `copyto!` with a `ToCUDA` device tag to
# `cuda_copyto!(dest, bc)` for one destination layout type.
# NOTE(review): the commented-out lines duplicate active definitions that
# follow them in this block.
#! format: off
# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
#! format: on
37 changes: 16 additions & 21 deletions ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
@@ -1,32 +1,27 @@
"""
    knl_fill_flat!(dest::AbstractData, val, us)

Kernel that assigns `dest[I] = val` for one index per thread, where `I` is
obtained from `kernel_indexes(thread_index(), size(dest))`. Threads whose
`thread_index()` exceeds `get_N(us)` do nothing. Always returns `nothing`.
"""
function knl_fill_flat!(dest::AbstractData, val, us)
    @inbounds begin
        tidx = thread_index()
        # Guard against threads past the end of the data.
        if tidx ≤ get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
            @inbounds dest[I] = val
        end
    end
    return nothing
end

"""
    knl_fill!(dest, val, us)

Kernel that assigns `dest[I] = val` for the index `I` returned by
`universal_index(dest)`, provided `is_valid_index(dest, I, us)` holds.
Always returns `nothing`.

NOTE(review): `universal_index` and `is_valid_index` are defined outside this
file; presumably they map the calling thread to an index and bounds-check it
against `us` — confirm against their definitions.
"""
function knl_fill!(dest, val, us)
    I = universal_index(dest)
    if is_valid_index(dest, I, us)
        @inbounds dest[I] = val
    end
    return nothing
end

"""
    cuda_fill!(dest::AbstractData, val)

Set every entry of `dest` to `val` with a single launch of `knl_fill!`, and
return `dest`.

The thread count is the smaller of `threads_via_occupancy(knl_fill!, args)`
and `get_N(us)`; `partition(dest, n_max_threads)` then supplies the `threads`
and `blocks` passed to `auto_launch!`. Nothing is launched unless the `Nv` and
`Nh` entries of `DataLayouts.universal_size(dest)` are both positive.
"""
function cuda_fill!(dest::AbstractData, val)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
    if Nv > 0 && Nh > 0
        args = (dest, val, us)
        threads = threads_via_occupancy(knl_fill!, args)
        # Never request more threads than `get_N(us)`.
        n_max_threads = min(threads, get_N(us))
        p = partition(dest, n_max_threads)
        auto_launch!(
            knl_fill!,
            args;
            threads_s = p.threads,
            blocks_s = p.blocks,
        )
    end
    return dest
end

# Each method below forwards `fill!` with a `ToCUDA` device tag to
# `cuda_fill!(dest, val)` for one destination layout type.
#! format: off
Base.fill!(dest::IJFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::IFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::IJF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::IF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::VIFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::VF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
Base.fill!(dest::DataF{<:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
#! format: on
# Generic method: any `AbstractData` destination forwards to `cuda_fill!`.
# NOTE(review): presumably this covers the layout-specific methods above,
# if those layout types are subtypes of `AbstractData` — confirm.
Base.fill!(dest::AbstractData, val, ::ToCUDA) = cuda_fill!(dest, val)
Loading

0 comments on commit 78058a0

Please sign in to comment.