Skip to content

Commit

Permalink
Reduce use of DataLayouts internals in ClimaCore
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Aug 15, 2024
1 parent c2451ad commit 8628c64
Show file tree
Hide file tree
Showing 9 changed files with 166 additions and 162 deletions.
6 changes: 3 additions & 3 deletions ext/cuda/cuda_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ to benchmark compare against auto-determined threads/blocks (if `auto=false`).
function auto_launch!(
f!::F!,
args,
data;
nitems::Union{Integer, Nothing} = nothing;
auto = false,
threads_s = nothing,
blocks_s = nothing,
always_inline = true,
caller = :unknown,
) where {F!}
if auto
nitems = get_n_items(data)
@assert !isnothing(nitems)
if nitems 0
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
Expand All @@ -64,7 +64,7 @@ function auto_launch!(
# CUDA.registers(kernel) > 50 || return nothing # for debugging
# occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing
if !haskey(reported_stats, key)
nitems = get_n_items(data)
@assert !isnothing(nitems)
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
threads = min(nitems, config.threads)
Expand Down
20 changes: 6 additions & 14 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ function Base.copyto!(
if Nh > 0
auto_launch!(
knl_copyto!,
(dest, bc),
dest;
(dest, bc);
threads_s = (Nij, Nij),
blocks_s = (Nh, 1),
)
Expand All @@ -42,8 +41,7 @@ function Base.copyto!(
Nv_blocks = cld(Nv, Nv_per_block)
auto_launch!(
knl_copyto!,
(dest, bc),
dest;
(dest, bc);
threads_s = (Nij, Nij, Nv_per_block),
blocks_s = (Nh, Nv_blocks),
)
Expand All @@ -59,8 +57,7 @@ function Base.copyto!(
if Nv > 0
auto_launch!(
knl_copyto!,
(dest, bc),
dest;
(dest, bc);
threads_s = (1, 1),
blocks_s = (1, Nv),
)
Expand All @@ -73,13 +70,7 @@ function Base.copyto!(
bc::DataLayouts.BroadcastedUnionDataF{S},
::ToCUDA,
) where {S}
auto_launch!(
knl_copyto!,
(dest, bc),
dest;
threads_s = (1, 1),
blocks_s = (1, 1),
)
auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
return dest
end

Expand All @@ -100,7 +91,8 @@ function cuda_copyto!(dest::AbstractData, bc)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
if Nv > 0 && Nh > 0
auto_launch!(knl_copyto_flat!, (dest, bc, us), dest; auto = true)
nitems = prod(DataLayouts.universal_size(dest))
auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
end
return dest
end
Expand Down
3 changes: 2 additions & 1 deletion ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ function cuda_fill!(dest::AbstractData, val)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
if Nv > 0 && Nh > 0
auto_launch!(knl_fill_flat!, (dest, val, us), dest; auto = true)
nitems = prod(DataLayouts.universal_size(dest))
auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true)
end
return dest
end
Expand Down
12 changes: 4 additions & 8 deletions ext/cuda/data_layouts_fused_copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ function fused_copyto!(
Nv_blocks = cld(Nv, Nv_per_block)
auto_launch!(
knl_fused_copyto!,
(fmbc,),
dest1;
(fmbc,);
threads_s = (Nij, Nij, Nv_per_block),
blocks_s = (Nh, Nv_blocks),
)
Expand All @@ -68,8 +67,7 @@ function fused_copyto!(
if Nh > 0
auto_launch!(
knl_fused_copyto!,
(fmbc,),
dest1;
(fmbc,);
threads_s = (Nij, Nij),
blocks_s = (Nh, 1),
)
Expand All @@ -85,8 +83,7 @@ function fused_copyto!(
if Nv > 0 && Nh > 0
auto_launch!(
knl_fused_copyto!,
(fmbc,),
dest1;
(fmbc,);
threads_s = (1, 1),
blocks_s = (Nh, Nv),
)
Expand All @@ -101,8 +98,7 @@ function fused_copyto!(
) where {S}
auto_launch!(
knl_fused_copyto!,
(fmbc,),
dest1;
(fmbc,);
threads_s = (1, 1),
blocks_s = (1, 1),
)
Expand Down
2 changes: 1 addition & 1 deletion ext/cuda/data_layouts_mapreduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function mapreduce_cuda(
pdata = parent(data)
T = eltype(pdata)
(Ni, Nj, Nk, Nv, Nh) = size(data)
Nf = div(length(pdata), prod(size(data))) # length of field dimension
Nf = DataLayouts.ncomponents(data) # length of field dimension
pwt = parent(weighted_jacobian)

nitems = Nv * Ni * Nj * Nk * Nh
Expand Down
62 changes: 19 additions & 43 deletions ext/cuda/limiters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,39 +21,26 @@ function compute_element_bounds!(
ρ,
::ClimaComms.CUDADevice,
)
S = size(Fields.field_values(ρ))
(Ni, Nj, _, Nv, Nh) = S
(_, _, Nv, _, Nh) = DataLayouts.universal_size(ρ)
nthreads, nblocks = config_threadblock(Nv, Nh)

args = (
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
Nv,
Nh,
Val(Ni),
Val(Nj),
)
auto_launch!(
compute_element_bounds_kernel!,
args,
ρ;
args;
threads_s = nthreads,
blocks_s = nblocks,
)
return nothing
end


function compute_element_bounds_kernel!(
limiter,
ρq,
ρ,
Nv,
Nh,
::Val{Ni},
::Val{Nj},
) where {Ni, Nj}
function compute_element_bounds_kernel!(limiter, ρq, ρ)
(Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(ρ)
n = (Nv, Nh)
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
Expand Down Expand Up @@ -88,21 +75,18 @@ function compute_neighbor_bounds_local!(
::ClimaComms.CUDADevice,
)
topology = Spaces.topology(axes(ρ))
Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
us = DataLayouts.UniversalSize(ρ)
(Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us)
nthreads, nblocks = config_threadblock(Nv, Nh)
args = (
limiter,
topology.local_neighbor_elem,
topology.local_neighbor_elem_offset,
Nv,
Nh,
Val(Ni),
Val(Nj),
us,
)
auto_launch!(
compute_neighbor_bounds_local_kernel!,
args,
ρ;
args;
threads_s = nthreads,
blocks_s = nblocks,
)
Expand All @@ -112,12 +96,9 @@ function compute_neighbor_bounds_local_kernel!(
limiter,
local_neighbor_elem,
local_neighbor_elem_offset,
Nv,
Nh,
::Val{Ni},
::Val{Nj},
) where {Ni, Nj}

us::DataLayouts.UniversalSize,
)
(Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us)
n = (Nv, Nh)
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
Expand Down Expand Up @@ -147,27 +128,24 @@ function apply_limiter!(
::ClimaComms.CUDADevice,
)
ρq_data = Fields.field_values(ρq)
(Ni, Nj, _, Nv, Nh) = size(ρq_data)
Nf = DataLayouts.ncomponents(ρq_data)
us = DataLayouts.UniversalSize(ρq_data)
(Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us)
maxiter = Ni * Nj
Nf = DataLayouts.ncomponents(ρq_data)
WJ = Spaces.local_geometry_data(axes(ρq)).WJ
nthreads, nblocks = config_threadblock(Nv, Nh)
args = (
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
WJ,
Nv,
Nh,
us,
Val(Nf),
Val(Ni),
Val(Nj),
Val(maxiter),
)
auto_launch!(
apply_limiter_kernel!,
args,
ρ;
args;
threads_s = nthreads,
blocks_s = nblocks,
)
Expand All @@ -179,15 +157,13 @@ function apply_limiter_kernel!(
ρq_data,
ρ_data,
WJ_data,
Nv,
Nh,
us::DataLayouts.UniversalSize,
::Val{Nf},
::Val{Ni},
::Val{Nj},
::Val{maxiter},
) where {Nf, Ni, Nj, maxiter}
) where {Nf, maxiter}
(; q_bounds_nbr, rtol) = limiter
converged = true
(Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us)
n = (Nv, Nh)
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
Expand Down
Loading

0 comments on commit 8628c64

Please sign in to comment.