Skip to content

Commit

Permalink
Always inline
Browse files: browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Mar 19, 2024
1 parent: f6fc6de; commit: 57469f6
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 29 deletions.
30 changes: 24 additions & 6 deletions src/DataLayouts/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ function Base.copyto!(
) where {S, Nij, A <: CUDA.CuArray}
_, _, _, _, Nh = size(bc)
if Nh > 0
CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(dest, bc)
CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(
dest,
bc,
)
end
return dest
end
Expand All @@ -92,7 +95,10 @@ function Base.fill!(
}
_, _, _, _, Nh = size(dest)
if Nh > 0
CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(dest, val)
CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(
dest,
val,
)
end
return dest
end
Expand Down Expand Up @@ -133,14 +139,20 @@ function Base.copyto!(
) where {S, A <: CUDA.CuArray}
_, _, _, Nv, Nh = size(bc)
if Nv > 0 && Nh > 0
CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(dest, bc)
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(
dest,
bc,
)
end
return dest
end
function Base.fill!(dest::VF{S, A}, val) where {S, A <: CUDA.CuArray}
_, _, _, Nv, Nh = size(dest)
if Nv > 0 && Nh > 0
CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_fill!(dest, val)
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_fill!(
dest,
val,
)
end
return dest
end
Expand All @@ -149,10 +161,16 @@ function Base.copyto!(
dest::DataF{S},
bc::Union{DataF{S, A}, Base.Broadcast.Broadcasted{DataFStyle{A}}},
) where {S, A <: CUDA.CuArray}
CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_copyto!(dest, bc)
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_copyto!(
dest,
bc,
)
return dest
end
function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray}
CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_fill!(dest, val)
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_fill!(
dest,
val,
)
return dest
end
4 changes: 2 additions & 2 deletions src/Fields/mapreduce_cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ function mapreduce_cuda(
reduce_cuda = CuArray{T}(undef, nblocks, Nf)
shmemsize = nthreads
# place each field on a different block
@cuda threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!(
reduce_cuda,
f,
op,
Expand All @@ -138,7 +138,7 @@ function mapreduce_cuda(
if nblocks > 1
nthreads = min(32, nblocks)
shmemsize = nthreads
@cuda threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!(
reduce_cuda,
op,
Val(shmemsize),
Expand Down
6 changes: 3 additions & 3 deletions src/Limiters/quasimonotone.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ function compute_element_bounds!(
(Ni, Nj, _, Nv, Nh) = S
nthreads, nblocks = config_threadblock(Nv, Nh)

CUDA.@cuda threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
Expand Down Expand Up @@ -221,7 +221,7 @@ function compute_neighbor_bounds_local!(
topology = Spaces.topology(axes(ρ))
Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
nthreads, nblocks = config_threadblock(Nv, Nh)
CUDA.@cuda threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
limiter,
topology.local_neighbor_elem,
topology.local_neighbor_elem_offset,
Expand Down Expand Up @@ -388,7 +388,7 @@ function apply_limiter!(
maxiter = Ni * Nj
WJ = Spaces.local_geometry_data(axes(ρq)).WJ
nthreads, nblocks = config_threadblock(Nv, Nh)
CUDA.@cuda threads = nthreads blocks = nblocks apply_limiter_kernel!(
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks apply_limiter_kernel!(
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
Expand Down
2 changes: 1 addition & 1 deletion src/MatrixFields/single_field_solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ single_field_solve!(::ClimaComms.AbstractCPUDevice, cache, x, A, b) =
function single_field_solve!(::ClimaComms.CUDADevice, cache, x, A, b)
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
CUDA.@cuda threads = nthreads blocks = nblocks single_field_solve_kernel!(
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks single_field_solve_kernel!(
cache,
x,
A,
Expand Down
6 changes: 3 additions & 3 deletions src/Operators/integrals.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ function column_integral_definite!(
space = axes(∫field)
Ni, Nj, _, _, Nh = size(Fields.field_values(∫field))
nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
@cuda threads = nthreads blocks = nblocks column_integral_definite_kernel!(
@cuda always_inline = true threads = nthreads blocks = nblocks column_integral_definite_kernel!(
strip_space(∫field, space),
strip_space(ᶜfield, space),
)
Expand Down Expand Up @@ -114,7 +114,7 @@ function column_integral_indefinite!(
)
Ni, Nj, _, _, Nh = size(Fields.field_values(ᶠ∫field))
nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
@cuda threads = nthreads blocks = nblocks column_integral_indefinite_kernel!(
@cuda always_inline = true threads = nthreads blocks = nblocks column_integral_indefinite_kernel!(
ᶠ∫field,
ᶜfield,
)
Expand Down Expand Up @@ -295,7 +295,7 @@ function column_mapreduce_device!(
else
column_mapreduce_kernel!
end
@cuda threads = nthreads blocks = nblocks kernel!(
@cuda always_inline = true threads = nthreads blocks = nblocks kernel!(
fn,
op,
# reduced_field,
Expand Down
5 changes: 4 additions & 1 deletion src/Operators/thomas_algorithm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ column_thomas_solve!(::ClimaComms.AbstractCPUDevice, A, b) =
function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh)
@cuda threads = nthreads blocks = nblocks thomas_algorithm_kernel!(A, b)
@cuda always_inline = true threads = nthreads blocks = nblocks thomas_algorithm_kernel!(
A,
b,
)
end

function thomas_algorithm_kernel!(
Expand Down
4 changes: 2 additions & 2 deletions src/Remapping/distributed_remapping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ function _set_interpolated_values!(
if ClimaComms.device(field) isa ClimaComms.CUDADevice
nblocks, _ = size(interpolation_matrix[1])
nthreads = length(vert_interpolation_weights)
@cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
out,
interpolation_matrix,
local_horiz_indices,
Expand Down Expand Up @@ -559,7 +559,7 @@ function _set_interpolated_values!(
if ClimaComms.device(space) isa ClimaComms.CUDADevice
nitems = length(out)
nthreads, nblocks = Topologies._configure_threadblock(nitems)
@cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!(
out,
local_horiz_interpolation_weights,
local_horiz_indices,
Expand Down
4 changes: 2 additions & 2 deletions src/Remapping/interpolate_array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ function interpolate_slab!(
nitems = length(output_array)
nthreads, nblocks = Topologies._configure_threadblock(nitems)

@cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!(
output_cuarray,
field,
cuslab_indices,
Expand Down Expand Up @@ -332,7 +332,7 @@ function interpolate_slab_level!(

nitems = length(vidx_ref_coordinates)
nthreads, nblocks = Topologies._configure_threadblock(nitems)
@cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!(
@cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!(
output_cuarray,
field,
cuvidx_ref_coordinates,
Expand Down
18 changes: 9 additions & 9 deletions src/Topologies/dss_cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ function dss_load_perimeter_data!(
(nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
nitems = nlevels * nperimeter * nfid * nelems
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!(
pperimeter_data,
pdata,
perimeter,
Expand Down Expand Up @@ -58,7 +58,7 @@ function dss_unload_perimeter_data!(
(nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
nitems = nlevels * nperimeter * nfid * nelems
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!(
pdata,
pperimeter_data,
perimeter,
Expand Down Expand Up @@ -98,7 +98,7 @@ function dss_local!(

nitems = nlevels * nfid * (nlocalfaces + nlocalvertices)
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_kernel!(
pperimeter_data,
topology.local_vertices,
topology.local_vertex_offset,
Expand Down Expand Up @@ -184,7 +184,7 @@ function dss_transform!(
(nlevels, nperimeter, _, _) = size(pperimeter_data)
nitems = nlevels * nperimeter * nlocalelems
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_transform_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_transform_kernel!(
pperimeter_data,
pdata,
p∂ξ∂x,
Expand Down Expand Up @@ -290,7 +290,7 @@ function dss_untransform!(
(nlevels, nperimeter, _, _) = size(pperimeter_data)
nitems = nlevels * nperimeter * nlocalelems
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!(
pperimeter_data,
pdata,
p∂ξ∂x,
Expand Down Expand Up @@ -376,7 +376,7 @@ function dss_local_ghost!(
max_threads = 256
nitems = nlevels * nfid * nghostvertices
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!(
pperimeter_data,
topology.ghost_vertices,
topology.ghost_vertex_offset,
Expand Down Expand Up @@ -431,7 +431,7 @@ function fill_send_buffer!(
if nsend > 0
nitems = nsend * nlevels * nfid
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!(
send_data,
send_buf_idx,
pperimeter_data,
Expand Down Expand Up @@ -473,7 +473,7 @@ function load_from_recv_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer)
if nrecv > 0
nitems = nrecv * nlevels * nfid
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!(
pperimeter_data,
recv_data,
recv_buf_idx,
Expand Down Expand Up @@ -517,7 +517,7 @@ function dss_ghost!(
nlevels, _, nfidx, _ = size(pperimeter_data)
nitems = nlevels * nfidx * nghostvertices
nthreads, nblocks = _configure_threadblock(nitems)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!(
CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!(
pperimeter_data,
topology.ghost_vertices,
topology.ghost_vertex_offset,
Expand Down

0 comments on commit 57469f6

Please sign in to comment.