diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl index a3a4be35d1..2be14183e6 100644 --- a/src/DataLayouts/cuda.jl +++ b/src/DataLayouts/cuda.jl @@ -78,7 +78,10 @@ function Base.copyto!( ) where {S, Nij, A <: CUDA.CuArray} _, _, _, _, Nh = size(bc) if Nh > 0 - CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(dest, bc) + CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!( + dest, + bc, + ) end return dest end @@ -92,7 +95,10 @@ function Base.fill!( } _, _, _, _, Nh = size(dest) if Nh > 0 - CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(dest, val) + CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!( + dest, + val, + ) end return dest end @@ -133,14 +139,20 @@ function Base.copyto!( ) where {S, A <: CUDA.CuArray} _, _, _, Nv, Nh = size(bc) if Nv > 0 && Nh > 0 - CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(dest, bc) + CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_copyto!( + dest, + bc, + ) end return dest end function Base.fill!(dest::VF{S, A}, val) where {S, A <: CUDA.CuArray} _, _, _, Nv, Nh = size(dest) if Nv > 0 && Nh > 0 - CUDA.@cuda threads = (1, 1) blocks = (Nh, Nv) knl_fill!(dest, val) + CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_fill!( + dest, + val, + ) end return dest end @@ -149,10 +161,16 @@ function Base.copyto!( dest::DataF{S}, bc::Union{DataF{S, A}, Base.Broadcast.Broadcasted{DataFStyle{A}}}, ) where {S, A <: CUDA.CuArray} - CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_copyto!(dest, bc) + CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_copyto!( + dest, + bc, + ) return dest end function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray} - CUDA.@cuda threads = (1, 1) blocks = (1, 1) knl_fill!(dest, val) + CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_fill!( + dest, + val, + ) return dest end diff --git a/src/Fields/mapreduce_cuda.jl b/src/Fields/mapreduce_cuda.jl index 64a88e13c7..a54b7a53af 100644 --- a/src/Fields/mapreduce_cuda.jl +++ b/src/Fields/mapreduce_cuda.jl @@ -124,7 +124,7 @@ function mapreduce_cuda( reduce_cuda = CuArray{T}(undef, nblocks, Nf) shmemsize = nthreads # place each field on a different block - @cuda threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!( reduce_cuda, f, op, @@ -138,7 +138,7 @@ function mapreduce_cuda( if nblocks > 1 nthreads = min(32, nblocks) shmemsize = nthreads - @cuda threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!( reduce_cuda, op, Val(shmemsize), diff --git a/src/Limiters/quasimonotone.jl b/src/Limiters/quasimonotone.jl index f87fd924e3..c740087ef7 100644 --- a/src/Limiters/quasimonotone.jl +++ b/src/Limiters/quasimonotone.jl @@ -106,7 +106,7 @@ function compute_element_bounds!( (Ni, Nj, _, Nv, Nh) = S nthreads, nblocks = config_threadblock(Nv, Nh) - CUDA.@cuda threads = nthreads blocks = nblocks compute_element_bounds_kernel!( + CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_element_bounds_kernel!( limiter, Fields.field_values(Operators.strip_space(ρq, axes(ρq))), Fields.field_values(Operators.strip_space(ρ, axes(ρ))), @@ -221,7 +221,7 @@ function compute_neighbor_bounds_local!( topology = Spaces.topology(axes(ρ)) Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ)) nthreads, nblocks = config_threadblock(Nv, Nh) - CUDA.@cuda threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!( + CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!( limiter, topology.local_neighbor_elem, topology.local_neighbor_elem_offset, @@ -388,7 +388,7 @@ function apply_limiter!( maxiter = Ni * Nj WJ = Spaces.local_geometry_data(axes(ρq)).WJ nthreads, nblocks = config_threadblock(Nv, Nh) - CUDA.@cuda threads = nthreads blocks = nblocks apply_limiter_kernel!( + CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks apply_limiter_kernel!( limiter, Fields.field_values(Operators.strip_space(ρq, axes(ρq))), Fields.field_values(Operators.strip_space(ρ, axes(ρ))), diff --git a/src/MatrixFields/single_field_solver.jl b/src/MatrixFields/single_field_solver.jl index a0f9f7b2c1..330a742c07 100644 --- a/src/MatrixFields/single_field_solver.jl +++ b/src/MatrixFields/single_field_solver.jl @@ -51,7 +51,7 @@ single_field_solve!(::ClimaComms.AbstractCPUDevice, cache, x, A, b) = function single_field_solve!(::ClimaComms.CUDADevice, cache, x, A, b) Ni, Nj, _, _, Nh = size(Fields.field_values(A)) nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh) - CUDA.@cuda threads = nthreads blocks = nblocks single_field_solve_kernel!( + CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks single_field_solve_kernel!( cache, x, A, diff --git a/src/Operators/integrals.jl b/src/Operators/integrals.jl index 4406b9ef28..364d1a6c63 100644 --- a/src/Operators/integrals.jl +++ b/src/Operators/integrals.jl @@ -23,7 +23,7 @@ function column_integral_definite!( space = axes(∫field) Ni, Nj, _, _, Nh = size(Fields.field_values(∫field)) nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh) - @cuda threads = nthreads blocks = nblocks column_integral_definite_kernel!( + @cuda always_inline = true threads = nthreads blocks = nblocks column_integral_definite_kernel!( strip_space(∫field, space), strip_space(ᶜfield, space), ) @@ -114,7 +114,7 @@ function column_integral_indefinite!( ) Ni, Nj, _, _, Nh = size(Fields.field_values(ᶠ∫field)) nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh) - @cuda threads = nthreads blocks = nblocks column_integral_indefinite_kernel!( + @cuda always_inline = true threads = nthreads blocks = nblocks column_integral_indefinite_kernel!( ᶠ∫field, ᶜfield, ) @@ -295,7 +295,7 @@ function column_mapreduce_device!( else column_mapreduce_kernel! end - @cuda threads = nthreads blocks = nblocks kernel!( + @cuda always_inline = true threads = nthreads blocks = nblocks kernel!( fn, op, # reduced_field, diff --git a/src/Operators/thomas_algorithm.jl b/src/Operators/thomas_algorithm.jl index f5f460c6ff..8047be965f 100644 --- a/src/Operators/thomas_algorithm.jl +++ b/src/Operators/thomas_algorithm.jl @@ -17,7 +17,10 @@ column_thomas_solve!(::ClimaComms.AbstractCPUDevice, A, b) = function column_thomas_solve!(::ClimaComms.CUDADevice, A, b) Ni, Nj, _, _, Nh = size(Fields.field_values(A)) nthreads, nblocks = Topologies._configure_threadblock(Ni * Nj * Nh) - @cuda threads = nthreads blocks = nblocks thomas_algorithm_kernel!(A, b) + @cuda always_inline = true threads = nthreads blocks = nblocks thomas_algorithm_kernel!( + A, + b, + ) end function thomas_algorithm_kernel!( diff --git a/src/Remapping/distributed_remapping.jl b/src/Remapping/distributed_remapping.jl index c399a424a3..72178f8519 100644 --- a/src/Remapping/distributed_remapping.jl +++ b/src/Remapping/distributed_remapping.jl @@ -515,7 +515,7 @@ function _set_interpolated_values!( if ClimaComms.device(field) isa ClimaComms.CUDADevice nblocks, _ = size(interpolation_matrix[1]) nthreads = length(vert_interpolation_weights) - @cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!( out, interpolation_matrix, local_horiz_indices, @@ -559,7 +559,7 @@ function _set_interpolated_values!( if ClimaComms.device(space) isa ClimaComms.CUDADevice nitems = length(out) nthreads, nblocks = Topologies._configure_threadblock(nitems) - @cuda threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (nblocks) set_interpolated_values_kernel!( out, local_horiz_interpolation_weights, local_horiz_indices, diff --git a/src/Remapping/interpolate_array.jl b/src/Remapping/interpolate_array.jl index b5b957af13..990b726801 100644 --- a/src/Remapping/interpolate_array.jl +++ b/src/Remapping/interpolate_array.jl @@ -89,7 +89,7 @@ function interpolate_slab!( nitems = length(output_array) nthreads, nblocks = Topologies._configure_threadblock(nitems) - @cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_kernel!( output_cuarray, field, cuslab_indices, @@ -332,7 +332,7 @@ function interpolate_slab_level!( nitems = length(vidx_ref_coordinates) nthreads, nblocks = Topologies._configure_threadblock(nitems) - @cuda threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!( + @cuda always_inline = true threads = (nthreads) blocks = (nblocks) interpolate_slab_level_kernel!( output_cuarray, field, cuvidx_ref_coordinates, diff --git a/src/Topologies/dss_cuda.jl b/src/Topologies/dss_cuda.jl index ba65ef0ef3..17a6e865e7 100644 --- a/src/Topologies/dss_cuda.jl +++ b/src/Topologies/dss_cuda.jl @@ -21,7 +21,7 @@ function dss_load_perimeter_data!( (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) nitems = nlevels * nperimeter * nfid * nelems nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_load_perimeter_data_kernel!( pperimeter_data, pdata, perimeter, @@ -58,7 +58,7 @@ function dss_unload_perimeter_data!( (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) nitems = nlevels * nperimeter * nfid * nelems nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_unload_perimeter_data_kernel!( pdata, pperimeter_data, perimeter, @@ -98,7 +98,7 @@ function dss_local!( nitems = nlevels * nfid * (nlocalfaces + nlocalvertices) nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_kernel!( pperimeter_data, topology.local_vertices, topology.local_vertex_offset, @@ -184,7 +184,7 @@ function dss_transform!( (nlevels, nperimeter, _, _) = size(pperimeter_data) nitems = nlevels * nperimeter * nlocalelems nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_transform_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_transform_kernel!( pperimeter_data, pdata, p∂ξ∂x, @@ -290,7 +290,7 @@ function dss_untransform!( (nlevels, nperimeter, _, _) = size(pperimeter_data) nitems = nlevels * nperimeter * nlocalelems nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_untransform_kernel!( pperimeter_data, pdata, p∂ξ∂x, @@ -376,7 +376,7 @@ function dss_local_ghost!( max_threads = 256 nitems = nlevels * nfid * nghostvertices nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_local_ghost_kernel!( pperimeter_data, topology.ghost_vertices, topology.ghost_vertex_offset, @@ -431,7 +431,7 @@ function fill_send_buffer!( if nsend > 0 nitems = nsend * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!( send_data, send_buf_idx, pperimeter_data, @@ -473,7 +473,7 @@ function load_from_recv_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer) if nrecv > 0 nitems = nrecv * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!( pperimeter_data, recv_data, recv_buf_idx, @@ -517,7 +517,7 @@ function dss_ghost!( nlevels, _, nfidx, _ = size(pperimeter_data) nitems = nlevels * nfidx * nghostvertices nthreads, nblocks = _configure_threadblock(nitems) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!( + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks) dss_ghost_kernel!( pperimeter_data, topology.ghost_vertices, topology.ghost_vertex_offset,