diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl index d9ee184d78..c3f3041049 100644 --- a/ext/cuda/cuda_utils.jl +++ b/ext/cuda/cuda_utils.jl @@ -37,7 +37,7 @@ to benchmark compare against auto-determined threads/blocks (if `auto=false`). function auto_launch!( f!::F!, args, - data; + nitems::Union{Integer, Nothing} = nothing; auto = false, threads_s = nothing, blocks_s = nothing, @@ -45,7 +45,7 @@ function auto_launch!( caller = :unknown, ) where {F!} if auto - nitems = get_n_items(data) + @assert !isnothing(nitems) if nitems ≥ 0 kernel = CUDA.@cuda always_inline = true launch = false f!(args...) config = CUDA.launch_configuration(kernel.fun) @@ -64,7 +64,7 @@ function auto_launch!( # CUDA.registers(kernel) > 50 || return nothing # for debugging # occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing if !haskey(reported_stats, key) - nitems = get_n_items(data) + @assert !isnothing(nitems) kernel = CUDA.@cuda always_inline = true launch = false f!(args...) config = CUDA.launch_configuration(kernel.fun) threads = min(nitems, config.threads) diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl index d32b6aee54..82cf46d88c 100644 --- a/ext/cuda/data_layouts_copyto.jl +++ b/ext/cuda/data_layouts_copyto.jl @@ -23,8 +23,7 @@ function Base.copyto!( if Nh > 0 auto_launch!( knl_copyto!, - (dest, bc), - dest; + (dest, bc); threads_s = (Nij, Nij), blocks_s = (Nh, 1), ) @@ -42,8 +41,7 @@ function Base.copyto!( Nv_blocks = cld(Nv, Nv_per_block) auto_launch!( knl_copyto!, - (dest, bc), - dest; + (dest, bc); threads_s = (Nij, Nij, Nv_per_block), blocks_s = (Nh, Nv_blocks), ) @@ -59,8 +57,7 @@ function Base.copyto!( if Nv > 0 auto_launch!( knl_copyto!, - (dest, bc), - dest; + (dest, bc); threads_s = (1, 1), blocks_s = (1, Nv), ) @@ -73,13 +70,7 @@ function Base.copyto!( bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA, ) where {S} - auto_launch!( - knl_copyto!, - (dest, bc), - dest; - threads_s = (1, 1), - blocks_s = (1, 1), - ) + auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1)) return dest end @@ -100,7 +91,8 @@ function cuda_copyto!(dest::AbstractData, bc) (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest) us = DataLayouts.UniversalSize(dest) if Nv > 0 && Nh > 0 - auto_launch!(knl_copyto_flat!, (dest, bc, us), dest; auto = true) + nitems = prod(DataLayouts.universal_size(dest)) + auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true) end return dest end diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl index 087d5f2a84..cac5bdf526 100644 --- a/ext/cuda/data_layouts_fill.jl +++ b/ext/cuda/data_layouts_fill.jl @@ -14,7 +14,8 @@ function cuda_fill!(dest::AbstractData, val) (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest) us = DataLayouts.UniversalSize(dest) if Nv > 0 && Nh > 0 - auto_launch!(knl_fill_flat!, (dest, val, us), dest; auto = true) + nitems = prod(DataLayouts.universal_size(dest)) + auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true) end return dest end diff --git a/ext/cuda/data_layouts_fused_copyto.jl b/ext/cuda/data_layouts_fused_copyto.jl index a566e69a5f..0b1d1126d1 100644 --- a/ext/cuda/data_layouts_fused_copyto.jl +++ b/ext/cuda/data_layouts_fused_copyto.jl @@ -50,8 +50,7 @@ function fused_copyto!( Nv_blocks = cld(Nv, Nv_per_block) auto_launch!( knl_fused_copyto!, - (fmbc,), - dest1; + (fmbc,); threads_s = (Nij, Nij, Nv_per_block), blocks_s = (Nh, Nv_blocks), ) @@ -68,8 +67,7 @@ function fused_copyto!( if Nh > 0 auto_launch!( knl_fused_copyto!, - (fmbc,), - dest1; + (fmbc,); threads_s = (Nij, Nij), blocks_s = (Nh, 1), ) @@ -85,8 +83,7 @@ function fused_copyto!( if Nv > 0 && Nh > 0 auto_launch!( knl_fused_copyto!, - (fmbc,), - dest1; + (fmbc,); threads_s = (1, 1), blocks_s = (Nh, Nv), ) @@ -101,8 +98,7 @@ function fused_copyto!( ) where {S} auto_launch!( knl_fused_copyto!, - (fmbc,), - dest1; + (fmbc,); threads_s = (1, 1), blocks_s = (1, 1), ) diff --git a/ext/cuda/data_layouts_mapreduce.jl b/ext/cuda/data_layouts_mapreduce.jl index 5d63cf365d..18435eade6 100644 --- a/ext/cuda/data_layouts_mapreduce.jl +++ b/ext/cuda/data_layouts_mapreduce.jl @@ -28,7 +28,7 @@ function mapreduce_cuda( pdata = parent(data) T = eltype(pdata) (Ni, Nj, Nk, Nv, Nh) = size(data) - Nf = div(length(pdata), prod(size(data))) # length of field dimension + Nf = DataLayouts.ncomponents(data) # length of field dimension pwt = parent(weighted_jacobian) nitems = Nv * Ni * Nj * Nk * Nh diff --git a/ext/cuda/limiters.jl b/ext/cuda/limiters.jl index 7511c279f4..264d985507 100644 --- a/ext/cuda/limiters.jl +++ b/ext/cuda/limiters.jl @@ -21,23 +21,17 @@ function compute_element_bounds!( ρ, ::ClimaComms.CUDADevice, ) - S = size(Fields.field_values(ρ)) - (Ni, Nj, _, Nv, Nh) = S + (_, _, Nv, _, Nh) = DataLayouts.universal_size(ρ) nthreads, nblocks = config_threadblock(Nv, Nh) args = ( limiter, Fields.field_values(Operators.strip_space(ρq, axes(ρq))), Fields.field_values(Operators.strip_space(ρ, axes(ρ))), - Nv, - Nh, - Val(Ni), - Val(Nj), ) auto_launch!( compute_element_bounds_kernel!, - args, - ρ; + args; threads_s = nthreads, blocks_s = nblocks, ) @@ -45,15 +39,8 @@ function compute_element_bounds!( end -function compute_element_bounds_kernel!( - limiter, - ρq, - ρ, - Nv, - Nh, - ::Val{Ni}, - ::Val{Nj}, -) where {Ni, Nj} +function compute_element_bounds_kernel!(limiter, ρq, ρ) + (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(ρ) n = (Nv, Nh) tidx = thread_index() @inbounds if valid_range(tidx, prod(n)) @@ -88,21 +75,18 @@ function compute_neighbor_bounds_local!( ::ClimaComms.CUDADevice, ) topology = Spaces.topology(axes(ρ)) - Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ)) + us = DataLayouts.UniversalSize(ρ) + (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us) nthreads, nblocks = config_threadblock(Nv, Nh) args = ( limiter, topology.local_neighbor_elem, topology.local_neighbor_elem_offset, - Nv, - Nh, - Val(Ni), - Val(Nj), + us, ) auto_launch!( compute_neighbor_bounds_local_kernel!, - args, - ρ; + args; threads_s = nthreads, blocks_s = nblocks, ) @@ -112,12 +96,9 @@ function compute_neighbor_bounds_local_kernel!( limiter, local_neighbor_elem, local_neighbor_elem_offset, - Nv, - Nh, - ::Val{Ni}, - ::Val{Nj}, -) where {Ni, Nj} - + us::DataLayouts.UniversalSize, +) + (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us) n = (Nv, Nh) tidx = thread_index() @inbounds if valid_range(tidx, prod(n)) @@ -147,9 +128,10 @@ function apply_limiter!( ::ClimaComms.CUDADevice, ) ρq_data = Fields.field_values(ρq) - (Ni, Nj, _, Nv, Nh) = size(ρq_data) - Nf = DataLayouts.ncomponents(ρq_data) + us = DataLayouts.UniversalSize(ρq_data) + (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us) maxiter = Ni * Nj + Nf = DataLayouts.ncomponents(ρq_data) WJ = Spaces.local_geometry_data(axes(ρq)).WJ nthreads, nblocks = config_threadblock(Nv, Nh) args = ( @@ -157,17 +139,13 @@ function apply_limiter!( Fields.field_values(Operators.strip_space(ρq, axes(ρq))), Fields.field_values(Operators.strip_space(ρ, axes(ρ))), WJ, - Nv, - Nh, + us, Val(Nf), - Val(Ni), - Val(Nj), Val(maxiter), ) auto_launch!( apply_limiter_kernel!, - args, - ρ; + args; threads_s = nthreads, blocks_s = nblocks, ) @@ -179,15 +157,13 @@ function apply_limiter_kernel!( ρq_data, ρ_data, WJ_data, - Nv, - Nh, + us::DataLayouts.UniversalSize, ::Val{Nf}, - ::Val{Ni}, - ::Val{Nj}, ::Val{maxiter}, -) where {Nf, Ni, Nj, maxiter} +) where {Nf, maxiter} (; q_bounds_nbr, rtol) = limiter converged = true + (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(us) n = (Nv, Nh) tidx = thread_index() @inbounds if valid_range(tidx, prod(n)) diff --git a/ext/cuda/topologies_dss.jl b/ext/cuda/topologies_dss.jl index ac4a08b722..7973e49d1a 100644 --- a/ext/cuda/topologies_dss.jl +++ b/ext/cuda/topologies_dss.jl @@ -20,16 +20,13 @@ function Topologies.dss_load_perimeter_data!( data::Union{DataLayouts.IJFH, DataLayouts.VIJFH}, perimeter::Topologies.Perimeter2D, ) - pperimeter_data = parent(dss_buffer.perimeter_data) - pdata = parent(data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) - nitems = nlevels * nperimeter * nfid * nelems + (; perimeter_data) = dss_buffer + nitems = prod(DataLayouts.array_size(perimeter_data)) nthreads, nblocks = _configure_threadblock(nitems) - args = (pperimeter_data, pdata, perimeter) + args = (perimeter_data, data, perimeter) auto_launch!( dss_load_perimeter_data_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -37,13 +34,15 @@ function Topologies.dss_load_perimeter_data!( end function dss_load_perimeter_data_kernel!( - pperimeter_data::AbstractArray{FT, 4}, - pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, + perimeter_data::DataLayouts.AbstractData, + data::Union{DataLayouts.IJFH, DataLayouts.VIJFH}, perimeter::Topologies.Perimeter2D{Nq}, -) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, _, nfidx, nelems) = sizep = size(pperimeter_data) # size of perimeter data array +) where {Nq} + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, _, nfidx, nelems) = sizep = DataLayouts.array_size(perimeter_data) # size of perimeter data array sized = (nlevels, Nq, Nq, nfidx, nelems) # size of data + pperimeter_data = parent(perimeter_data) + pdata = parent(data) if gidx ≤ prod(sizep) (level, p, fidx, elem) = cart_ind(sizep, gidx).I @@ -60,16 +59,13 @@ function Topologies.dss_unload_perimeter_data!( dss_buffer::Topologies.DSSBuffer, perimeter, ) - pperimeter_data = parent(dss_buffer.perimeter_data) - pdata = parent(data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) - nitems = nlevels * nperimeter * nfid * nelems + (; perimeter_data) = dss_buffer + nitems = prod(DataLayouts.array_size(perimeter_data)) nthreads, nblocks = _configure_threadblock(nitems) - args = (pdata, pperimeter_data, perimeter) + args = (data, perimeter_data, perimeter) auto_launch!( dss_unload_perimeter_data_kernel!, - args, - pdata; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -77,13 +73,16 @@ function Topologies.dss_unload_perimeter_data!( end function dss_unload_perimeter_data_kernel!( - pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, - pperimeter_data::AbstractArray{FT, 4}, + data::Union{DataLayouts.IJFH, DataLayouts.VIJFH}, + perimeter_data::AbstractData, perimeter::Topologies.Perimeter2D{Nq}, -) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, nperimeter, nfidx, nelems) = sizep = size(pperimeter_data) # size of perimeter data array +) where {Nq} + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, nperimeter, nfidx, nelems) = + sizep = DataLayouts.array_size(perimeter_data) # size of perimeter data array sized = (nlevels, Nq, Nq, nfidx, nelems) # size of data + pperimeter_data = parent(perimeter_data) + pdata = parent(data) if gidx ≤ prod(sizep) (level, p, fidx, elem) = cart_ind(sizep, gidx).I @@ -103,13 +102,13 @@ function Topologies.dss_local!( nlocalvertices = length(topology.local_vertex_offset) - 1 nlocalfaces = length(topology.interior_faces) if (nlocalvertices + nlocalfaces) > 0 - pperimeter_data = parent(perimeter_data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) + (nlevels, nperimeter, nfid, nelems) = + DataLayouts.array_size(perimeter_data) nitems = nlevels * nfid * (nlocalfaces + nlocalvertices) nthreads, nblocks = _configure_threadblock(nitems) args = ( - pperimeter_data, + perimeter_data, topology.local_vertices, topology.local_vertex_offset, topology.interior_faces, @@ -117,8 +116,7 @@ function Topologies.dss_local!( ) auto_launch!( dss_local_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -127,16 +125,19 @@ function Topologies.dss_local!( end function dss_local_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::DataLayouts.VIFH, local_vertices::AbstractVector{Tuple{Int, Int}}, local_vertex_offset::AbstractVector{Int}, interior_faces::AbstractVector{Tuple{Int, Int, Int, Int, Bool}}, perimeter::Topologies.Perimeter2D{Nq}, -) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x +) where {Nq} + FT = eltype(parent(perimeter_data)) + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x nlocalvertices = length(local_vertex_offset) - 1 nlocalfaces = length(interior_faces) - (nlevels, nperimeter, nfidx, _) = size(pperimeter_data) + pperimeter_data = parent(perimeter_data) + FT = eltype(pperimeter_data) + (nlevels, nperimeter, nfidx, _) = DataLayouts.array_size(perimeter_data) if gidx ≤ nlevels * nfidx * nlocalvertices # local vertices sizev = (nlevels, nfidx, nlocalvertices) (level, fidx, vertexid) = cart_ind(sizev, gidx).I @@ -198,17 +199,16 @@ function Topologies.dss_transform!( pweight = parent(weight) p∂x∂ξ = parent(∂x∂ξ) p∂ξ∂x = parent(∂ξ∂x) - pperimeter_data = parent(perimeter_data) nmetric = cld(length(p∂ξ∂x), prod(size(∂ξ∂x))) (nlevels, nperimeter, _, _) = size(pperimeter_data) nitems = nlevels * nperimeter * nlocalelems nthreads, nblocks = _configure_threadblock(nitems) args = ( - pperimeter_data, + perimeter_data, pdata, p∂ξ∂x, p∂x∂ξ, - nmetric, + Val(nmetric), pweight, perimeter, scalarfidx, @@ -217,11 +217,11 @@ function Topologies.dss_transform!( covariant123fidx, contravariant123fidx, localelems, + Val(nlocalelems), ) auto_launch!( dss_transform_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -230,11 +230,11 @@ function Topologies.dss_transform!( end function dss_transform_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::DataLayouts.VIFH, pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, p∂ξ∂x::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, p∂x∂ξ::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, - nmetric::Int, + ::Val{nmetric}, pweight::AbstractArray{FT, 4}, perimeter::Topologies.Perimeter2D{Nq}, scalarfidx::AbstractVector{Int}, @@ -243,10 +243,11 @@ function dss_transform_kernel!( covariant123fidx::AbstractVector{Int}, contravariant123fidx::AbstractVector{Int}, localelems::AbstractVector{Int}, -) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) - nlocalelems = length(localelems) + ::Val{nlocalelems}, +) where {FT <: AbstractFloat, Nq, nmetric, nlocalelems} + pperimeter_data = parent(perimeter_data) + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, nperimeter, nfid, nelems) = DataLayouts.array_size(perimeter_data) if gidx ≤ nlevels * nperimeter * nlocalelems sizet = (nlevels, nperimeter, nlocalelems) sizet_data = (nlevels, Nq, Nq, nfid, nelems) @@ -398,7 +399,7 @@ function Topologies.dss_untransform!( nitems = nlevels * nperimeter * nlocalelems nthreads, nblocks = _configure_threadblock(nitems) args = ( - pperimeter_data, + perimeter_data, pdata, p∂ξ∂x, p∂x∂ξ, @@ -410,11 +411,11 @@ function Topologies.dss_untransform!( covariant123fidx, contravariant123fidx, localelems, + Val(nlocalelems), ) auto_launch!( dss_untransform_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -423,7 +424,7 @@ function Topologies.dss_untransform!( end function dss_untransform_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::DataLayouts.VIFH, pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, p∂ξ∂x::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, p∂x∂ξ::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}}, @@ -435,10 +436,10 @@ function dss_untransform_kernel!( covariant123fidx::AbstractVector{Int}, contravariant123fidx::AbstractVector{Int}, localelems::AbstractVector{Int}, -) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) - nlocalelems = length(localelems) + ::Val{nlocalelems}, +) where {FT <: AbstractFloat, Nq, nlocalelems} + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, nperimeter, nfid, nelems) = DataLayouts.array_size(perimeter_data) if gidx ≤ nlevels * nperimeter * nlocalelems sizet = (nlevels, nperimeter, nlocalelems) sizet_data = (nlevels, Nq, Nq, nfid, nelems) @@ -533,21 +534,20 @@ function Topologies.dss_local_ghost!( ) nghostvertices = length(topology.ghost_vertex_offset) - 1 if nghostvertices > 0 - pperimeter_data = parent(perimeter_data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) + (nlevels, nperimeter, nfid, nelems) = + DataLayouts.array_size(perimeter_data) max_threads = 256 nitems = nlevels * nfid * nghostvertices nthreads, nblocks = _configure_threadblock(nitems) args = ( - pperimeter_data, + perimeter_data, topology.ghost_vertices, topology.ghost_vertex_offset, perimeter, ) auto_launch!( dss_local_ghost_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -556,13 +556,14 @@ function Topologies.dss_local_ghost!( end function dss_local_ghost_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::DataLayouts.VIFH, ghost_vertices, ghost_vertex_offset, perimeter::Topologies.Perimeter2D{Nq}, ) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, nperimeter, nfidx, _) = size(pperimeter_data) + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + pperimeter_data = parent(perimeter_data) + (nlevels, nperimeter, nfidx, _) = DataLayouts.array_size(perimeter_data) nghostvertices = length(ghost_vertex_offset) - 1 if gidx ≤ nlevels * nfidx * nghostvertices sizev = (nlevels, nfidx, nghostvertices) @@ -594,17 +595,15 @@ function Topologies.fill_send_buffer!( synchronize = true, ) (; perimeter_data, send_buf_idx, send_data) = dss_buffer - pperimeter_data = parent(perimeter_data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) + (nlevels, nperimeter, nfid, nelems) = DataLayouts.array_size(perimeter_data) nsend = size(send_buf_idx, 1) if nsend > 0 nitems = nsend * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - args = (send_data, send_buf_idx, pperimeter_data) + args = (send_data, send_buf_idx, perimeter_data, Val(nsend)) auto_launch!( fill_send_buffer_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -618,11 +617,12 @@ end function fill_send_buffer_kernel!( send_data::AbstractArray{FT, 1}, send_buf_idx::AbstractArray{I, 2}, - pperimeter_data::AbstractArray{FT, 4}, -) where {FT <: AbstractFloat, I <: Int} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - (nlevels, _, nfid, nelems) = size(pperimeter_data) - nsend = size(send_buf_idx, 1) + perimeter_data::AbstractData, + ::Val{nsend}, +) where {FT <: AbstractFloat, I <: Int, nsend} + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, _, nfid, nelems) = DataLayouts.array_size(perimeter_data) + pperimeter_data = parent(perimeter_data) #sizet = (nsend, nlevels, nfid) sizet = (nlevels, nfid, nsend) #if gidx ≤ nsend * nlevels * nfid @@ -642,17 +642,15 @@ function Topologies.load_from_recv_buffer!( dss_buffer::Topologies.DSSBuffer, ) (; perimeter_data, recv_buf_idx, recv_data) = dss_buffer - pperimeter_data = parent(perimeter_data) - (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data) + (nlevels, nperimeter, nfid, nelems) = DataLayouts.array_size(perimeter_data) nrecv = size(recv_buf_idx, 1) if nrecv > 0 nitems = nrecv * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - args = (pperimeter_data, recv_data, recv_buf_idx) + args = (perimeter_data, recv_data, recv_buf_idx, Val(nrecv)) auto_launch!( load_from_recv_buffer_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -661,13 +659,14 @@ function Topologies.load_from_recv_buffer!( end function load_from_recv_buffer_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::AbstractData, recv_data::AbstractArray{FT, 1}, recv_buf_idx::AbstractArray{I, 2}, -) where {FT <: AbstractFloat, I <: Int} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - nlevels, _, nfid, nelems = size(pperimeter_data) - nrecv = size(recv_buf_idx, 1) + ::Val{nrecv}, +) where {FT <: AbstractFloat, I <: Int, nrecv} + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + pperimeter_data = parent(perimeter_data) + (nlevels, _, nfid, nelems) = DataLayouts.array_size(perimeter_data) #sizet = (nrecv, nlevels, nfid) sizet = (nlevels, nfid, nrecv) #if gidx ≤ nrecv * nlevels * nfid @@ -691,8 +690,7 @@ function Topologies.dss_ghost!( ) nghostvertices = length(topology.ghost_vertex_offset) - 1 if nghostvertices > 0 - pperimeter_data = parent(perimeter_data) - nlevels, _, nfidx, _ = size(pperimeter_data) + (nlevels, _, nfidx, _) = DataLayouts.array_size(perimeter_data) nitems = nlevels * nfidx * nghostvertices nthreads, nblocks = _configure_threadblock(nitems) args = ( @@ -704,8 +702,7 @@ function Topologies.dss_ghost!( ) auto_launch!( dss_ghost_kernel!, - args, - pperimeter_data; + args; threads_s = (nthreads), blocks_s = (nblocks), ) @@ -714,14 +711,15 @@ function Topologies.dss_ghost!( end function dss_ghost_kernel!( - pperimeter_data::AbstractArray{FT, 4}, + perimeter_data::AbstractData, ghost_vertices, ghost_vertex_offset, repr_ghost_vertex, perimeter::Topologies.Perimeter2D{Nq}, ) where {FT <: AbstractFloat, Nq} - gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x - nlevels, _, nfidx, _ = size(pperimeter_data) + pperimeter_data = parent(perimeter_data) + gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + (nlevels, _, nfidx, _) = DataLayouts.array_size(perimeter_data) nghostvertices = length(ghost_vertex_offset) - 1 if gidx ≤ nlevels * nfidx * nghostvertices diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl index aefeae837e..95fbab5bbe 100644 --- a/src/DataLayouts/DataLayouts.jl +++ b/src/DataLayouts/DataLayouts.jl @@ -1276,6 +1276,49 @@ type parameters. @inline union_all(::Type{<:IH1JH2}) = IH1JH2 @inline union_all(::Type{<:IV1JH2}) = IV1JH2 +""" + array_size(data::AbstractData) + array_size(::Type{<:AbstractData}) + +This is an internal function, please do not use outside of ClimaCore. + +Returns the size of the backing array, with the field dimension set to 1 + +This function is helpful for writing generic +code, when reconstructing new datalayouts with new +type parameters. +""" +@inline array_size(::IJKFVH{S, Nij, Nk, Nv, Nh}) where {S, Nij, Nk, Nv, Nh} = (Nij, Nij, Nk, 1, Nv, Nh) +@inline array_size(::IJFH{S, Nij, Nh}) where {S, Nij, Nh} = (Nij, Nij, 1, Nh) +@inline array_size(::IFH{S, Ni, Nh}) where {S, Ni, Nh} = (Ni, 1, Nh) +@inline array_size(::DataF{S}) where {S} = (1,) +@inline array_size(::IJF{S, Nij}) where {S, Nij} = (Nij, Nij, 1) +@inline array_size(::IF{S, Ni}) where {S, Ni} = (Ni, 1) +@inline array_size(::VF{S, Nv}) where {S, Nv} = (Nv, 1) +@inline array_size(::VIJFH{S, Nv, Nij, Nh}) where {S, Nv, Nij, Nh} = (Nv, Nij, Nij, 1, Nh) +@inline array_size(::VIFH{S, Nv, Ni, Nh}) where {S, Nv, Ni, Nh} = (Nv, Ni, 1, Nh) + +""" + farray_size(data::AbstractData) + +This is an internal function, please do not use outside of ClimaCore. + +Returns the size of the backing array, including the field dimension + +This function is helpful for writing generic +code, when reconstructing new datalayouts with new +type parameters. +""" +@inline farray_size(data::IJKFVH{S, Nij, Nk, Nv, Nh}) where {S, Nij, Nk, Nv, Nh} = (Nij, Nij, Nk, ncomponents(data), Nv, Nh) +@inline farray_size(data::IJFH{S, Nij, Nh}) where {S, Nij, Nh} = (Nij, Nij, ncomponents(data), Nh) +@inline farray_size(data::IFH{S, Ni, Nh}) where {S, Ni, Nh} = (Ni, ncomponents(data), Nh) +@inline farray_size(data::DataF{S}) where {S} = (ncomponents(data),) +@inline farray_size(data::IJF{S, Nij}) where {S, Nij} = (Nij, Nij, ncomponents(data)) +@inline farray_size(data::IF{S, Ni}) where {S, Ni} = (Ni, ncomponents(data)) +@inline farray_size(data::VF{S, Nv}) where {S, Nv} = (Nv, ncomponents(data)) +@inline farray_size(data::VIJFH{S, Nv, Nij, Nh}) where {S, Nv, Nij, Nh} = (Nv, Nij, Nij, ncomponents(data), Nh) +@inline farray_size(data::VIFH{S, Nv, Ni, Nh}) where {S, Nv, Ni, Nh} = (Nv, Ni, ncomponents(data), Nh) + @inline slab_index(i, j) = CartesianIndex(i, j, 1, 1, 1) @inline slab_index(i) = CartesianIndex(i, 1, 1, 1, 1) @inline vindex(v) = CartesianIndex(1, 1, 1, v, 1) diff --git a/src/Topologies/dss.jl b/src/Topologies/dss.jl index 4432ffc0b6..31056ed87f 100644 --- a/src/Topologies/dss.jl +++ b/src/Topologies/dss.jl @@ -66,9 +66,7 @@ function create_dss_buffer( convert_to_array = DA isa Array ? false : true (_, _, _, Nv, Nh) = Base.size(data) Np = length(perimeter) - Nf = - length(parent(data)) == 0 ? 0 : - cld(length(parent(data)), (Nij * Nij * Nv * Nh)) + Nf = DataLayouts.ncomponents(data) nfacedof = Nij - 2 T = eltype(parent(data)) TS = _transformed_type(data, local_geometry, local_weights, DA) # extract transformed type @@ -941,7 +939,7 @@ function fill_send_buffer!( ) (; perimeter_data, send_buf_idx, send_data) = dss_buffer (Np, _, _, Nv, nelems) = size(perimeter_data) - Nf = cld(length(parent(perimeter_data)), (Nv * Np * nelems)) + Nf = DataLayouts.ncomponents(perimeter_data) pdata = parent(perimeter_data) nsend = size(send_buf_idx, 1) ctr = 1 @@ -970,7 +968,7 @@ function load_from_recv_buffer!( ) (; perimeter_data, recv_buf_idx, recv_data) = dss_buffer (Np, _, _, Nv, nelems) = size(perimeter_data) - Nf = cld(length(parent(perimeter_data)), (Nv * Np * nelems)) + Nf = DataLayouts.ncomponents(perimeter_data) pdata = parent(perimeter_data) nrecv = size(recv_buf_idx, 1) ctr = 1