diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl
index 167696e93d..8952a6b24a 100644
--- a/ext/ClimaCoreCUDAExt.jl
+++ b/ext/ClimaCoreCUDAExt.jl
@@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
 import ClimaCore.RecursiveApply:
     ⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
 import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
+import ClimaCore.DataLayouts: UniversalSize
 
 include(joinpath("cuda", "cuda_utils.jl"))
 include(joinpath("cuda", "data_layouts.jl"))
diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl
index 15ee90ce34..8c7d87a3bb 100644
--- a/ext/cuda/cuda_utils.jl
+++ b/ext/cuda/cuda_utils.jl
@@ -90,6 +90,12 @@ function auto_launch!(
     return nothing
 end
 
+function threads_via_occupancy(f!::F!, args) where {F!}
+    kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
+    config = CUDA.launch_configuration(kernel.fun)
+    return config.threads
+end
+
 """
     thread_index()
diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl
index 20cc9d7178..6af88f897d 100644
--- a/ext/cuda/data_layouts.jl
+++ b/ext/cuda/data_layouts.jl
@@ -29,6 +29,7 @@ include("data_layouts_fill.jl")
 include("data_layouts_copyto.jl")
 include("data_layouts_fused_copyto.jl")
 include("data_layouts_mapreduce.jl")
+include("data_layouts_threadblock.jl")
 
 adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
 adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
index 82cf46d88c..013892e602 100644
--- a/ext/cuda/data_layouts_copyto.jl
+++ b/ext/cuda/data_layouts_copyto.jl
@@ -1,111 +1,38 @@
 DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()
 
-function knl_copyto!(dest, src)
-
-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-
-    if v <= size(dest, 4)
-        I = CartesianIndex((i, j, 1, v, h))
+function knl_copyto!(dest, src, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
         @inbounds dest[I] = src[I]
     end
     return nothing
 end
 
-function Base.copyto!(
-    dest::IJFH{S, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nij, Nh}
-    if Nh > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VIJFH{S, Nv, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VF{S, Nv},
-    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    if Nv > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (1, 1),
-            blocks_s = (1, Nv),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::DataF{S},
-    bc::DataLayouts.BroadcastedUnionDataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
-    return dest
-end
-
-import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            dest[I] = bc[I]
-        end
-    end
-    return nothing
-end
-
 function cuda_copyto!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_copyto!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end
 
-# TODO: can we use CUDA's luanch configuration for all data layouts?
-# Currently, it seems to have a slight performance degradation.
 #! format: off
-# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
 #! format: on
diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
index cac5bdf526..b38d8e7ff5 100644
--- a/ext/cuda/data_layouts_fill.jl
+++ b/ext/cuda/data_layouts_fill.jl
@@ -1,32 +1,27 @@
-function knl_fill_flat!(dest::AbstractData, val, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            @inbounds dest[I] = val
-        end
+function knl_fill!(dest, val, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
+        @inbounds dest[I] = val
     end
     return nothing
 end
 
-function cuda_fill!(dest::AbstractData, val)
+function cuda_fill!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_fill!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_fill!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end
 
-#! format: off
-Base.fill!(dest::IJFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IJF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::DataF{<:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-#! format: on
+Base.fill!(dest::AbstractData, val, ::ToCUDA) = cuda_fill!(dest, val)
diff --git a/ext/cuda/data_layouts_fused_copyto.jl b/ext/cuda/data_layouts_fused_copyto.jl
index 0b1d1126d1..a39a0f9430 100644
--- a/ext/cuda/data_layouts_fused_copyto.jl
+++ b/ext/cuda/data_layouts_fused_copyto.jl
@@ -1,106 +1,59 @@
 Base.@propagate_inbounds function rcopyto_at!(
     pair::Pair{<:AbstractData, <:Any},
     I,
-    v,
+    us,
 )
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         dest[I] = isascalar(bc) ? bc[] : bc[I]
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v)
+Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         bcI = isascalar(bc) ? bc[] : bc[I]
         dest[] = bcI
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v)
-    rcopyto_at!(first(pairs), I, v)
-    rcopyto_at!(Base.tail(pairs), I, v)
+Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
+    rcopyto_at!(first(pairs), I, us)
+    rcopyto_at!(Base.tail(pairs), I, us)
 end
-Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, v) =
-    rcopyto_at!(first(pairs), I, v)
-@inline rcopyto_at!(pairs::Tuple{}, I, v) = nothing
-
-function knl_fused_copyto!(fmbc::FusedMultiBroadcast)
+Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
+    rcopyto_at!(first(pairs), I, us)
+@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing
+function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
     @inbounds begin
-        i = CUDA.threadIdx().x
-        j = CUDA.threadIdx().y
-
-        h = CUDA.blockIdx().x
-        v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-        (; pairs) = fmbc
-        I = CartesianIndex((i, j, 1, v, h))
-        rcopyto_at!(pairs, I, v)
+        I = universal_index(dest1)
+        if is_valid_index(dest1, I, us)
+            (; pairs) = fmbc
+            rcopyto_at!(pairs, I, us)
+        end
     end
     return nothing
 end
 
 function fused_copyto!(
     fmbc::FusedMultiBroadcast,
-    dest1::VIJFH{S, Nv, Nij, Nh},
+    dest1::DataLayouts.AbstractData,
     ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return nothing
-end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::IJFH{S, Nij},
-    ::ToCUDA,
-) where {S, Nij}
-    _, _, _, _, Nh = size(dest1)
-    if Nh > 0
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return nothing
-end
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::VF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    _, _, _, _, Nh = size(dest1)
+)
+    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest1)
     if Nv > 0 && Nh > 0
+        us = DataLayouts.UniversalSize(dest1)
+        args = (fmbc, dest1, us)
+        threads = threads_via_occupancy(knl_fused_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest1, n_max_threads)
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (1, 1),
-            blocks_s = (Nh, Nv),
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
         )
     end
     return nothing
 end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::DataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(
-        knl_fused_copyto!,
-        (fmbc,);
-        threads_s = (1, 1),
-        blocks_s = (1, 1),
-    )
-    return nothing
-end
diff --git a/ext/cuda/data_layouts_threadblock.jl b/ext/cuda/data_layouts_threadblock.jl
new file mode 100644
index 0000000000..ff37ac278f
--- /dev/null
+++ b/ext/cuda/data_layouts_threadblock.jl
@@ -0,0 +1,212 @@
+const CI5 = CartesianIndex{5}
+
+maximum_allowable_threads() = (
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
+    CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z),
+)
+
+"""
+    partition(::AbstractData, n_max_threads)
+
+Given `n_max_threads`, which should be determined from CUDA's occupancy API,
+`partition` returns a NamedTuple containing:
+
+ - `threads`: the thread partition to pass to CUDA
+ - `blocks`: the block partition to pass to CUDA
+
+Each method follows two criteria, which have produced good results in practice:
+
+ - maximize the number of (allowable) threads in the thread partition;
+ - order the thread partition to follow the fastest-changing indices in the
+   data layout (e.g., `VIJ` in `VIJFH`).
+"""
+function partition end
+
+"""
+    universal_index(::AbstractData)
+
+Returns a universal Cartesian index, computed from CUDA's `threadIdx`,
+`blockIdx`, and `blockDim`.
+"""
+function universal_index end
+
+"""
+    is_valid_index(::AbstractData, I::CartesianIndex, us::UniversalSize)
+
+Checks the minimal set of index bounds needed to ensure that the result of
+`universal_index` is a valid index for the given data layout.
+"""
+function is_valid_index end
+
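+# Example (illustrative only): for a `VIJFH` layout with `Nij = 4`, `Nv = 63`,
+# `Nh = 30` and, say, an occupancy-derived `n_max_threads = 256`, `partition`
+# packs `Nv_thread = fld(256, 4 * 4) = 16` levels per block, giving
+# `threads = (16, 4, 4)` and `blocks = (cld(63, 16), 30) = (4, 30)`. In the
+# kernel, `universal_index` rebuilds `CartesianIndex((i, j, 1, v, h))` from
+# `threadIdx`/`blockIdx`, and `is_valid_index` discards threads with `v > Nv`.
+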
+""" +function is_valid_index end + +##### VIJFH +@inline function partition(data::DataLayouts.VIJFH, n_max_threads::Integer) + (Nij, _, _, Nv, Nh) = DataLayouts.universal_size(data) + Nv_thread = min(Int(fld(n_max_threads, Nij * Nij)), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert prod((Nv_thread, Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nv_thread, Nij, Nij))),$n_max_threads)" + return (; threads = (Nv_thread, Nij, Nij), blocks = (Nv_blocks, Nh)) +end +@inline function universal_index(::DataLayouts.VIJFH) + (tv, i, j) = CUDA.threadIdx() + (bv, h) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((i, j, 1, v, h)) +end +@inline is_valid_index(::DataLayouts.VIJFH, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### IJFH +@inline function partition(data::DataLayouts.IJFH, n_max_threads::Integer) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(data) + Nh_thread = min( + Int(fld(n_max_threads, Nij * Nij)), + Nh, + maximum_allowable_threads()[3], + ) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij))),$n_max_threads)" + return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function universal_index(::DataLayouts.IJFH) + (i, j, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().z + return CartesianIndex((i, j, 1, 1, h)) +end +@inline is_valid_index(::DataLayouts.IJFH, I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### IFH +@inline function partition(data::DataLayouts.IFH, n_max_threads::Integer) + (Ni, _, _, _, Nh) = DataLayouts.universal_size(data) + Nh_thread = min(Int(fld(n_max_threads, Ni)), Nh) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Ni, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nh_thread))),$n_max_threads)" + return (; threads = (Ni, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function universal_index(::DataLayouts.IFH) + (i, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().y + return CartesianIndex((i, 1, 1, 1, h)) +end +@inline is_valid_index(::DataLayouts.IFH, I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### IJF +@inline function partition(data::DataLayouts.IJF, n_max_threads::Integer) + (Nij, _, _, _, _) = DataLayouts.universal_size(data) + @assert prod((Nij, Nij)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij))),$n_max_threads)" + return (; threads = (Nij, Nij), blocks = (1,)) +end +@inline function universal_index(::DataLayouts.IJF) + (i, j) = CUDA.threadIdx() + return CartesianIndex((i, j, 1, 1, 1)) +end +@inline is_valid_index(::DataLayouts.IJF, I::CI5, us::UniversalSize) = true + +##### IF +@inline function partition(data::DataLayouts.IF, n_max_threads::Integer) + (Ni, _, _, _, _) = DataLayouts.universal_size(data) + @assert Ni ≤ n_max_threads "threads,n_max_threads=($(Ni),$n_max_threads)" + return (; threads = (Ni,), blocks = (1,)) +end +@inline function universal_index(::DataLayouts.IF) + (i,) = CUDA.threadIdx() + return CartesianIndex((i, 1, 1, 1, 1)) +end +@inline is_valid_index(::DataLayouts.IF, I::CI5, us::UniversalSize) = true + +##### VIFH +@inline function partition(data::DataLayouts.VIFH, n_max_threads::Integer) + (Ni, _, _, Nv, Nh) = DataLayouts.universal_size(data) + Nv_thread = min(Int(fld(n_max_threads, Ni)), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert prod((Nv_thread, Ni)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nv_thread, 
Ni))),$n_max_threads)" + return (; threads = (Nv_thread, Ni), blocks = (Nv_blocks, Nh)) +end +@inline function universal_index(::DataLayouts.VIFH) + (tv, i) = CUDA.threadIdx() + (bv, h) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((i, 1, 1, v, h)) +end +@inline is_valid_index(::DataLayouts.VIFH, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### VF +@inline function partition(data::DataLayouts.VF, n_max_threads::Integer) + (_, _, _, Nv, _) = DataLayouts.universal_size(data) + Nvt = fld(n_max_threads, Nv) + Nv_thread = Nvt == 0 ? n_max_threads : min(Int(Nvt), Nv) + Nv_blocks = cld(Nv, Nv_thread) + @assert Nv_thread ≤ n_max_threads "threads,n_max_threads=($(Nv_thread),$n_max_threads)" + (; threads = (Nv_thread,), blocks = (Nv_blocks,)) +end +@inline function universal_index(::DataLayouts.VF) + (tv,) = CUDA.threadIdx() + (bv,) = CUDA.blockIdx() + v = tv + (bv - 1) * CUDA.blockDim().x + return CartesianIndex((1, 1, 1, v, 1)) +end +@inline is_valid_index(::DataLayouts.VF, I::CI5, us::UniversalSize) = + 1 ≤ I[4] ≤ DataLayouts.get_Nv(us) + +##### DataF +@inline partition(data::DataLayouts.DataF, n_max_threads::Integer) = + (; threads = 1, blocks = 1) +@inline universal_index(::DataLayouts.DataF) = CartesianIndex((1, 1, 1, 1, 1)) +@inline is_valid_index(::DataLayouts.DataF, I::CI5, us::UniversalSize) = true + +##### +##### Custom partitions +##### + +##### Column-wise +@inline function columnwise_partition( + us::DataLayouts.UniversalSize, + n_max_threads::Integer, +) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) + Nh_thread = min(Int(fld(n_max_threads, Nij * Nij)), Nh) + Nh_blocks = cld(Nh, Nh_thread) + @assert prod((Nij, Nij, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nh_thread))),$n_max_threads)" + return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,)) +end +@inline function columnwise_universal_index() + (i, j, th) = CUDA.threadIdx() + (bh,) = CUDA.blockIdx() + h = th + (bh - 1) * CUDA.blockDim().z + return CartesianIndex((i, j, 1, 1, h)) +end +@inline columnwise_is_valid_index(I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) + +##### Element-wise (e.g., limiters) +# TODO + +##### Multiple-field solve partition +@inline function multiple_field_solve_partition( + us::DataLayouts.UniversalSize, + n_max_threads::Integer; + Nnames, +) + (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) + @assert prod((Nij, Nij, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nnames))),$n_max_threads)" + return (; threads = (Nij, Nij, Nnames), blocks = (Nh,)) +end +@inline function multiple_field_solve_universal_index() + (i, j, iname) = CUDA.threadIdx() + (h,) = CUDA.blockIdx() + return (CartesianIndex((i, j, 1, 1, h)), iname) +end +@inline multiple_field_solve_is_valid_index(I::CI5, us::UniversalSize) = + 1 ≤ I[5] ≤ DataLayouts.get_Nh(us) diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl index d4150da5bc..57a7c5c0cb 100644 --- a/ext/cuda/matrix_fields_multiple_field_solve.jl +++ b/ext/cuda/matrix_fields_multiple_field_solve.jl @@ -18,10 +18,9 @@ NVTX.@annotate function multiple_field_solve!( b, x1, ) - Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) names = MatrixFields.matrix_row_keys(keys(A)) Nnames = length(names) - nthreads, nblocks = _configure_threadblock(Ni * Nj * Nh * Nnames) + Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) sscache = Operators.strip_space(cache) ssx = Operators.strip_space(x) ssA = 
@@ -36,11 +35,17 @@ NVTX.@annotate function multiple_field_solve!(
 
         args = (device, caches, xs, As, bs, x1, Val(Nnames))
 
+        us = UniversalSize(Fields.field_values(x1))
+        nitems = Ni * Nj * Nh * Nnames
+        threads = threads_via_occupancy(multiple_field_solve_kernel!, args)
+        n_max_threads = min(threads, nitems)
+        p = multiple_field_solve_partition(us, n_max_threads; Nnames)
+
         auto_launch!(
             multiple_field_solve_kernel!,
             args;
-            threads_s = nthreads,
-            blocks_s = nblocks,
+            threads_s = p.threads,
+            blocks_s = p.blocks,
             always_inline = true,
         )
     end
@@ -83,11 +88,10 @@ function multiple_field_solve_kernel!(
     ::Val{Nnames},
 ) where {Nnames}
     @inbounds begin
-        Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
-        tidx = thread_index()
-        n = (Ni, Nj, Nh, Nnames)
-        if valid_range(tidx, prod(n))
-            (i, j, h, iname) = kernel_indexes(tidx, n).I
+        us = UniversalSize(Fields.field_values(x1))
+        (I, iname) = multiple_field_solve_universal_index()
+        if multiple_field_solve_is_valid_index(I, us)
+            (i, j, _, _, h) = I.I
             generated_single_field_solve!(
                 device,
                 caches,
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index 67e520a823..59c9e1424a 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -15,24 +15,24 @@ import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 
 function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
     Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    us = UniversalSize(Fields.field_values(A))
+    args = (device, cache, x, A, b, us)
+    threads = threads_via_occupancy(single_field_solve_kernel!, args)
     nitems = Ni * Nj * Nh
-    nthreads = min(256, nitems)
-    nblocks = cld(nitems, nthreads)
-    args = (device, cache, x, A, b)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
     auto_launch!(
         single_field_solve_kernel!,
        args;
-        threads_s = nthreads,
-        blocks_s = nblocks,
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
 end
 
-function single_field_solve_kernel!(device, cache, x, A, b)
-    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    if idx <= Ni * Nj * Nh
-        (i, j, h) = CartesianIndices((1:Ni, 1:Nj, 1:Nh))[idx].I
-
+function single_field_solve_kernel!(device, cache, x, A, b, us)
+    I = columnwise_universal_index()
+    if columnwise_is_valid_index(I, us)
+        (i, j, _, _, h) = I.I
         _single_field_solve!(
             device,
             Spaces.column(cache, i, j, h),
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
index da23bc01ea..c93b4e0797 100644
--- a/ext/cuda/operators_finite_difference.jl
+++ b/ext/cuda/operators_finite_difference.jl
@@ -18,27 +18,21 @@ function Base.copyto!(
     },
 )
     space = axes(out)
-    if space isa Spaces.ExtrudedFiniteDifferenceSpace
-        QS = Spaces.quadrature_style(space)
-        Nq = Quadratures.degrees_of_freedom(QS)
-        Nh = Topologies.nlocalelems(Spaces.topology(space))
-    else
-        Nq = 1
-        Nh = 1
-    end
-    (li, lw, rw, ri) = bounds = Operators.window_bounds(space, bc)
-    Nv = ri - li + 1
-    max_threads = 256
-    us = DataLayouts.UniversalSize(Fields.field_values(out))
-    nitems = Nv * Nq * Nq * Nh # # of independent items
-    (nthreads, nblocks) = _configure_threadblock(max_threads, nitems)
+    bounds = Operators.window_bounds(space, bc)
+    out_fv = Fields.field_values(out)
+    us = DataLayouts.UniversalSize(out_fv)
     args =
         (strip_space(out, space), strip_space(bc, space), axes(out), bounds, us)
+
+    threads = threads_via_occupancy(copyto_stencil_kernel!, args)
+    n_max_threads = min(threads, get_N(us))
+    p = partition(out_fv, n_max_threads)
+
     auto_launch!(
         copyto_stencil_kernel!,
         args;
-        threads_s = (nthreads,),
-        blocks_s = (nblocks,),
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
     return out
 end
@@ -46,13 +40,11 @@ import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
 
 function copyto_stencil_kernel!(out, bc, space, bds, us)
     @inbounds begin
-        gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-        if gid ≤ get_N(us)
+        out_fv = Fields.field_values(out)
+        I = universal_index(out_fv)
+        if is_valid_index(out_fv, I, us)
             (li, lw, rw, ri) = bds
-            Nv = get_Nv(us)
-            Nq = get_Nij(us)
-            Nh = get_Nh(us)
-            (v, i, j, h) = cart_ind((Nv, Nq, Nq, Nh), gid).I
+            (i, j, _, v, h) = I.I
             hidx = (i, j, h)
             idx = v - 1 + li
             if idx < lw
diff --git a/ext/cuda/operators_integral.jl b/ext/cuda/operators_integral.jl
index cf5e5d2ac8..ef116daa0d 100644
--- a/ext/cuda/operators_integral.jl
+++ b/ext/cuda/operators_integral.jl
@@ -19,7 +19,6 @@ function column_reduce_device!(
     space,
 ) where {F, T}
     Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-    threads_s, blocks_s = _configure_threadblock(Ni * Nj * Nh)
     args = (
         single_column_reduce!,
         f,
@@ -29,7 +28,17 @@ function column_reduce_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
+    us = UniversalSize(Fields.field_values(output))
+    nitems = Ni * Nj * Nh
+    threads = threads_via_occupancy(bycolumn_kernel!, args)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
+    auto_launch!(
+        bycolumn_kernel!,
+        args;
+        threads_s = p.threads,
+        blocks_s = p.blocks,
+    )
 end
 
 function column_accumulate_device!(
@@ -41,8 +50,7 @@ function column_accumulate_device!(
     init,
     space,
 ) where {F, T}
-    Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-    threads_s, blocks_s = _configure_threadblock(Ni * Nj * Nh)
+    us = UniversalSize(Fields.field_values(output))
     args = (
         single_column_accumulate!,
         f,
@@ -52,7 +60,17 @@ function column_accumulate_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(output))
+    nitems = Ni * Nj * Nh
+    threads = threads_via_occupancy(bycolumn_kernel!, args)
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
+    auto_launch!(
+        bycolumn_kernel!,
+        args;
+        threads_s = p.threads,
+        blocks_s = p.blocks,
+    )
 end
 
 bycolumn_kernel!(
@@ -67,10 +85,10 @@ bycolumn_kernel!(
     if space isa Spaces.FiniteDifferenceSpace
         single_column_function!(f, transform, output, input, init, space)
     else
-        idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-        Ni, Nj, _, _, Nh = size(Fields.field_values(output))
-        if idx <= Ni * Nj * Nh
-            i, j, h = cart_ind((Ni, Nj, Nh), idx).I
+        I = columnwise_universal_index()
+        us = UniversalSize(Fields.field_values(output))
+        if columnwise_is_valid_index(I, us)
+            (i, j, _, _, h) = I.I
             single_column_function!(
                 f,
                 transform,
diff --git a/ext/cuda/operators_thomas_algorithm.jl b/ext/cuda/operators_thomas_algorithm.jl
index 83546518ab..11eea95fed 100644
--- a/ext/cuda/operators_thomas_algorithm.jl
+++ b/ext/cuda/operators_thomas_algorithm.jl
@@ -5,14 +5,18 @@ import ClimaCore.Operators:
 import CUDA
 using CUDA: @cuda
 function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    nthreads, nblocks = _configure_threadblock(Ni * Nj * Nh)
+    us = UniversalSize(Fields.field_values(A))
     args = (A, b)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    threads = threads_via_occupancy(thomas_algorithm_kernel!, args)
+    nitems = Ni * Nj * Nh
+    n_max_threads = min(threads, nitems)
+    p = columnwise_partition(us, n_max_threads)
     auto_launch!(
         thomas_algorithm_kernel!,
         args;
-        threads_s = nthreads,
-        blocks_s = nblocks,
+        threads_s = p.threads,
+        blocks_s = p.blocks,
     )
 end
@@ -20,10 +24,10 @@ function thomas_algorithm_kernel!(
     A::Fields.ExtrudedFiniteDifferenceField,
     b::Fields.ExtrudedFiniteDifferenceField,
 )
-    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
-    if idx <= Ni * Nj * Nh
-        i, j, h = cart_ind((Ni, Nj, Nh), idx).I
+    I = columnwise_universal_index()
+    us = UniversalSize(Fields.field_values(A))
+    if columnwise_is_valid_index(I, us)
+        (i, j, _, _, h) = I.I
         thomas_algorithm!(Spaces.column(A, i, j, h), Spaces.column(b, i, j, h))
     end
     return nothing
diff --git a/test/Fields/benchmark_field_multi_broadcast_fusion.jl b/test/Fields/benchmark_field_multi_broadcast_fusion.jl
index 8d0aefeceb..e30f00f4ca 100644
--- a/test/Fields/benchmark_field_multi_broadcast_fusion.jl
+++ b/test/Fields/benchmark_field_multi_broadcast_fusion.jl
@@ -11,8 +11,9 @@ include("utils_field_multi_broadcast_fusion.jl")
     device = ClimaComms.device()
     space = TU.CenterExtrudedFiniteDifferenceSpace(
         FT;
-        zelem = 3,
-        helem = 4,
+        zelem = 63,
+        helem = 30,
+        Nq = 4,
         context = ClimaComms.context(device),
     )
     X = Fields.FieldVector(
@@ -43,8 +44,9 @@ end
     if device isa ClimaComms.CPUSingleThreaded
         space = CenterExtrudedFiniteDifferenceSpaceLineHSpace(
             FT;
-            zelem = 3,
-            helem = 4,
+            zelem = 63,
+            helem = 30,
+            Nq = 4,
             context = ClimaComms.context(device),
         )
         X = Fields.FieldVector(
@@ -101,7 +103,7 @@ end
     device = ClimaComms.device()
     colspace = TU.ColumnCenterFiniteDifferenceSpace(
         FT;
-        zelem = 3,
+        zelem = 63,
        context = ClimaComms.context(device),
     )
     VF_data() = Fields.Field(FT, colspace)
diff --git a/test/Operators/integrals.jl b/test/Operators/integrals.jl
index 6d7137e0ae..76701c378c 100644
--- a/test/Operators/integrals.jl
+++ b/test/Operators/integrals.jl
@@ -26,7 +26,7 @@ test_allocs(allocs) =
     if ClimaComms.device() isa ClimaComms.AbstractCPUDevice
         @test allocs == 0
     else
-        @test allocs ≤ 5000 # GPU always has ~2 kB of non-deterministic allocs.
+        @test allocs ≤ 5008 # GPU always has ~2 kB of non-deterministic allocs.
     end
 
 function test_column_integral_definite!(center_space)
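
The pieces above compose into a single launch pattern, sketched here for illustration only (not part of the patch; `knl!` stands for any kernel with the same shape as `knl_copyto!`, and `dest`/`bc` are an `AbstractData` destination and a broadcasted source):

    us = DataLayouts.UniversalSize(dest)
    args = (dest, bc, us)
    threads = threads_via_occupancy(knl!, args)   # occupancy-based thread budget
    n_max_threads = min(threads, get_N(us))       # never request more threads than items
    p = partition(dest, n_max_threads)            # layout-specific thread/block shape
    auto_launch!(knl!, args; threads_s = p.threads, blocks_s = p.blocks)

    # Inside the kernel, each thread recovers its universal index and bounds-checks it:
    function knl!(dest, bc, us)
        I = universal_index(dest)
        if is_valid_index(dest, I, us)
            @inbounds dest[I] = bc[I]
        end
        return nothing
    end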