diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl index 1d21a53304..6e84d4ef9e 100644 --- a/ext/cuda/cuda_utils.jl +++ b/ext/cuda/cuda_utils.jl @@ -97,38 +97,44 @@ function auto_launch!( end """ - kernel_indexes(n) + thread_index() + +Return the threadindex: +``` +(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x +``` +""" +@inline thread_index() = + (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x + +""" + kernel_indexes(tidx, n) Return a tuple of indexes from the kernel, -where `n` is a tuple of max lengths along each +where `tidx` is the cuda thread index and +`n` is a tuple of max lengths along each dimension of the accessed data. """ -function kernel_indexes(n::Tuple) - tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x - inds = if 1 ≤ tidx ≤ prod(n) - CartesianIndices(map(x -> Base.OneTo(x), n))[tidx].I - else - ntuple(x -> -1, length(n)) - end - return inds -end +Base.@propagate_inbounds kernel_indexes(tidx, n::Tuple) = + CartesianIndices(map(x -> Base.OneTo(x), n))[tidx] """ - valid_range(inds, n) + valid_range(tidx, n::Int) + Returns a `Bool` indicating if the thread index -is in the valid range, based on `inds` (the result -of `kernel_indexes`) and `n`, a tuple of max lengths -along each dimension of the accessed data. +(`tidx`) is in the valid range, based on `n`, a +tuple of max lengths along each dimension of the + +accessed data. ```julia function kernel!(data, n) - inds = kernel_indexes(n) - if valid_range(inds, n) - do_work!(data[inds...]) + @inbounds begin + tidx = thread_index() + if valid_range(tidx, n) + I = kernel_indexes(tidx, n) + do_work!(data[I]) + end end end ``` """ -valid_range(inds::NTuple, n::Tuple) = all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n)) -function valid_range(n::Tuple) - inds = kernel_indexes(n) - return all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n)) -end +@inline valid_range(tidx, n) = 1 ≤ tidx ≤ n diff --git a/ext/cuda/fill.jl b/ext/cuda/fill.jl index 86e8a023e3..658d31226b 100644 --- a/ext/cuda/fill.jl +++ b/ext/cuda/fill.jl @@ -1,11 +1,11 @@ -cartesian_index(::AbstractData, inds) = CartesianIndex(inds) - function knl_fill_flat!(dest::AbstractData, val) - n = DataLayouts.universal_size(dest) - inds = kernel_indexes(n) - if valid_range(inds, n) - I = cartesian_index(dest, inds) - @inbounds dest[I] = val + @inbounds begin + tidx = thread_index() + n = DataLayouts.universal_size(dest) + if valid_range(tidx, prod(n)) + I = kernel_indexes(tidx, n) + @inbounds dest[I] = val + end end return nothing end diff --git a/ext/cuda/limiters.jl b/ext/cuda/limiters.jl index 4856f36c62..bf2c80e901 100644 --- a/ext/cuda/limiters.jl +++ b/ext/cuda/limiters.jl @@ -54,9 +54,9 @@ function compute_element_bounds_kernel!( ::Val{Nj}, ) where {Ni, Nj} n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I (; q_bounds) = limiter local q_min, q_max slab_ρq = slab(ρq, v, h) @@ -118,9 +118,9 @@ function compute_neighbor_bounds_local_kernel!( ) where {Ni, Nj} n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I (; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter slab_q_bounds = slab(q_bounds, v, h) q_min = slab_q_bounds[1] @@ -188,9 +188,9 @@ function apply_limiter_kernel!( (; q_bounds_nbr, rtol) = limiter converged = true n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I slab_ρ = slab(ρ_data, v, h) slab_ρq = slab(ρq_data, v, h) diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl index b22f233441..afd5c4adb8 100644 --- a/ext/cuda/matrix_fields_multiple_field_solve.jl +++ b/ext/cuda/matrix_fields_multiple_field_solve.jl @@ -85,10 +85,10 @@ function multiple_field_solve_kernel!( ) where {Nnames} @inbounds begin Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) - tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x - if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames)) - (i, j, h, iname) = - CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I + tidx = thread_index() + n = (Ni, Nj, Nh, Nnames) + if valid_range(tidx, prod(n)) + (i, j, h, iname) = kernel_indexes(tidx, n).I generated_single_field_solve!( device, caches, diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl index 784715f02d..758ccd5725 100644 --- a/src/DataLayouts/DataLayouts.jl +++ b/src/DataLayouts/DataLayouts.jl @@ -46,6 +46,21 @@ abstract type AbstractData{S} end Base.size(data::AbstractData, i::Integer) = size(data)[i] +""" + (Ni, Nj, Nf, Nv, Nh) = universal_size(data::AbstractData) + +Returns dimensions in a universal +format for all data layouts: + - `Ni` number of spectral element nodal degrees of freedom in first horizontal direction + - `Nj` number of spectral element nodal degrees of freedom in second horizontal direction + - `Nf` number of field components + - `Nv` number of vertical degrees of freedom + - `Nh` number of horizontal elements + +Note: this is similar to `Base.size`, except + that `universal_size` does not return 1 + for the number of field components. +""" function universal_size(data::AbstractData) s = size(data) return (s[1], s[2], ncomponents(data), s[4], s[5]) diff --git a/test/Spaces/distributed_cuda/ddss3.jl b/test/Spaces/distributed_cuda/ddss3.jl index b76089b267..2df6ef12a0 100644 --- a/test/Spaces/distributed_cuda/ddss3.jl +++ b/test/Spaces/distributed_cuda/ddss3.jl @@ -147,5 +147,5 @@ partition numbers end #! format: on p = @allocated Spaces.weighted_dss!(y0, dss_buffer) - iamroot && @test p ≤ 10816 + iamroot && @test p ≤ 11072 end