From 0854a06435c0bcee565277749db754361d32a51d Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Fri, 21 Jun 2024 16:56:05 -0400 Subject: [PATCH] Some minor improvement to cuda kernel utils --- ext/cuda/cuda_utils.jl | 52 +++++++++++-------- ext/cuda/fill.jl | 14 ++--- ext/cuda/limiters.jl | 18 +++---- .../matrix_fields_multiple_field_solve.jl | 8 +-- ext/cuda/operators_finite_difference.jl | 41 ++++++++------- src/DataLayouts/DataLayouts.jl | 15 ++++++ 6 files changed, 86 insertions(+), 62 deletions(-) diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl index 1d21a53304..6e84d4ef9e 100644 --- a/ext/cuda/cuda_utils.jl +++ b/ext/cuda/cuda_utils.jl @@ -97,38 +97,44 @@ function auto_launch!( end """ - kernel_indexes(n) + thread_index() + +Return the threadindex: +``` +(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x +``` +""" +@inline thread_index() = + (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x + +""" + kernel_indexes(tidx, n) Return a tuple of indexes from the kernel, -where `n` is a tuple of max lengths along each +where `tidx` is the cuda thread index and +`n` is a tuple of max lengths along each dimension of the accessed data. """ -function kernel_indexes(n::Tuple) - tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x - inds = if 1 ≤ tidx ≤ prod(n) - CartesianIndices(map(x -> Base.OneTo(x), n))[tidx].I - else - ntuple(x -> -1, length(n)) - end - return inds -end +Base.@propagate_inbounds kernel_indexes(tidx, n::Tuple) = + CartesianIndices(map(x -> Base.OneTo(x), n))[tidx] """ - valid_range(inds, n) + valid_range(tidx, n::Int) + Returns a `Bool` indicating if the thread index -is in the valid range, based on `inds` (the result -of `kernel_indexes`) and `n`, a tuple of max lengths -along each dimension of the accessed data. +(`tidx`) is in the valid range, based on `n`, a +tuple of max lengths along each dimension of the + +accessed data. ```julia function kernel!(data, n) - inds = kernel_indexes(n) - if valid_range(inds, n) - do_work!(data[inds...]) + @inbounds begin + tidx = thread_index() + if valid_range(tidx, n) + I = kernel_indexes(tidx, n) + do_work!(data[I]) + end end end ``` """ -valid_range(inds::NTuple, n::Tuple) = all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n)) -function valid_range(n::Tuple) - inds = kernel_indexes(n) - return all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n)) -end +@inline valid_range(tidx, n) = 1 ≤ tidx ≤ n diff --git a/ext/cuda/fill.jl b/ext/cuda/fill.jl index 86e8a023e3..658d31226b 100644 --- a/ext/cuda/fill.jl +++ b/ext/cuda/fill.jl @@ -1,11 +1,11 @@ -cartesian_index(::AbstractData, inds) = CartesianIndex(inds) - function knl_fill_flat!(dest::AbstractData, val) - n = DataLayouts.universal_size(dest) - inds = kernel_indexes(n) - if valid_range(inds, n) - I = cartesian_index(dest, inds) - @inbounds dest[I] = val + @inbounds begin + tidx = thread_index() + n = DataLayouts.universal_size(dest) + if valid_range(tidx, prod(n)) + I = kernel_indexes(tidx, n) + @inbounds dest[I] = val + end end return nothing end diff --git a/ext/cuda/limiters.jl b/ext/cuda/limiters.jl index 4856f36c62..bf2c80e901 100644 --- a/ext/cuda/limiters.jl +++ b/ext/cuda/limiters.jl @@ -54,9 +54,9 @@ function compute_element_bounds_kernel!( ::Val{Nj}, ) where {Ni, Nj} n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I (; q_bounds) = limiter local q_min, q_max slab_ρq = slab(ρq, v, h) @@ -118,9 +118,9 @@ function compute_neighbor_bounds_local_kernel!( ) where {Ni, Nj} n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I (; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter slab_q_bounds = slab(q_bounds, v, h) q_min = slab_q_bounds[1] @@ -188,9 +188,9 @@ function apply_limiter_kernel!( (; q_bounds_nbr, rtol) = limiter converged = true n = (Nv, Nh) - if valid_range(n) - inds = kernel_indexes(n) - (v, h) = inds + tidx = thread_index() + @inbounds if valid_range(tidx, prod(n)) + (v, h) = kernel_indexes(tidx, n).I slab_ρ = slab(ρ_data, v, h) slab_ρq = slab(ρq_data, v, h) diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl index b22f233441..afd5c4adb8 100644 --- a/ext/cuda/matrix_fields_multiple_field_solve.jl +++ b/ext/cuda/matrix_fields_multiple_field_solve.jl @@ -85,10 +85,10 @@ function multiple_field_solve_kernel!( ) where {Nnames} @inbounds begin Ni, Nj, _, _, Nh = size(Fields.field_values(x1)) - tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x - if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames)) - (i, j, h, iname) = - CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I + tidx = thread_index() + n = (Ni, Nj, Nh, Nnames) + if valid_range(tidx, prod(n)) + (i, j, h, iname) = kernel_indexes(tidx, n).I generated_single_field_solve!( device, caches, diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl index 43ee90c86c..043a6e0608 100644 --- a/ext/cuda/operators_finite_difference.jl +++ b/ext/cuda/operators_finite_difference.jl @@ -51,26 +51,29 @@ function Base.copyto!( end function copyto_stencil_kernel!(out, bc, space, bds, Nq, Nh, Nv) - gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if gid ≤ Nv * Nq * Nq * Nh - (li, lw, rw, ri) = bds - (v, i, j, h) = Topologies._get_idx((Nv, Nq, Nq, Nh), gid) - hidx = (i, j, h) - idx = v - 1 + li - window = - idx < lw ? LeftBoundaryWindow{Spaces.left_boundary_name(space)}() : - ( - idx > rw ? - RightBoundaryWindow{Spaces.right_boundary_name(space)}() : - Interior() + @inbounds begin + gid = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + if gid ≤ Nv * Nq * Nq * Nh + (li, lw, rw, ri) = bds + (v, i, j, h) = Topologies._get_idx((Nv, Nq, Nq, Nh), gid) + hidx = (i, j, h) + idx = v - 1 + li + window = + idx < lw ? + LeftBoundaryWindow{Spaces.left_boundary_name(space)}() : + ( + idx > rw ? + RightBoundaryWindow{Spaces.right_boundary_name(space)}() : + Interior() + ) + setidx!( + space, + out, + idx, + hidx, + Operators.getidx(space, bc, window, idx, hidx), ) - setidx!( - space, - out, - idx, - hidx, - Operators.getidx(space, bc, window, idx, hidx), - ) + end end return nothing end diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl index 784715f02d..758ccd5725 100644 --- a/src/DataLayouts/DataLayouts.jl +++ b/src/DataLayouts/DataLayouts.jl @@ -46,6 +46,21 @@ abstract type AbstractData{S} end Base.size(data::AbstractData, i::Integer) = size(data)[i] +""" + (Ni, Nj, Nf, Nv, Nh) = universal_size(data::AbstractData) + +Returns dimensions in a universal +format for all data layouts: + - `Ni` number of spectral element nodal degrees of freedom in first horizontal direction + - `Nj` number of spectral element nodal degrees of freedom in second horizontal direction + - `Nf` number of field components + - `Nv` number of vertical degrees of freedom + - `Nh` number of horizontal elements + +Note: this is similar to `Base.size`, except + that `universal_size` does not return 1 + for the number of field components. +""" function universal_size(data::AbstractData) s = size(data) return (s[1], s[2], ncomponents(data), s[4], s[5])