Merge pull request #1837 from CliMA/ck/gpu_kernel_tweaks
Some minor improvements to CUDA kernel utils
charleskawczynski authored Jun 23, 2024
2 parents 2b4ecc8 + 4b48df1 commit 93194ba
Showing 6 changed files with 65 additions and 44 deletions.
52 changes: 29 additions & 23 deletions ext/cuda/cuda_utils.jl
@@ -97,38 +97,44 @@ function auto_launch!(
end

"""
kernel_indexes(n)
thread_index()
Return the thread index:
```
(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
```
"""
@inline thread_index() =
(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x

"""
kernel_indexes(tidx, n)
Return a tuple of indexes from the kernel,
where `n` is a tuple of max lengths along each
where `tidx` is the cuda thread index and
`n` is a tuple of max lengths along each
dimension of the accessed data.
"""
function kernel_indexes(n::Tuple)
tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
inds = if 1 ≤ tidx ≤ prod(n)
CartesianIndices(map(x -> Base.OneTo(x), n))[tidx].I
else
ntuple(x -> -1, length(n))
end
return inds
end
Base.@propagate_inbounds kernel_indexes(tidx, n::Tuple) =
CartesianIndices(map(x -> Base.OneTo(x), n))[tidx]

"""
valid_range(inds, n)
valid_range(tidx, n::Int)
Returns a `Bool` indicating if the thread index
is in the valid range, based on `inds` (the result
of `kernel_indexes`) and `n`, a tuple of max lengths
along each dimension of the accessed data.
(`tidx`) is in the valid range, based on `n`, a
tuple of max lengths along each dimension of the
accessed data.
```julia
function kernel!(data, n)
inds = kernel_indexes(n)
if valid_range(inds, n)
do_work!(data[inds...])
@inbounds begin
tidx = thread_index()
if valid_range(tidx, n)
I = kernel_indexes(tidx, n)
do_work!(data[I])
end
end
end
```
"""
valid_range(inds::NTuple, n::Tuple) = all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n))
function valid_range(n::Tuple)
inds = kernel_indexes(n)
return all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n))
end
@inline valid_range(tidx, n) = 1 ≤ tidx ≤ n
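
The three helpers above linearize the thread index, guard it, and map it back to a Cartesian index. A minimal CPU-side sketch of that pattern, with made-up extents (nothing below is code from this PR):

```julia
# Sketch of the thread_index -> valid_range -> kernel_indexes pattern on the CPU.
# The loop variable stands in for the CUDA thread index.
n = (4, 3, 2)                         # hypothetical per-dimension extents
hits = CartesianIndex{3}[]
for tidx in 1:(prod(n) + 5)           # a few deliberately out-of-range "threads"
    if 1 ≤ tidx ≤ prod(n)             # what valid_range(tidx, prod(n)) checks
        I = CartesianIndices(map(Base.OneTo, n))[tidx]  # what kernel_indexes(tidx, n) returns
        push!(hits, I)
    end
end
@assert length(hits) == prod(n)       # every in-range thread hits exactly one index
@assert allunique(hits)               # and no two threads share an index
```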
14 changes: 7 additions & 7 deletions ext/cuda/fill.jl
@@ -1,11 +1,11 @@
cartesian_index(::AbstractData, inds) = CartesianIndex(inds)

function knl_fill_flat!(dest::AbstractData, val)
n = DataLayouts.universal_size(dest)
inds = kernel_indexes(n)
if valid_range(inds, n)
I = cartesian_index(dest, inds)
@inbounds dest[I] = val
@inbounds begin
tidx = thread_index()
n = DataLayouts.universal_size(dest)
if valid_range(tidx, prod(n))
I = kernel_indexes(tidx, n)
@inbounds dest[I] = val
end
end
return nothing
end
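
For context, a hedged sketch of how a flat fill kernel in this style could be launched with plain CUDA.jl; the repository itself sizes and launches kernels through `auto_launch!`, and `flat_fill!`/`_knl_flat_fill!` below are hypothetical names, not ClimaCore API:

```julia
using CUDA

function flat_fill!(dest::CuArray, val)
    n = size(dest)
    kernel = @cuda launch=false _knl_flat_fill!(dest, val, n)
    nthreads = 256
    nblocks = cld(prod(n), nthreads)   # enough blocks to cover every linear index
    kernel(dest, val, n; threads = nthreads, blocks = nblocks)
    return dest
end

function _knl_flat_fill!(dest, val, n)
    tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
    if 1 ≤ tidx ≤ prod(n)              # same guard as valid_range
        I = CartesianIndices(map(Base.OneTo, n))[tidx]
        @inbounds dest[I] = val
    end
    return nothing
end
```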
18 changes: 9 additions & 9 deletions ext/cuda/limiters.jl
@@ -54,9 +54,9 @@ function compute_element_bounds_kernel!(
::Val{Nj},
) where {Ni, Nj}
n = (Nv, Nh)
if valid_range(n)
inds = kernel_indexes(n)
(v, h) = inds
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
(v, h) = kernel_indexes(tidx, n).I
(; q_bounds) = limiter
local q_min, q_max
slab_ρq = slab(ρq, v, h)
@@ -118,9 +118,9 @@ function compute_neighbor_bounds_local_kernel!(
) where {Ni, Nj}

n = (Nv, Nh)
if valid_range(n)
inds = kernel_indexes(n)
(v, h) = inds
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
(v, h) = kernel_indexes(tidx, n).I
(; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter
slab_q_bounds = slab(q_bounds, v, h)
q_min = slab_q_bounds[1]
@@ -188,9 +188,9 @@ function apply_limiter_kernel!(
(; q_bounds_nbr, rtol) = limiter
converged = true
n = (Nv, Nh)
if valid_range(n)
inds = kernel_indexes(n)
(v, h) = inds
tidx = thread_index()
@inbounds if valid_range(tidx, prod(n))
(v, h) = kernel_indexes(tidx, n).I

slab_ρ = slab(ρ_data, v, h)
slab_ρq = slab(ρq_data, v, h)
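
All three limiter kernels now share the same prologue: compute the linear thread index once, guard it with `valid_range`, and destructure the Cartesian index via `.I`. A small CPU illustration of that destructuring, with made-up sizes:

```julia
n = (10, 6)                                      # hypothetical (Nv, Nh)
tidx = 23                                        # hypothetical thread index
I = CartesianIndices(map(Base.OneTo, n))[tidx]   # what kernel_indexes(tidx, n) returns
(v, h) = I.I                                     # .I is the underlying tuple of indices
@assert (v, h) == (3, 3)                         # column-major: 23 == 3 + (3 - 1) * 10
@assert LinearIndices(map(Base.OneTo, n))[v, h] == tidx
```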
8 changes: 4 additions & 4 deletions ext/cuda/matrix_fields_multiple_field_solve.jl
@@ -85,10 +85,10 @@ function multiple_field_solve_kernel!(
) where {Nnames}
@inbounds begin
Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames))
(i, j, h, iname) =
CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
tidx = thread_index()
n = (Ni, Nj, Nh, Nnames)
if valid_range(tidx, prod(n))
(i, j, h, iname) = kernel_indexes(tidx, n).I
generated_single_field_solve!(
device,
caches,
15 changes: 15 additions & 0 deletions src/DataLayouts/DataLayouts.jl
@@ -46,6 +46,21 @@ abstract type AbstractData{S} end

Base.size(data::AbstractData, i::Integer) = size(data)[i]

"""
(Ni, Nj, Nf, Nv, Nh) = universal_size(data::AbstractData)
Returns dimensions in a universal
format for all data layouts:
- `Ni` number of spectral element nodal degrees of freedom in first horizontal direction
- `Nj` number of spectral element nodal degrees of freedom in second horizontal direction
- `Nf` number of field components
- `Nv` number of vertical degrees of freedom
- `Nh` number of horizontal elements
Note: this is similar to `Base.size`, except
that `universal_size` does not return 1
for the number of field components.
"""
function universal_size(data::AbstractData)
s = size(data)
return (s[1], s[2], ncomponents(data), s[4], s[5])
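
A small numeric illustration of the `universal_size` contract documented above; the sizes and component count are made up:

```julia
s  = (4, 4, 1, 10, 30)   # hypothetical Base.size(data): (Ni, Nj, 1, Nv, Nh)
nf = 3                   # hypothetical ncomponents(data)
universal = (s[1], s[2], nf, s[4], s[5])   # the body of universal_size above
@assert universal == (4, 4, 3, 10, 30)     # (Ni, Nj, Nf, Nv, Nh)
```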
2 changes: 1 addition & 1 deletion test/Spaces/distributed_cuda/ddss3.jl
@@ -147,5 +147,5 @@ partition numbers
end
#! format: on
p = @allocated Spaces.weighted_dss!(y0, dss_buffer)
iamroot && @test p ≤ 10816
iamroot && @test p ≤ 11072
end
