Skip to content

Commit

Permalink
Merge #1481
Browse files (browse the repository at this point in the history)
1481: Inlining and threading of regular broadcasting r=simonbyrne a=simonbyrne

Last fix to #1455.

Gets us to over 4 SYPD (simulated years per day) on a single GPU (https://buildkite.com/clima/climaatmos-target-gpu-simulations/builds/85#018ae2cc-8cef-4c77-a9d9-dc444f05645d)

- [ ] Code follows the [style guidelines](https://clima.github.io/ClimateMachine.jl/latest/DevDocs/CodeStyle/) OR N/A.
- [ ] Unit tests are included OR N/A.
- [ ] Code is exercised in an integration test OR N/A.
- [ ] Documentation has been added/updated OR N/A.


Co-authored-by: Simon Byrne <[email protected]>
  • Loading branch information
bors[bot] and simonbyrne authored Oct 1, 2023
2 parents 28cc4a7 + e965335 commit 6b370c5
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 33 deletions.
45 changes: 18 additions & 27 deletions src/DataLayouts/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,32 +42,16 @@ Base.similar(

"""
    knl_copyto!(dest, src)

CUDA kernel: each thread copies at most one entry, `dest[I] = src[I]`, where
`I = CartesianIndex((i, j, 1, v, h))`.

Index mapping (one thread per `(i, j, v, h)` tuple):
- `i`, `j`: thread index along x and y.
- `h`: block index along x.
- `v`: global index along the fourth dimension, tiled over the block index
  along y and the thread index along z.

The number of `v` slots launched (`blockDim().z * gridDim().y`) can exceed
`size(dest, 4)` when the fourth dimension is not a multiple of `blockDim().z`,
so threads past the end do nothing.

Always returns `nothing`, as required of a CUDA kernel.
"""
function knl_copyto!(dest, src)
    i = CUDA.threadIdx().x
    j = CUDA.threadIdx().y

    h = CUDA.blockIdx().x
    # Several `v` values share one block (thread z); blocks along y tile the rest.
    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z

    # Guard BEFORE touching memory: the write is `@inbounds`, so an
    # out-of-range `v` in the last block would otherwise be an invalid access.
    if v <= size(dest, 4)
        I = CartesianIndex((i, j, 1, v, h))
        @inbounds dest[I] = src[I]
    end
    return nothing
end

Expand All @@ -76,11 +60,12 @@ function knl_fill!(dest, val)
j = CUDA.threadIdx().y

h = CUDA.blockIdx().x
v = CUDA.blockIdx().y
v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z

I = CartesianIndex((i, j, 1, v, h))

@inbounds dest[I] = val
if v <= size(dest, 4)
I = CartesianIndex((i, j, 1, v, h))
@inbounds dest[I] = val
end
return nothing
end

Expand Down Expand Up @@ -117,7 +102,10 @@ function Base.copyto!(
) where {S, Nij, A <: CUDA.CuArray}
_, _, _, Nv, Nh = size(bc)
if Nv > 0 && Nh > 0
CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, Nv) knl_copyto!(dest, bc)
Nv_per_block = fld(256, Nij * Nij)
Nv_blocks = cld(Nv, Nv_per_block)
CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks =
(Nh, Nv_blocks) knl_copyto!(dest, bc)
end
return dest
end
Expand All @@ -127,7 +115,10 @@ function Base.fill!(
) where {S, Nij, A <: CUDA.CuArray}
_, _, _, Nv, Nh = size(dest)
if Nv > 0 && Nh > 0
CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, Nv) knl_fill!(dest, val)
Nv_per_block = fld(256, Nij * Nij)
Nv_blocks = cld(Nv, Nv_per_block)
CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks =
(Nh, Nv_blocks) knl_fill!(dest, val)
end
return dest
end
Expand Down
31 changes: 25 additions & 6 deletions test/DataLayouts/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,25 +33,44 @@ end
@testset "broadcasting" begin
    FT = Float64
    S1 = NamedTuple{(:a, :b), Tuple{Complex{Float64}, Float64}}
    S2 = Float64

    # Every input entry is one, so f evaluates to 1 * 1 + 1 == 2 at each point.
    # Defined once and shared by both layout cases below.
    f(a1, a2) = a1.a.re * a2 + a1.b

    # IJFH layout: parent arrays are (2, 2, F, 2) with F = 3 fields for S1
    # and F = 1 for S2.
    data_arr1 = CuArray(ones(FT, 2, 2, 3, 2))
    data_arr2 = CuArray(ones(FT, 2, 2, 1, 2))
    data1 = IJFH{S1, 2}(data_arr1)
    data2 = IJFH{S2, 2}(data_arr2)

    res = f.(data1, data2)
    @test res isa IJFH{Float64}
    @test Array(parent(res)) == fill(FT(2), 2, 2, 1, 2)

    # VIJFH layout with Nv = 33 levels and 4x4 nodes. 33 is deliberately not a
    # multiple of fld(256, 4 * 4) == 16, so (presumably, via the CUDA copyto!
    # launch) the last block is only partially filled and the kernel's
    # bounds guard on `v` is exercised.
    Nv = 33
    data_arr1 = CuArray(ones(FT, Nv, 4, 4, 3, 2))
    data_arr2 = CuArray(ones(FT, Nv, 4, 4, 1, 2))
    data1 = VIJFH{S1, 4}(data_arr1)
    data2 = VIJFH{S2, 4}(data_arr2)

    res = f.(data1, data2)
    @test res isa VIJFH{Float64}
    @test Array(parent(res)) == fill(FT(2), Nv, 4, 4, 1, 2)
end

# Scalar broadcast assignment (`data .= value`). A Complex{FT} element
# occupies two fields in the parent array: field 1 holds the real part (1)
# and field 2 the imaginary part (2).
@testset "broadcasting assignment from scalar" begin
    FT = Float64
    S = Complex{FT}

    # IJFH layout: 2x2 nodes, 3 elements.
    data = IJFH{S, 2}(CuArray{FT}, 3)
    data .= Complex(1.0, 2.0)
    @test Array(parent(data)) ==
          FT[f == 1 ? 1 : 2 for i in 1:2, j in 1:2, f in 1:2, h in 1:3]

    # VIJFH layout: Nv = 33 is not a multiple of fld(256, 4 * 4) == 16, so the
    # fill launch has a partially filled last block along the level dimension.
    # The parent is allocated `undef`, so any entry the kernel misses would
    # (almost surely) fail the comparison.
    Nv = 33
    data = VIJFH{S, 4}(CuArray{FT}(undef, Nv, 4, 4, 2, 3))
    data .= Complex(1.0, 2.0)
    @test Array(parent(data)) == FT[
        f == 1 ? 1 : 2 for v in 1:Nv, i in 1:4, j in 1:4, f in 1:2, h in 1:3
    ]
end

0 comments on commit 6b370c5

Please sign in to comment.