diff --git a/src/DataLayouts/cuda.jl b/src/DataLayouts/cuda.jl index 57e3dd4dd5..9f0c8e656b 100644 --- a/src/DataLayouts/cuda.jl +++ b/src/DataLayouts/cuda.jl @@ -42,32 +42,16 @@ Base.similar( function knl_copyto!(dest, src) - #= - nij, nh = size(dest) - - thread_idx = CUDA.threadIdx().x - block_idx = CUDA.blockIdx().x - block_dim = CUDA.blockDim().x - - # mapping to global idx to make insensitive - # to number of blocks / threads per device - global_idx = thread_idx + (block_idx - 1) * block_dim - - nx, ny = nij, nij - i = global_idx % nx == 0 ? nx : global_idx % nx - j = cld(global_idx, nx) - h = ((global_idx-1) % (nx*nx)) + 1 - =# - i = CUDA.threadIdx().x j = CUDA.threadIdx().y h = CUDA.blockIdx().x - v = CUDA.blockIdx().y + v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z - I = CartesianIndex((i, j, 1, v, h)) - - @inbounds dest[I] = src[I] + if v <= size(dest, 4) + I = CartesianIndex((i, j, 1, v, h)) + @inbounds dest[I] = src[I] + end return nothing end @@ -76,11 +60,12 @@ function knl_fill!(dest, val) j = CUDA.threadIdx().y h = CUDA.blockIdx().x - v = CUDA.blockIdx().y + v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z - I = CartesianIndex((i, j, 1, v, h)) - - @inbounds dest[I] = val + if v <= size(dest, 4) + I = CartesianIndex((i, j, 1, v, h)) + @inbounds dest[I] = val + end return nothing end @@ -117,7 +102,10 @@ function Base.copyto!( ) where {S, Nij, A <: CUDA.CuArray} _, _, _, Nv, Nh = size(bc) if Nv > 0 && Nh > 0 - CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, Nv) knl_copyto!(dest, bc) + Nv_per_block = fld(256, Nij * Nij) + Nv_blocks = cld(Nv, Nv_per_block) + CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks = + (Nh, Nv_blocks) knl_copyto!(dest, bc) end return dest end @@ -127,7 +115,10 @@ function Base.fill!( ) where {S, Nij, A <: CUDA.CuArray} _, _, _, Nv, Nh = size(dest) if Nv > 0 && Nh > 0 - CUDA.@cuda threads = (Nij, Nij) blocks = (Nh, Nv) knl_fill!(dest, val) + Nv_per_block = fld(256, Nij * Nij) + Nv_blocks = cld(Nv, Nv_per_block) + CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks = + (Nh, Nv_blocks) knl_fill!(dest, val) end return dest end diff --git a/test/DataLayouts/cuda.jl b/test/DataLayouts/cuda.jl index 8b733723a9..2d18577465 100644 --- a/test/DataLayouts/cuda.jl +++ b/test/DataLayouts/cuda.jl @@ -33,19 +33,31 @@ end @testset "broadcasting" begin FT = Float64 S1 = NamedTuple{(:a, :b), Tuple{Complex{Float64}, Float64}} - data1 = CuArray(ones(FT, 2, 2, 3, 2)) S2 = Float64 - data2 = CuArray(ones(FT, 2, 2, 1, 2)) - data1 = IJFH{S1, 2}(data1) - data2 = IJFH{S2, 2}(data2) + data_arr1 = CuArray(ones(FT, 2, 2, 3, 2)) + data_arr2 = CuArray(ones(FT, 2, 2, 1, 2)) + data1 = IJFH{S1, 2}(data_arr1) + data2 = IJFH{S2, 2}(data_arr2) f(a1, a2) = a1.a.re * a2 + a1.b res = f.(data1, data2) @test res isa IJFH{Float64} @test Array(parent(res)) == FT[2 for i in 1:2, j in 1:2, f in 1:1, h in 1:2] + + Nv = 33 + data_arr1 = CuArray(ones(FT, Nv, 4, 4, 3, 2)) + data_arr2 = CuArray(ones(FT, Nv, 4, 4, 1, 2)) + data1 = VIJFH{S1, 4}(data_arr1) + data2 = VIJFH{S2, 4}(data_arr2) + + f(a1, a2) = a1.a.re * a2 + a1.b + res = f.(data1, data2) + @test res isa VIJFH{Float64} + @test Array(parent(res)) == + FT[2 for v in 1:Nv, i in 1:4, j in 1:4, f in 1:1, h in 1:2] end -#= + @testset "broadcasting assignment from scalar" begin FT = Float64 S = Complex{FT} @@ -53,5 +65,11 @@ end data .= Complex(1.0, 2.0) @test Array(parent(data)) == FT[f == 1 ? 1 : 2 for i in 1:2, j in 1:2, f in 1:2, h in 1:3] + + Nv = 33 + data = VIJFH{S, 4}(CuArray{FT}(undef, Nv, 4, 4, 2, 3)) + data .= Complex(1.0, 2.0) + @test Array(parent(data)) == FT[ + f == 1 ? 1 : 2 for v in 1:Nv, i in 1:4, j in 1:4, f in 1:2, h in 1:3 + ] end -=#