Skip to content

Commit

Permalink
Merge pull request #49 from maxfreu/deflate-upsampling
Browse files Browse the repository at this point in the history
Get rid of duplicate code in the upsampling routines by using dispatch
  • Loading branch information
CarloLucibello authored Jun 15, 2022
2 parents a203439 + 2c19fbf commit bc07266
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 107 deletions.
2 changes: 1 addition & 1 deletion ext/CUDAExt/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CUDA = "3.3.1"
NNlib = "0.8.3"
NNlib = "0.8.6"
julia = "1.6"

[extras]
Expand Down
148 changes: 42 additions & 106 deletions ext/CUDAExt/src/upsample.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@

# Forward and backward pass have been tested to produce the same output
# as pytorch with align_corners=True - it works modulo bit noise.
# PyTorch's default is align_corners=False, because with align_corners=True the
# gradients depend on the image size, which should be avoided -> supporting that mode should be considered here as well

@inline function compute_source_index(ratio::T, dst_index, align_corners) where T
if align_corners
Expand All @@ -52,11 +54,45 @@
end
end

# Generic forward launcher for linear/bilinear/trilinear upsampling on the GPU.
# Writes the interpolated result into the preallocated output `y` and returns it.
# The leading N-2 dimensions of `x`/`y` are spatial (WHCN-style layout — spatial
# dims first); dispatch on N selects the matching device kernel.
function NNlib.upsample_linear_kernel!(y::CuArray{T,N}, x::CuArray{T,N}; align_corners=true) where {T,N}
    # Total number of spatial output elements; one GPU thread per element.
    n_out = prod(size(y)[1:N-2])

    # Per-dimension input/output scale factors. align_corners changes the
    # ratio definition (corner pixels map exactly onto each other).
    ratios = align_corners ?
        ntuple(d -> T((size(x, d) - 1) / (size(y, d) - 1)), N - 2) :
        ntuple(d -> T(size(x, d) / size(y, d)), N - 2)

    # Compile first to query a good launch configuration, then launch.
    kernel = @cuda launch=false upsample_linear_cuda_kernel!(n_out, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_out, config.threads)
    nblocks = cld(n_out, nthreads)
    kernel(n_out, ratios..., x, y, align_corners; threads=nthreads, blocks=nblocks)
    return y
end

# Generic backward launcher for linear/bilinear/trilinear upsampling.
# Accumulates the upstream gradient `Δ` into the preallocated input-gradient
# buffer `dx` and returns it. Dispatch on N selects the matching device kernel.
function NNlib.∇upsample_linear_kernel!(dx::CuArray{T,N}, Δ::CuArray{T,N}; align_corners=true) where {T,N}
    # Total number of spatial elements of the incoming gradient; one thread each.
    n_in = prod(size(Δ)[1:N-2])

    # Scale factors run dx/Δ here, i.e. reversed compared to the forward pass.
    ratios = align_corners ?
        ntuple(d -> T((size(dx, d) - 1) / (size(Δ, d) - 1)), N - 2) :
        ntuple(d -> T(size(dx, d) / size(Δ, d)), N - 2)

    # Compile first to query a good launch configuration, then launch.
    kernel = @cuda launch=false ∇upsample_linear_cuda_kernel!(n_in, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_in, config.threads)
    nblocks = cld(n_in, nthreads)
    kernel(n_in, ratios..., Δ, dx, align_corners; threads=nthreads, blocks=nblocks)
    return dx
end


###########
# linear
###########
function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, x::CuDeviceArray{<:Any, 3}, y::CuDeviceArray{<:Any, 3}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -86,7 +122,7 @@ function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners)
end

# Δ is the gradient backpropagated from downstream layers
function upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, Δ::CuDeviceArray{<:Any, 3}, dx::CuDeviceArray{<:Any, 3}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -115,44 +151,11 @@ function ∇upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners)
return nothing
end

# 1D (width × channel × batch) linear upsampling: fill preallocated `y` from `x`.
function NNlib.upsample_linear_wcn!(y::CuArray{T,3}, x::CuArray{T,3}; align_corners=true) where T
    # One thread per output element along the single spatial (width) dimension.
    n_out = size(y, 1)

    # Input/output scale factor for the width dimension; align_corners changes
    # the ratio definition.
    ratios = align_corners ?
        ntuple(d -> T((size(x, d) - 1) / (size(y, d) - 1)), 1) :
        ntuple(d -> T(size(x, d) / size(y, d)), 1)

    kernel = @cuda launch=false upsample_linear_wcn_kernel!(n_out, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_out, config.threads)
    nblocks = cld(n_out, nthreads)
    kernel(n_out, ratios..., x, y, align_corners; threads=nthreads, blocks=nblocks)
    return y
end

# Backward pass of 1D linear upsampling: accumulate upstream gradient `Δ`
# into the preallocated input-gradient buffer `dx`.
function NNlib.∇upsample_linear_wcn!(dx::CuArray{T,3}, Δ::CuArray{T,3}; align_corners=true) where T
    # One thread per element of the incoming gradient along the width dimension.
    n_in = size(Δ, 1)

    # Scale factor runs dx/Δ, i.e. reversed compared to the forward pass.
    ratios = align_corners ?
        ntuple(d -> T((size(dx, d) - 1) / (size(Δ, d) - 1)), 1) :
        ntuple(d -> T(size(dx, d) / size(Δ, d)), 1)

    kernel = @cuda launch=false ∇upsample_linear_wcn_kernel!(n_in, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_in, config.threads)
    nblocks = cld(n_in, nthreads)
    kernel(n_in, ratios..., Δ, dx, align_corners; threads=nthreads, blocks=nblocks)
    return dx
end


###########
# bilinear
###########
function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, x::CuDeviceArray{<:Any, 4}, y::CuDeviceArray{<:Any, 4}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -194,7 +197,7 @@ function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_cor
end

# Δ is the gradient backpropagated from downstream layers
function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, Δ::CuDeviceArray{<:Any, 4}, dx::CuDeviceArray{<:Any, 4}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -237,44 +240,11 @@ function ∇upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, alig
return nothing
end

# 2D (width × height × channel × batch) bilinear upsampling: fill preallocated
# `y` from `x`.
function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}; align_corners=true) where T
    # One thread per spatial output element (w * h in total).
    n_out = prod(size(y)[1:2])

    # Per-dimension input/output scale factors for width and height;
    # align_corners changes the ratio definition.
    ratios = align_corners ?
        ntuple(d -> T((size(x, d) - 1) / (size(y, d) - 1)), 2) :
        ntuple(d -> T(size(x, d) / size(y, d)), 2)

    kernel = @cuda launch=false upsample_bilinear_whcn_kernel!(n_out, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_out, config.threads)
    nblocks = cld(n_out, nthreads)
    kernel(n_out, ratios..., x, y, align_corners; threads=nthreads, blocks=nblocks)
    return y
end

# Backward pass of 2D bilinear upsampling: accumulate upstream gradient `Δ`
# into the preallocated input-gradient buffer `dx`.
function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}; align_corners=true) where T
    # One thread per spatial element of the incoming gradient (w * h).
    n_in = prod(size(Δ)[1:2])

    # Scale factors run dx/Δ, i.e. reversed compared to the forward pass.
    ratios = align_corners ?
        ntuple(d -> T((size(dx, d) - 1) / (size(Δ, d) - 1)), 2) :
        ntuple(d -> T(size(dx, d) / size(Δ, d)), 2)

    kernel = @cuda launch=false ∇upsample_bilinear_whcn_kernel!(n_in, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_in, config.threads)
    nblocks = cld(n_in, nthreads)
    kernel(n_in, ratios..., Δ, dx, align_corners; threads=nthreads, blocks=nblocks)
    return dx
end


###########
# trilinear
###########
function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, x::CuDeviceArray{<:Any, 5}, y::CuDeviceArray{<:Any, 5}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -337,7 +307,7 @@ function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y,
end

# Δ is the gradient backpropagated from downstream layers
function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ, dx, align_corners)
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, Δ::CuDeviceArray{<:Any, 5}, dx::CuDeviceArray{<:Any, 5}, align_corners)
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x

if index < n_elem
Expand Down Expand Up @@ -389,37 +359,3 @@ function ∇upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ
end # if
return nothing
end

# 3D (width × height × depth × channel × batch) trilinear upsampling: fill
# preallocated `y` from `x`.
function NNlib.upsample_trilinear_whdcn!(y::CuArray{T,5}, x::CuArray{T,5}; align_corners=true) where T
    # One thread per spatial output element (w * h * d in total).
    n_out = prod(size(y)[1:3])

    # Per-dimension input/output scale factors for width, height and depth;
    # align_corners changes the ratio definition.
    ratios = align_corners ?
        ntuple(d -> T((size(x, d) - 1) / (size(y, d) - 1)), 3) :
        ntuple(d -> T(size(x, d) / size(y, d)), 3)

    kernel = @cuda launch=false upsample_trilinear_whdcn_kernel!(n_out, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_out, config.threads)
    nblocks = cld(n_out, nthreads)
    kernel(n_out, ratios..., x, y, align_corners; threads=nthreads, blocks=nblocks)
    return y
end

# Backward pass of 3D trilinear upsampling: accumulate upstream gradient `Δ`
# into the preallocated input-gradient buffer `dx`.
function NNlib.∇upsample_trilinear_whdcn!(dx::CuArray{T,5}, Δ::CuArray{T,5}; align_corners=true) where T
    # One thread per spatial element of the incoming gradient (w * h * d).
    n_in = prod(size(Δ)[1:3])

    # Scale factors run dx/Δ, i.e. reversed compared to the forward pass.
    ratios = align_corners ?
        ntuple(d -> T((size(dx, d) - 1) / (size(Δ, d) - 1)), 3) :
        ntuple(d -> T(size(dx, d) / size(Δ, d)), 3)

    kernel = @cuda launch=false ∇upsample_trilinear_whdcn_kernel!(n_in, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    nthreads = Base.min(n_in, config.threads)
    nblocks = cld(n_in, nthreads)
    kernel(n_in, ratios..., Δ, dx, align_corners; threads=nthreads, blocks=nblocks)
    return dx
end

0 comments on commit bc07266

Please sign in to comment.